requests —— 高级用法

文件上传

假设网站需要上传文件

1
2
3
4
5
6
7
8
9
10
11
import requests

# Upload a local file as a multipart form field named "file".
url = 'http://httpbin.org/post'

# Open the file in binary mode inside a context manager so the handle is
# closed as soon as the upload finishes, even if the request raises.
with open('zhihu.ico', 'rb') as icon:
    files = {'file': icon}
    response = requests.post(url, files=files)

print(response.content.decode('utf-8'))

# The returned body contains a "files" field: the uploaded file part is
# reported under its own separate "files" entry.

Cookies

获取cookies

1
2
3
4
5
6
7
8
9
10
11
12
13
import requests

# Fetch a page and inspect the cookies attached to the response.
url = 'https://www.baidu.com'

response = requests.get(url)
cookies = response.cookies  # the `cookies` attribute exposes the response cookies
print(cookies)
print(type(cookies))  # <class 'requests.cookies.RequestsCookieJar'>
print(cookies.items())

# items() returns the cookies as a list of (name, value) tuples.
for key, value in cookies.items():
    print(key + '=' + value)

设置cookies

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import requests

# Approach 1: send the cookies as a raw "Cookie" request header.
# NOTE(review): the Cookie value below looks like a captured login session;
# treat it as sample data and never commit real credentials.
url = 'https://www.zhihu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Cookie': '_zap=0df13f80-d87a-49b2-96cc-9d6d273f2885; _xsrf=pb7rDOJOaTLuBJHwyXnmXgN5xymEnBRq; d_c0="ACCnyLQ1Mw-PTt2OEfvgHr-AOdrHqMaxN48=|1553914668"; capsion_ticket="2|1:0|10:1553914672|14:capsion_ticket|44:NzFjMTUxOTIxNGI5NGQwM2E3MGY3YTdhZDQ2MmRjYjI=|660aaad910034e33b82ace8e93e26c42a8c1a1a516c69a7dfddf46aa0437d5fd"; z_c0="2|1:0|10:1553914680|4:z_c0|92:Mi4xVEFGbUFnQUFBQUFBSUtmSXREVXpEeVlBQUFCZ0FsVk5PQ21NWFFDcGFpT1VNbDFkcGJGN3JuUE1NVEUzdXh0Ql9n|d5e16e9f0689f70cde64fbaa325cf91a8c45560e39baa148de87afe1dcac42bc"; q_c1=ead1b63c011541af9c216e9547d4f107|1553914681000|1553914681000; __utma=51854390.919096901.1554105033.1554105033.1554105033.1; __utmb=51854390.0.10.1554105033; __utmc=51854390; __utmz=51854390.1554105033.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmv=51854390.100--|2=registration_date=20151221=1^3=entry_date=20151221=1; tgw_l7_route=116a747939468d99065d12a386ab1c5f; tst=r',
    'referer': 'https://www.zhihu.com/',
}

response = requests.get(url, headers=headers)
print(response.status_code)  # the original author noted 200 here

# Approach 2: pass the cookies through the `cookies` parameter.
# `url` is reused from the approach-1 snippet earlier in this example.
cookies = '_zap=0df13f80-d87a-49b2-96cc-9d6d273f2885; _xsrf=pb7rDOJOaTLuBJHwyXnmXgN5xymEnBRq; d_c0="ACCnyLQ1Mw-PTt2OEfvgHr-AOdrHqMaxN48=|1553914668"; capsion_ticket="2|1:0|10:1553914672|14:capsion_ticket|44:NzFjMTUxOTIxNGI5NGQwM2E3MGY3YTdhZDQ2MmRjYjI=|660aaad910034e33b82ace8e93e26c42a8c1a1a516c69a7dfddf46aa0437d5fd"; z_c0="2|1:0|10:1553914680|4:z_c0|92:Mi4xVEFGbUFnQUFBQUFBSUtmSXREVXpEeVlBQUFCZ0FsVk5PQ21NWFFDcGFpT1VNbDFkcGJGN3JuUE1NVEUzdXh0Ql9n|d5e16e9f0689f70cde64fbaa325cf91a8c45560e39baa148de87afe1dcac42bc"; q_c1=ead1b63c011541af9c216e9547d4f107|1553914681000|1553914681000; __utma=51854390.919096901.1554105033.1554105033.1554105033.1; __utmb=51854390.0.10.1554105033; __utmc=51854390; __utmz=51854390.1554105033.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmv=51854390.100--|2=registration_date=20151221=1^3=entry_date=20151221=1; tgw_l7_route=116a747939468d99065d12a386ab1c5f; tst=r'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'referer': 'https://www.zhihu.com/',
}

# Build a RequestsCookieJar to hold the parsed cookies.
jar = requests.cookies.RequestsCookieJar()

# Split the raw cookie string into individual "name=value" fragments.
for cookie in cookies.split(';'):
    # Fragments are separated by "; ", so every fragment after the first
    # starts with a space; strip it so names are stored as "_xsrf", not " _xsrf".
    cookie = cookie.strip()
    if not cookie:
        continue  # tolerate a trailing ';' or doubled separators
    # Split on the first '=' only: values may themselves contain '='.
    key, value = cookie.split('=', 1)
    jar.set(key, value)  # register each cookie's name and value in the jar

response = requests.get(url, cookies=jar, headers=headers)
print(response.status_code)

Session 会话维持

直接利用get或者post方法的确可以模拟登录,但是实际上,每一次get或者post都相当于打开了一个不同的会话。例如,当你第一次使用post()登录了知乎后,第二次想获取登录成功后的个人信息页面时,实际上打开的是一个新的会话,与第一次会话无关,所以无法获取个人信息页面。当然,你可以在第二次请求中携带cookies,但是每次请求都手动携带十分繁琐,这时候我们就可以利用Session来维持会话。Session对象会帮你维护一个会话,自动处理cookies。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import requests

# Request 1: a stand-alone call that carries a Cookie header.
url = 'https://www.zhihu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3683.86 Safari/537.36',
    'Cookie': '_zap=0df13f80-d87a-49b2-96cc-9d6d273f2885;',
    'referer': 'https://www.zhihu.com/',
}
response = requests.get(url, headers=headers)
print(response.status_code)

# Request 2: another stand-alone call; these headers carry no Cookie entry.
url_userinfo = 'https://www.zhihu.com/settings/account'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'referer': 'https://www.zhihu.com/',
}
response = requests.get(url_userinfo, headers=headers)
print(response.status_code)

# Request 3: the same page fetched through a Session. The context manager
# closes the session (and its pooled connections) when the block exits.
# NOTE(review): this session is brand new and has made no earlier request,
# so it has no cookies to carry over yet - confirm whether the intent was to
# issue the first request through `session` as well.
with requests.Session() as session:
    response = session.get(url_userinfo, headers=headers)
    print(response.status_code)

SSL 证书

当发送HTTPS请求的时候,requests会检查SSL证书。可以使用verify参数来控制是否检查证书,默认值是True,即自动校验。

1
2
3
4
5
6
7
8
9
10
11
import requests

# Disable TLS certificate verification for a single request.
# NOTE: verify=False removes protection against man-in-the-middle attacks;
# it is shown here for demonstration only.
url = 'https://www.12306.cn'
response = requests.get(url, verify=False)  # keyword is `verify` (was misspelled `verfiy`)
print(response.status_code)  # 200

# The unverified request triggers a warning; it can be silenced by
# disabling urllib3's warnings before sending the request.
from requests.packages import urllib3
urllib3.disable_warnings()
response = requests.get('https://www.12306.cn', verify=False)
print(response.status_code)

代理

大规模爬取数据的时候,需要设置代理,以免服务端直接封禁我们的IP。

1
2
3
4
5
6
7
8
9
10
import requests

# Route the request through a proxy, keyed by the target URL's scheme.
url = 'https://www.taobao.com'

# Both schemes share the same proxy address.
# NOTE(review): '0.0.0.1:0000' looks like a placeholder - substitute a real
# proxy (the requests docs recommend a full URL including the scheme).
proxies = dict.fromkeys(('http', 'https'), '0.0.0.1:0000')

response = requests.get(url, proxies=proxies)
print(response.status_code)

超时设置

1
2
3
4
5
6
7
8
import requests

# Three ways of passing the `timeout` argument.
url = 'https://www.taobao.com'

# A single number.
response = requests.get(url, timeout=0.1)
print(response.status_code)

# A tuple: timeout=(connect, read).
response = requests.get(url, timeout=(5, 30))

# None: wait indefinitely; None is the default.
response = requests.get(url, timeout=None)

身份认证

1
2
3
4
5
6
7
8
9
10
11
import requests

# Authenticate against a server with a username and password.
url = 'https://localhost:5000'

# Passing a (username, password) tuple is the shorthand form.
# Under the hood it is equivalent to:
#     from requests.auth import HTTPBasicAuth
#     response = requests.get(url, auth=HTTPBasicAuth('username', 'password'))
response = requests.get(url, auth=('username', 'password'))
print(response.status_code)

Prepared Request

将请求表示为数据结构,其中各个参数都可以通过一个Request对象来表示。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
from requests import Request,Session

# Build a request as a data structure, prepare it, then send it explicitly.
url = 'http://httpbin.org/post'
data = {
    'name': 'David',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
}

# Use the session as a context manager so it is closed once the request
# has been sent, instead of being left open for the life of the script.
with Session() as s:
    # Describe the request with a Request object ...
    request = Request('POST', url, data=data, headers=headers)
    # ... let the session turn it into a PreparedRequest ...
    prepped = s.prepare_request(request)
    # ... and send the prepared request.
    r = s.send(prepped)

print(r.status_code)