"""
@author:百草
@file:getEmail_note.py
@time:2021/09/06
"""
import requests
"""
A Requests session.
Provides cookie persistence, connection-pooling, and configuration.
Basic Usage::
>>> import requests
>>> s = requests.Session()
>>> s.get('https://httpbin.org/get')
<Response [200]>
Or as a context manager::
>>> with requests.Session() as s:
... s.get('https://httpbin.org/get')
<Response [200]>
"""
# Sample targets for the cookie experiment: the notes further down report an
# empty cookie jar for Jianshu and a BDORZ cookie for Baidu.
JIANSHU_URL = "https://www.jianshu.com/"
BAIDU_URL = "https://www.baidu.com/"
# Site requested by this run; switch to JIANSHU_URL to compare the two.
url = BAIDU_URL
# requests.Session() is the documented constructor; the lowercase
# requests.session() is a legacy alias kept only for backwards compatibility.
s = requests.Session()
# A timeout keeps the script from hanging forever on an unresponsive server.
s.get(url, timeout=10)
# print(s)  # <requests.sessions.Session object at 0x00000182996ED1C0>
# print(s.__dict__)  # instance attributes of s, as dumped below
# {
# 'headers': {'User-Agent': 'python-requests/2.24.0',
# 'Accept-Encoding': 'gzip, deflate',
# 'Accept': '*/*',
# 'Connection': 'keep-alive'},
# 'auth': None,
# 'proxies': {},
# 'hooks': {'response': []},
# 'params': {},
# 'stream': False,
# 'verify': True,
# 'cert': None,
# 'max_redirects': 30,
# 'trust_env': True,
# 'cookies': <RequestsCookieJar[] >,
# 'adapters': OrderedDict([('https://', < requests.adapters.HTTPAdapter object at 0x000001D13476DD30 >),
# ('http://', < requests.adapters.HTTPAdapter object at 0x000001D13476DD60 >)])
# }
"""
有时候我们需要爬取登录后才能访问的页面,这时我们就需要借助cookie
来实现模拟登陆和会话维持了。那么服务器是如何知道我们已经登录了呢? 当用户首次发送请求时,服务器端一般会生成并存储一小段信息,包含在response
数据里。
如果这一小段信息存储在客户端(浏览器或磁盘), 我们称之为cookie
。
如果这一小段信息存储在服务器端,我们称之为session(会话)
。
这样当用户下次发送请求到不同页面时,请求自动会带上cookie
,这样服务器就知道用户之前已经登录访问过了。
然而并不是访问所有的页面时服务器都会生成自动cookie
或session
。那么问题来了? 我们如何知道发送首次请求后服务器是否生成了cookie
呢? 这时我们可以直接通过打印response.cookies
来获取查看cookie
内容。
下例中当我们发送请求到简书时,我们可以看到返回的reponse里的cookies
是个空的RequestsCookieJar[]
,里面没有任何cookie
。
然而当我们发送请求到百度时,你可以看到百度已经生成了一个名为BDORZ
的cookie
,放在RequestsCookieJar[]
里了。你还可以通过打印
response.cookies['BAIDUID']
来打印BAIDUID
的内容。
'cookies': <RequestsCookieJar[Cookie(version=0, name='BDORZ', value='27315', port=None,
port_specified=False, domain='.baidu.com', domain_specified=True, domain_initial_dot=True,
path='/', path_specified=True, secure=False, expires=1631026778, discard=False, comment=None,
comment_url=None, rest={}, rfc2109=False)]>
参考:https://blog.csdn.net/weixin_42134789/article/details/82904741
# Method 1: hand the cookies to the request through the ``cookies`` argument.
# Source: https://blog.csdn.net/weixin_42134789/article/details/82904741
# requests reads such a dict as {cookie name: cookie value}.  The previous
# dict(version=0, name='BDORZ', value='27315') therefore described three
# cookies named "version", "name" and "value" instead of the BDORZ cookie.
cookies = {'BDORZ': '27315'}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
# Usage: s.get(url, headers=headers, cookies=cookies)

# Method 2: create a RequestsCookieJar, set the values on it, then pass the
# jar as the ``cookies`` argument of get()/post().
from requests.cookies import RequestsCookieJar

# Rebinds ``cookies``: from here on the name refers to the (still empty) jar,
# not to the method-1 dict.
cookies = RequestsCookieJar()
# For reference, the class docstring of RequestsCookieJar in the requests
# source reads:
#
#   Compatibility class; is a cookielib.CookieJar, but exposes a dict
#   interface.
#
#   This is the CookieJar we create by default for requests and sessions that
#   don't specify one, since some clients may expect response.cookies and
#   session.cookies to support dict operations.
#
#   Requests does not use the dict interface internally; it's just for
#   compatibility with external client code. All requests code should work
#   out of the box with externally provided instances of ``CookieJar``, e.g.
#   ``LWPCookieJar`` and ``FileCookieJar``.
#
#   Unlike a regular CookieJar, this class is pickleable.
#
#   .. warning:: dictionary operations that are normally O(1) may be O(n).
print(cookies.__dict__)
# Output for the freshly created, empty jar:
# {'_policy': <http.cookiejar.DefaultCookiePolicy object at 0x0000022AF605BA30>,
# '_cookies_lock': <unlocked _thread.RLock object owner=0 count=0 at 0x0000022AF605B6C0>,
# '_cookies': {}}