1、环境
- pycharm,python3.6以上(代码使用了f-string,需3.6+),requests,BeautifulSoup4,chrome
2、代码
import requests
from bs4 import BeautifulSoup
from urllib import parse
RQS_ID = '' # *** manually copy the id of the first comment here (from the browser dev tools)
ROOT_COMMENT_MAX_ID = ''  # pagination cursor; filled in from page 1's "load more" action-data
ROOT_COMMENT_MAX_ID_TYPE = ''  # cursor type that accompanies ROOT_COMMENT_MAX_ID; filled in from page 1
def get_con_page(nbr):
    """Fetch one page of Weibo comments, print each comment's text, and
    update the module-level pagination cursors for the next page.

    Args:
        nbr: 1-based page number. Page 1 uses the plain endpoint; later
            pages must pass the cursor values captured from the previous
            page (ROOT_COMMENT_MAX_ID / ROOT_COMMENT_MAX_ID_TYPE).

    Raises:
        RuntimeError: if the response contains no pagination marker
            (e.g. last page reached, or the cookie is invalid/expired).
    """
    global RQS_ID, ROOT_COMMENT_MAX_ID, ROOT_COMMENT_MAX_ID_TYPE
    headers = {
        "Cookie": ""  # *** manually paste your logged-in weibo.com cookie here
    }
    if nbr == 1:
        res = requests.get(
            f'https://weibo.com/aj/v6/comment/big?ajwvr=6&id={RQS_ID}&from=singleWeiBo',
            headers=headers
        )
    else:
        # Fix: the original mixed %-formatting into an implicitly-concatenated
        # chain of f-strings (`f'&page=%s...' ... % nbr`); use one consistent
        # f-string style for the whole URL instead.
        res = requests.get(
            f'https://weibo.com/aj/v6/comment/big'
            f'?ajwvr=6&id={RQS_ID}'
            f'&root_comment_max_id={ROOT_COMMENT_MAX_ID}'
            f'&root_comment_max_id_type={ROOT_COMMENT_MAX_ID_TYPE}'
            f'&root_comment_ext_param='
            f'&page={nbr}&filter=hot'
            f'&filter_tips_before=0&from=singleWeiBo',
            headers=headers
        )
    # The endpoint returns JSON whose 'data.html' field is an HTML fragment
    # containing the rendered comment list.
    html = res.json()['data']['html']
    soup = BeautifulSoup(html, 'html.parser')
    # Each comment is wrapped in <div node-type="replywrap">; its text body
    # lives in a child <div class="WB_text">.
    m_con_list = soup.find_all('div', attrs={'node-type': 'replywrap'})
    for m_con in m_con_list:
        con_text = m_con.find('div', class_='WB_text').text.strip()
        print(con_text)
    # The next-page cursor travels in an "action-data" attribute, carried by
    # either the loading placeholder div or the "click for more" anchor.
    action_data = soup.find('div', attrs={'node-type': 'comment_loading'})
    if action_data is not None:
        action_data = action_data['action-data']
    else:
        more_link = soup.find('a', attrs={'action-type': 'click_more_comment'})
        if more_link is None:
            # Robustness fix: the original crashed here with an opaque
            # "'NoneType' object is not subscriptable" TypeError when no
            # pagination marker was present; fail with a clear message.
            raise RuntimeError(
                'No pagination marker found on page %d: either the last page '
                'was reached or the cookie is invalid.' % nbr
            )
        action_data = more_link['action-data']
    print(action_data)
    # action-data is a query string (k1=v1&k2=v2...); parse_qs maps each key
    # to a list of values, so take element [0] of each.
    parse_qs = parse.parse_qs(action_data)
    RQS_ID = parse_qs['id'][0]
    ROOT_COMMENT_MAX_ID = parse_qs['root_comment_max_id'][0]
    ROOT_COMMENT_MAX_ID_TYPE = parse_qs['root_comment_max_id_type'][0]
# Crawl comment pages 1 through 149, printing a page header before each
# fetch and a horizontal rule after it.
for page_no in range(1, 150):
    print(f'第{page_no}页')
    get_con_page(page_no)
    print('-' * 120)
运行结果:
欢迎大家加入qq群一起交流爬虫技术:python爬虫技术交流群(494976303)