# Third-party imports: requests handles the HTTP request, BeautifulSoup parses the HTML, and wget's download saves the images
import requests
from bs4 import BeautifulSoup
from wget import download
# Use a User-Agent copied from the browser's F12 'Network' tab as headers, so the request looks like a normal browser and the crawler stays stable (see get-headers.png).
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                         ' Chrome/86.0.4240.198 Safari/537.36 Edg/86.0.622.69'}
# Target URL
url = r"https://bj.xiaozhu.com/"
# Fetch the page HTML
web_data = requests.get(url, headers=headers).text
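# Optional sanity check (a sketch, not part of the original flow): keeping the
# Response object allows failing fast on a non-200 status before parsing.
# resp = requests.get(url, headers=headers)
# resp.raise_for_status()
# web_data = resp.text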
# Parse the HTML with Python's built-in html.parser (lxml can also be used)
bs = BeautifulSoup(web_data, 'html.parser')
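# Equivalent call with the lxml parser (assumes the lxml package is installed):
# bs = BeautifulSoup(web_data, 'lxml')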
# Use a CSS selector to pick out the target <img> tags; select() returns a ResultSet of Tag objects
imgs = bs.select('#page_list > ul > li:nth-child(1) > a > img')
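# Note: the ':nth-child(1)' part restricts the match to the first listing only.
# A broader selector (a sketch, assuming the same page structure) would grab the
# cover image of every listing:
# imgs = bs.select('#page_list > ul > li > a > img')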
# Iterate over the ResultSet, read the image URL from the tag's attribute, and download it with wget's download()
for img in imgs:
    download(img.get('lazy_src'))
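# A more defensive variant (a sketch; assumes some tags may lack 'lazy_src' or
# carry the URL in 'src' instead), kept commented out for reference:
# for img in imgs:
#     img_url = img.get('lazy_src') or img.get('src')
#     if img_url:
#         download(img_url)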
# ------------------------------------------------------------------------------------------- by Silence @ 2020/11/19 -----------