我的代码
from bs4 import BeautifulSoup
# Scrape product cards from a locally saved listing page and collect, for each
# card, its image URL, price, title, star rating and review-count text.
path = './index.html'

# Read raw bytes so BeautifulSoup detects the document encoding itself rather
# than depending on the platform's default text encoding.
with open(path, 'rb') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

# Every field lives under the same product-card container; keep the shared
# selector prefix in one place so the five selectors cannot drift apart.
_card = 'body > div > div > div.col-md-9 > div > div > div'

pics = soup.select(_card + ' > img')  # image elements (URL is in `src`)
prices = soup.select(_card + ' > div.caption > h4.pull-right')  # price
titles = soup.select(_card + ' > div.caption > h4 > a')  # product title
rates = soup.select(_card + ' > div.ratings > p:nth-of-type(2)')  # star rating
counts = soup.select(_card + ' > div.ratings > p.pull-right')  # number of ratings

infos = []
# NOTE: zip() stops at the shortest list, so a card missing any one field
# silently shifts or drops entries; the lists are assumed to line up 1:1.
for pic, price, title, rate, count in zip(pics, prices, titles, rates, counts):
    info = {
        'pic': pic.get('src'),
        'price': price.get_text(),
        'title': title.get_text(),
        # The rating is the number of <span> elements whose class attribute is
        # exactly "glyphicon glyphicon-star" inside the rating paragraph.
        'rate': len(rate.find_all("span", "glyphicon glyphicon-star")),
        'count': count.get_text(),
    }
    infos.append(info)
    print(info)
总结
- Beautiful Soup 是一个可以从HTML或XML文件中提取数据的Python库
- Copy selector和Copy Xpath都是描述一个元素在网页中位置的方式
- find_all("p", "title")
# [<p class="title"><b>The Dormouse's story</b></p>]
返回的是一个列表,其中包含所有 CSS class 为 "title" 的 <p> 标签