有两个需要关注的点:
- BeautifulSoup的find_all()方法
- 获取review的数量,有没有更简洁的方法?
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
# Scrape the saved product listing page: build one record per product in
# ``goods_info``, then print the title and price of the products that pass
# the rating / review-count filter at the bottom.
goods_info = []

# All per-field selectors share this CSS path to a product card; only the
# tail differs, so keep the common prefix in one place.
_CARD_SELECTOR = 'body > div > div > div.col-md-9 > div > div > div'

# Only the parse needs the open file: BeautifulSoup consumes the handle when
# the soup object is constructed, so the file can be closed right afterwards.
with open('/Users/kain/Documents/Course/Plan-for-combating-master/week1/1_2/1_2answer_of_homework/index.html', 'r') as web_data:
    soup_obj = BeautifulSoup(web_data, 'lxml')

images = soup_obj.select(_CARD_SELECTOR + ' > img')
titles = soup_obj.select(_CARD_SELECTOR + ' > div.caption > h4 > a')
prices = soup_obj.select(_CARD_SELECTOR + ' > div.caption > h4.pull-right')
# Select the whole ratings container rather than the individual star spans,
# so each product contributes a single element and the star spans can be
# counted per product below.
stars = soup_obj.select(_CARD_SELECTOR + ' > div.ratings')
reviews = soup_obj.select(_CARD_SELECTOR + ' > div.ratings > p.pull-right')

# The five lists are paired by position; zip() stops at the shortest one.
for image, title, price, star, review in zip(images, titles, prices, stars, reviews):
    goods_info.append({
        "image": image.get('src'),
        "title": title.get_text(),
        "price": price.get_text(),
        # Count the spans carrying the "glyphicon-star" class. Matching on the
        # single class token (instead of the exact attribute string
        # "glyphicon glyphicon-star") does not depend on class order.
        # NOTE(review): presumably unfilled stars use a different class token
        # (e.g. "glyphicon-star-empty") -- verify against the page markup.
        "star": len(star.find_all("span", class_="glyphicon-star")),
        "review": review.get_text(),
    })

# Print the title and price of every product with more than 3 stars and more
# than 30 reviews.
# NOTE(review): the original comment asked for "3 stars", while the condition
# is strictly greater than 3 -- confirm which threshold is intended.
for each_goods in goods_info:
    # The review text is expected to begin with the review count; split() with
    # no argument tolerates leading/repeated whitespace, and int() accepts the
    # resulting token directly. The count is parsed inside the condition so it
    # is only evaluated for products that already pass the star check.
    if each_goods['star'] > 3 and int(each_goods['review'].split()[0]) > 30:
        # Single pre-formatted argument: same output as ``print a, b`` on
        # Python 2, and also valid on Python 3.
        print('%s %s' % (each_goods['title'], each_goods['price']))