Beautiful Soup 的中文文档,以及其中 find_all() 函数的用法:
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/#find-all
我的代码:
from bs4 import BeautifulSoup
# Scrape the title, image URL, review text, price and star count of every
# product card in a locally saved HTML page and print one dict per product.
path = 'C:/Users/Google/Desktop/web/index.html'

# Open in binary mode so Beautiful Soup detects the document's encoding
# itself instead of depending on the platform's default text encoding.
with open(path, 'rb') as wb_data:
    soup = BeautifulSoup(wb_data, 'lxml')

# Each select() call returns a list of matching tags in document order.
titles = soup.select('body > div.container > div.row > div.col-md-9 > div.row > div.col-sm-4.col-lg-4.col-md-4 > div.thumbnail > div.caption > h4 > a')
images = soup.select('body > div.container > div.row > div.col-md-9 > div.row > div.col-sm-4.col-lg-4.col-md-4 > div.thumbnail > img')
reviews = soup.select('div.ratings > p.pull-right')
prices = soup.select('div.caption > h4.pull-right')
stars = soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p:nth-of-type(2)')

# NOTE: zip() stops at the shortest list; this assumes every selector
# matches exactly once per product so the fields stay aligned.
for title, image, review, price, star in zip(titles, images, reviews, prices, stars):
    data = {
        'title': title.get_text(),
        'image': image.get('src'),
        'review': review.get_text(),
        'price': price.get_text(),
        # A multi-class string passed to class_ is compared against the
        # whole class attribute value, so this counts the <span> tags whose
        # class is exactly "glyphicon glyphicon-star".
        'star': len(star.find_all("span", class_='glyphicon glyphicon-star')),
    }
    print(data)
- 需要注意的点
'star': len(star.find_all("span", class_='glyphicon glyphicon-star'))
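这句话的作用是:在每个 star 元素(即 div.ratings 下的第二个 <p> 标签)里面,统计 class 值等于 glyphicon glyphicon-star 的 <span> 标签的个数。
注意:给 class_ 传入包含多个类名的字符串时,是按 class 属性的完整字符串精确匹配的,所以 class 为 glyphicon glyphicon-star-empty 的标签不会被计入。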
from bs4 import BeautifulSoup
# Parse a locally saved product-listing page and print, for every product,
# a dict with its title, image URL, review text, price and star count.
path = 'C:/Users/Google/Desktop/web/index.html'

# Binary mode lets Beautiful Soup work out the document's encoding on its
# own rather than decoding with the platform's default text encoding.
with open(path, 'rb') as wb_data:
    soup = BeautifulSoup(wb_data, 'lxml')

# CSS selectors for each field; every call yields a list of tags in
# document order.
titles = soup.select('body > div.container > div.row > div.col-md-9 > div.row > div.col-sm-4.col-lg-4.col-md-4 > div.thumbnail > div.caption > h4 > a')
images = soup.select('body > div.container > div.row > div.col-md-9 > div.row > div.col-sm-4.col-lg-4.col-md-4 > div.thumbnail > img')
reviews = soup.select('div.ratings > p.pull-right')
prices = soup.select('div.caption > h4.pull-right')
stars = soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p:nth-of-type(2)')

# Show the raw rating elements that the star count below is computed from.
print(stars)

# NOTE: zip() truncates to the shortest list; the pairing is only correct
# if each selector matched once per product.
for title, image, review, price, star in zip(titles, images, reviews, prices, stars):
    data = {
        'title': title.get_text(),
        'image': image.get('src'),
        'review': review.get_text(),
        'price': price.get_text(),
        # class_ given a space-separated string matches the exact value of
        # the class attribute, so only <span class="glyphicon glyphicon-star">
        # tags inside this rating element are counted.
        'star': len(star.find_all("span", class_='glyphicon glyphicon-star')),
    }
    print(data)