from bs4 import BeautifulSoup
# Parse the locally saved homework page and print one dict per product entry.
#
# The path is written as a raw string so the Windows backslashes stay literal;
# the previous non-raw literal relied on invalid escape sequences ("\P", "\w").
# NOTE(review): no encoding is passed, so the locale default is used - confirm
# the page's encoding and pass it explicitly if the two differ.
with open(r'D:\Py\Plan-for-combating-master\week1/1_2/1_2answer_of_homework/index.html', 'r') as wb_data:
    Soup = BeautifulSoup(wb_data, 'lxml')

    # One CSS selector per field. The price selector is pinned with
    # nth-of-type(1)/nth-of-type(2) while the others are not, so it may match
    # fewer elements than the rest.
    prices = Soup.select('body > div:nth-of-type(1) > div > div.col-md-9 > div:nth-of-type(2) > div > div > div.caption > h4.pull-right')
    titles = Soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4 > a')
    reviews = Soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p.pull-right')
    images = Soup.select('body > div > div > div.col-md-9 > div > div > div > img')
    rates = Soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p:nth-of-type(2)')

    # zip() stops at the shortest list, so a selector that matches fewer
    # elements silently drops the trailing records.
    for price, title, review, image, rate in zip(prices, titles, reviews, images, rates):
        data = {
            'price': price.get_text(),
            'title': title.get_text(),
            'review': review.get_text(),
            'image': image.get('src'),
            # Number of <span> tags found with the class string
            # "glyphicon glyphicon-star" (presumably one per star - verify
            # against the page markup).
            'rate': len(rate.find_all("span", "glyphicon glyphicon-star")),
        }
        print(data)
重点
- nth-child(1) 与 nth-of-type(1) 的差别:
前者匹配的是其父节点下的第一个子元素(不论标签类型)
后者匹配的是其父节点下同类型(同标签名)元素中的第一个
不一定要严格用nth-of-type
可以放宽查找范围,使用 find_all("标签", "class")
- 两种文件读取方式
第一种
fs = open("文件地址", 'r')
print(fs.read())
fs.close()
- 第二种
with open("文件地址", 'r') as fs:
    print(fs.read())