本次任务分成两部分:先把数据存储到 MongoDB,再做查找,较快完成。不过 select 使用的 CSS 选择器路径过长,下次争取用更简短的元素定位方式。
我的成果
我的代码
from bs4 import BeautifulSoup
import requests
from pymongo import MongoClient
# Connect to a local MongoDB server and open the 'xiaozhu' database.
# Two collections are used by the functions below:
#   page_list  - detail-page URLs harvested from the listing pages
#   apartment  - scraped title/area/price records for each listing
client=MongoClient('localhost',27017)
xiaozhu=client['xiaozhu']
page_list=xiaozhu['page_list']
apartment=xiaozhu['apartment']
def get_links():
    """Walk listing pages 1-3 and save every detail-page URL to MongoDB.

    Each matched element carries the target URL in its ``detailurl``
    attribute; one document ``{'url': ...}`` is inserted per listing.
    """
    base = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'
    for page in range(1, 4):
        resp = requests.get(base.format(page))
        soup = BeautifulSoup(resp.text, 'lxml')
        nodes = soup.select('#page_list > ul > li > div.result_btm_con.lodgeunitname')
        for node in nodes:
            page_list.insert_one({'url': node.get('detailurl')})
def get_info(url):
    """Scrape one detail page and store title/area/price in MongoDB.

    Parameters
    ----------
    url : str
        Detail-page URL previously collected by ``get_links``.

    Notes
    -----
    The price is stored as the raw scraped string, so consumers must
    convert it (e.g. ``int(doc['price'])``) before comparing numerically.
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    area = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span')
    price = soup.select('#pricePart > div.day_l > span')
    # Guard each field on its OWN select result: the original checked
    # soup.find_all('span', 'pr5') for 'area' (an unrelated probe) and left
    # 'title'/'price' unguarded, so any page-layout change raised IndexError.
    data = {
        'title': title[0].get_text() if title else None,
        'area': area[0].get_text() if area else None,
        'price': price[0].get_text() if price else None,
    }
    apartment.insert_one(data)
    print('done')
# Pipeline stages, run one at a time and then commented out:
# Stage 1 - harvest listing URLs into page_list:
# get_links()
# Stage 2 - scrape every saved URL into apartment:
# for i in page_list.find():
#     get_info(i['url'])

# Stage 3 - print listings priced at 500+ per night.  Prices were stored
# as strings, so convert before the numeric comparison.
for doc in apartment.find():
    if int(doc['price']) < 500:
        continue
    print(doc['title'], doc['area'])
总结
- 存储价格的时候是字符型了,所以查找的时候需要转化
- 简化定位元素