作业1:在 MongoDB 中筛选房源
代码:
import pymongo
from bs4 import BeautifulSoup
import requests
# Connect to the MongoDB server on localhost:27017, then take handles to the
# 'xiaozhu' database and its 'info' collection, where scraped listings go.
client = pymongo.MongoClient(host='localhost', port=27017)
xiaozhu = client.get_database('xiaozhu')
info = xiaozhu.get_collection('info')
def get_info(page=1):
    """Scrape xiaozhu.com search-result pages into the ``info`` collection.

    Fetches ``http://bj.xiaozhu.com/search-duanzufang-p{n}-0/`` for every n
    from 1 through ``page`` (inclusive).  On each page the title elements
    (``span.result_title``) and price elements (``span.result_price > i``)
    are paired in document order, and one ``{'title': str, 'price': int}``
    document is inserted per pair.  A completion message is printed once all
    pages have been processed.

    Args:
        page: Number of the last result page to fetch; defaults to 1.
    """
    url_template = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'
    for page_number in range(1, page + 1):
        response = requests.get(url_template.format(page_number))
        parsed = BeautifulSoup(response.text, 'lxml')
        listings = zip(
            parsed.select('span.result_title'),
            parsed.select('span.result_price > i'),
        )
        for title_tag, price_tag in listings:
            record = {}
            record['title'] = title_tag.get_text()
            # Stored as an int so prices can be compared numerically later.
            record['price'] = int(price_tag.get_text())
            info.insert_one(record)
    print('数据采集完成!')
#get_info(3)
# Print every stored listing priced at 500 or more.  The price condition is
# passed to find() as a MongoDB filter, so the server returns only the
# matching documents instead of the whole collection being scanned in Python.
for fang in info.find({'price': {'$gte': 500}}):
    print(fang)
总结:熟悉了 MongoDB 的安装,中间通过搜索解决了自启动的问题。学习了 MongoDB 的基本用法:插入数据、查询数据。
作业2:爬取手机号
代码:
from bs4 import BeautifulSoup
import requests
import pymongo
import time
# Connect to the MongoDB server on localhost:27017 and open the 'tel'
# database.  'url_list' receives the listing links collected from the list
# pages; 'tel_list' receives the details scraped from each listing page.
client = pymongo.MongoClient(host='localhost', port=27017)
tel = client.get_database('tel')
url_list = tel.get_collection('url_list')
tel_list = tel.get_collection('tel_list')
def get_tel(page=1):
    """Collect listing links from the wh.58.com phone-number list pages.

    Fetches ``http://wh.58.com/shoujihao/pn{n}/`` for every n from 1 through
    ``page`` (inclusive), sleeping one second after each download.  A page is
    harvested only when it contains an ``a`` tag with class ``next``;
    otherwise a notice is printed and the loop moves on to the next page
    number.  Number elements (``a.t > strong.number``) are paired with anchor
    elements (``a.t``) in document order, and for every pair whose href
    contains ``wh.58`` a ``{'title', 'link'}`` document is inserted into
    ``url_list``, with everything from the first ``?`` onward cut off the link.

    Args:
        page: Number of the last list page to fetch; defaults to 1.
    """
    for page_number in range(1, page + 1):
        list_url = 'http://wh.58.com/shoujihao/pn{}/'.format(str(page_number))
        response = requests.get(list_url)
        parsed = BeautifulSoup(response.text, 'lxml')
        time.sleep(1)

        # No "next" link on the page: report it and try the next page number.
        if not parsed.find('a', 'next'):
            print('爬不到了')
            continue

        number_tags = parsed.select('a.t > strong.number')
        anchor_tags = parsed.select('a.t')
        for number_tag, anchor_tag in zip(number_tags, anchor_tags):
            href = anchor_tag.get('href')
            # Only links that mention 'wh.58' are stored; others are ignored.
            if href.rfind('wh.58') == -1:
                continue
            url_list.insert_one({
                'title': number_tag.get_text(),
                'link': href.split('?')[0],
            })
#get_tel(120)
def get_item_info():
    """Scrape the price from every listing page recorded in ``url_list``.

    For each document in ``url_list`` the page at its ``link`` is downloaded
    and the stripped text of the first ``.price`` element is read.  A
    ``{'number', 'price'}`` document is then inserted into ``tel_list``,
    where ``number`` is the ``title`` stored with the link.  Prints a
    completion message after the last listing.
    """
    for listing in url_list.find():
        response = requests.get(listing['link'])
        parsed = BeautifulSoup(response.text, 'lxml')
        number = listing['title']
        price = parsed.select('.price')[0].text.strip()
        tel_list.insert_one({'number': number, 'price': price})
    print('完成')
# Run the detail crawl over every link currently stored in url_list.
get_item_info()
总结:先爬取列表页中的地址并存入数据库,再根据已保存的地址逐个爬取详情页信息。
作业3:设置断点续传
在上一个作业的基础上小改:
def get_item_info():
    """Crawl detail pages only for listings not saved yet, so runs can resume.

    The links stored in ``url_list`` are compared with the ``url`` values
    already present in ``tel_list``; only the links missing from ``tel_list``
    are downloaded.  Each downloaded page produces a
    ``{'number', 'price', 'url'}`` document in ``tel_list``.  Saving ``url``
    is what allows a later run to recognise the page as done.

    A page without any ``.price`` element is reported and skipped; it stays
    pending and will be retried on the next run.  Prints a completion message
    at the end.
    """
    # Keep each link's title next to it so the saved record gets the title
    # that belongs to that URL.
    titles_by_link = {doc['link']: doc['title'] for doc in url_list.find()}

    # tel_list may hold documents saved without a 'url' field; those cannot
    # be matched to a link, so they are ignored instead of raising KeyError.
    finished_urls = {doc['url'] for doc in tel_list.find() if 'url' in doc}

    for url in set(titles_by_link) - finished_urls:
        web_data = requests.get(url)
        soup = BeautifulSoup(web_data.text, 'lxml')

        price_tags = soup.select('.price')
        if not price_tags:
            # One page without a price must not abort the whole crawl;
            # otherwise every resumed run would stop at the same URL.
            print('未找到价格,跳过: {}'.format(url))
            continue

        tel_list.insert_one({
            'number': titles_by_link[url],
            'price': price_tags[0].text.strip(),
            'url': url,
        })
    print('完成')