Homework 1
Filter the first 300 listings scraped from Xiaozhu (xiaozhu.com) short-term rentals and find those priced at 500 or more.
Complete code
from bs4 import BeautifulSoup
import requests, time, random, pymongo

client = pymongo.MongoClient('localhost', 27017)
spider = client['spider']
xiaozhuduanzu = spider['xiaozhuduanzu']

# Scrape the link, title and price of each listing on the search pages
def item_link_list(page):
    data = []
    for i in range(1, page + 1):
        # Wait a random 1-3 seconds between pages to avoid hammering the site
        time.sleep(random.randrange(1, 4))
        url = 'http://sh.xiaozhu.com/search-duanzufang-p{}-0/'.format(i)
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        urls = soup.select('ul.pic_list.clearfix > li > a')
        prices = soup.select('div.result_btm_con.lodgeunitname > span > i')
        titles = soup.select('div.result_btm_con.lodgeunitname > div.result_intro > a > span')
        for title, link, price in zip(titles, urls, prices):
            data.append({
                'title': title.get_text(),
                'url': link.get('href'),
                'price': price.get_text(),
            })
    return data
# Map the avatar's CSS class to the host's gender
def returnSex(sexclass):
    if sexclass == 'member_ico':
        return '男'   # male
    if sexclass == 'member_ico1':
        return '女'   # female
# Scrape the detail-page data
def item_detail(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select('div.pho_info > h4 > em')[0].get_text()
    address = soup.select('div.pho_info > p > span.pr5')[0].get_text()
    price = soup.select('div.day_l > span')[0].get_text()
    img = soup.select('#curBigImage')[0].get('src')
    host_img = soup.select('div.member_pic > a > img')[0].get('src')
    host_sex = soup.select('div.member_pic > div')[0].get('class')[0]
    host_name = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')[0].get_text()
    data = {
        'title': title,
        'address': address.strip().rstrip(','),
        'price': price,
        'img': img,
        'host_img': host_img,
        'ownersex': returnSex(host_sex),
        'ownername': host_name
    }
    xiaozhuduanzu.insert_one(data)
# Filter the stored documents: price was saved as text, so cast to int
for i in xiaozhuduanzu.find():
    if int(i['price']) >= 500:
        print(i)
Results: (screenshot of the printed output)
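The loop above has to cast every price from text inside Python. A minimal alternative sketch, assuming the price had instead been stored as a number at insert time (e.g. 'price': int(price.get_text()) in item_link_list): the comparison can then be pushed into the MongoDB query itself.

# Sketch only: assumes 'price' was stored as an int rather than text,
# so MongoDB filters server-side instead of a Python loop doing it.
for doc in xiaozhuduanzu.find({'price': {'$gte': 500}}):
    print(doc)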
Homework 2
Scrape all the mobile phone number listings from 58.com (58同城).
Scraping the list items
# Collection for the listing data, on the same `spider` database as above
phoneitem = spider['phoneitem']

def get_phone_list(who_sells, page):
    # Note: the range starts at 85 here, resuming a run that had
    # already covered the earlier pages
    for i in range(85, page + 1):
        # Wait a random 6-9 seconds per page; 58.com throttles fast crawlers
        time.sleep(random.randrange(6, 10))
        url = 'http://bj.58.com/shoujihao/{}/pn{}'.format(who_sells, i)
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # Page 1 uses a different selector (nth-of-type(2)) than later pages
        if i == 1:
            titles = soup.select('div.boxlist > ul > div:nth-of-type(2) > ul > li > a.t > strong')
            urls = soup.select('div.boxlist > ul > div:nth-of-type(2) > ul > li > a.t')
        else:
            titles = soup.select('div.boxlist > ul > div:nth-of-type(1) > ul > li > a.t > strong')
            urls = soup.select('div.boxlist > ul > div:nth-of-type(1) > ul > li > a.t')
        if len(titles) == 0:
            print('No more listings; stopped at page ' + str(i))
            break
        for title, url in zip(titles, urls):
            link = url.get('href').split('?')[0]
            # Keep only links that stay on bj.58.com
            if 'bj.58.com' in link:
                phoneitem.insert_one({
                    'title': title.get_text(),
                    'url': link
                })
        print('Finished page ' + str(i))
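For reference, a hypothetical invocation; the who_sells value of 0 and the page bound of 120 are illustrative guesses, not values from the original run:

# Hypothetical driver call; both arguments are illustrative only.
get_phone_list(0, 120)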
Scraping the detail-page information from each URL
# Collection for the detail data, on the same `spider` database as above
phonedetail = spider['phonedetail']

def get_detail_phone(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # The title is the phone number; squeeze out all the whitespace
    title = soup.select('div.col_sub.mainTitle > h1')[0].get_text()
    title = title.replace(' ', '').replace('\n', '').replace('\t', '').strip().rstrip(',')
    price = soup.select('span.price.c_f50')[0].get_text().strip().rstrip(',')
    area = list(soup.select('div.su_con')[1].stripped_strings)
    maijia = soup.select('ul.vcard > li > a')[0].get_text()                   # seller name
    maijia_link = soup.select('#t_phone')[0].get_text().strip().rstrip(',')   # seller contact number
    data = {
        'title': title,
        'price': price,
        'area': area,
        'maijia': maijia,
        'maijia_link': maijia_link
    }
    print(data)
    phonedetail.insert_one(data)
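The two steps can then be chained by reading the saved listing URLs back out of phoneitem; a minimal sketch, reusing the random-delay pattern from above:

# Sketch: walk the stored listing URLs and scrape each detail page,
# pausing between requests like the list crawler does.
for item in phoneitem.find():
    time.sleep(random.randrange(2, 5))
    get_detail_phone(item['url'])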
Summary
Because this crawl involves a fairly large amount of data and 58.com has anti-scraping measures, I increased the wait time between requests to improve the crawl's success rate.
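One way to make those longer waits systematic is to funnel every request through a small helper that sleeps a random interval and sends a browser-like User-Agent header. This is only a sketch of the idea, not code from the assignment:

import random
import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # browser-like UA; the exact string is illustrative

def polite_get(url, min_wait=6, max_wait=10):
    # Sleep 6-9 seconds by default (randrange excludes the upper bound),
    # matching the delays used in get_phone_list, then fetch with the
    # header set so the request looks less like a bot.
    time.sleep(random.randrange(min_wait, max_wait))
    return requests.get(url, headers=HEADERS, timeout=30)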