After all these years of typing in textbook examples, I never really learned to program. This time, working from an example, I've written my first program that I feel is actually somewhat useful. A small thrill!
My results:
My code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import time
url_one = ["http://sh.xiaozhu.com/search-duanzufang-p{}-0/".format(str(i)) for i in range(1, 2)]  # listing-index page URLs (only page 1 for now)
#url = "http://sh.xiaozhu.com/fangzi/2943586863.html"
headers = {
    "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36',  # note: requests expects "User-Agent", not "User_Agent"
"Cookie": 'abtest_ABTest4SearchDate=b; xzuuid=5ec2c267; _gat_UA-33763849-7=1; __utmt=1; OZ_1U_2282=vid=v77195a84c7816.0&ctime=1467062850<ime=1467062847; OZ_1Y_2282=erefer=-&eurl=http%3A//bj.xiaozhu.com/search-duanzufang-p1-0&etime=1467061671&ctime=1467062850<ime=1467062847&compid=2282; _ga=GA1.2.2126392000.1467061672; __utma=29082403.2126392000.1467061672.1467061672.1467061672.1; __utmb=29082403.14.9.1467062851049; __utmc=29082403; __utmz=29082403.1467061672.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
}
# Collect the detail-page URL of every listing on the index pages
def get_url(url_pages, headers):
    url_two = []  # list of detail-page URLs
    for url_page in url_pages:
        wb_data = requests.get(url_page, headers=headers)
        time.sleep(4)  # pause between requests to avoid hammering the site
        soup = BeautifulSoup(wb_data.text, 'lxml')
        url_ls = soup.select("#page_list > ul > li > a")
        for url in url_ls:
            url_content = url.get("href")
            url_two.append(url_content)
    return url_two
# Scrape the details of a single listing
def get_attractions(url, headers):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select("body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em")
    addresses = soup.select("body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5")
    rents = soup.select("#pricePart > div.day_l > span")
    house_images = soup.select("#curBigImage")
    landlord_images = soup.select("#floatRightBox > div.js_box.clearfix > div.member_pic > a > img")
    #landlord_sexes = soup.select("")  # TODO: landlord gender; see the sketch after the code
    landlord_names = soup.select("#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a")
    for title, address, rent, house_image, landlord_image, landlord_name in zip(titles, addresses, rents, house_images, landlord_images, landlord_names):
        title_content = title.get_text()
        address_content = address.get_text()
        rent_content = rent.get_text()
        house_image_content = house_image.get("src")
        landlord_image_content = landlord_image.get("src")
        #landlord_sex_content =   # TODO: fill in once the gender selector works
        landlord_name_content = landlord_name.get_text()
        data = {
            "title": title_content,
            "address": address_content,
            "rent": rent_content,
            "house_image": house_image_content,
            "landlord_image": landlord_image_content,
            "landlord_name": landlord_name_content
        }
        return data  # each detail page holds exactly one listing
# Main program
# Step 1: collect the detail-page URLs
urls = get_url(url_one, headers)
# Step 2: scrape and print the details of each listing
for url in urls:
    time.sleep(4)  # pause between requests
    print(get_attractions(url, headers))
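The landlord's gender is still a TODO above (the landlord_sexes selector is empty). Here is a minimal sketch of one way it might work, reusing the soup object from get_attractions. The selector and the class names "member_ico" / "member_ico1" are my assumptions about Xiaozhu's markup and would need checking in the browser's inspector:

# Hedged sketch for the gender TODO; the selector and class names are
# assumptions about Xiaozhu's markup, not verified against the live page.
def get_landlord_sex(soup):
    icons = soup.select("#floatRightBox > div.js_box.clearfix > div.member_pic > div")
    if not icons:
        return "unknown"
    classes = icons[0].get("class", [])
    if "member_ico1" in classes:  # assumed class for female hosts
        return "female"
    if "member_ico" in classes:   # assumed class for male hosts
        return "male"
    return "unknown"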
Summary:
- Work out the pattern in the first-layer URLs (the listing-index pages) and load them into a list
- From each page in that list, extract the second-layer links (each listing's detail page) into another list
- Scrape the needed fields from each detail page into a dict
- Print the scraped results (see the CSV sketch below for a possible next step)
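For that last step, a minimal sketch (assuming Python 3) of writing the dicts to a CSV file with the standard library instead of printing them; the filename listings.csv is arbitrary, and this would replace the loop in the main program:

import csv

# Sketch: write each listing's dict to listings.csv instead of printing it.
fields = ["title", "address", "rent", "house_image", "landlord_image", "landlord_name"]
with open("listings.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    for url in urls:
        time.sleep(4)  # keep the polite pause between requests
        row = get_attractions(url, headers)
        if row:  # skip pages where the selectors matched nothing
            writer.writerow(row)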
(Still improving...)