1. 先看效果
2. 思路：应对反爬虫机制，伪装 User-Agent 等请求头
3. 上源代码
import re
import requests
from bs4 import BeautifulSoup
class Guiyang(object):
    """Scrape shop-rental ("zhaozu") listings from gy.58.com.

    For every listing on pages 1-9 of the index, prints one line of
    ``price,location,title`` to stdout.
    """

    def __init__(self):
        # Index pages to crawl (1..9).
        self.page = range(1, 10)
        # Page-number placeholder is filled per request in get_data().
        # (Bug fix: the original formatted the repr of the whole range()
        # object into the URL, so pagination never worked.)
        self.url = 'http://gy.58.com/zhaozu/?PGTID=0d00000d-0000-0ee8-d8e7-f5dce12e009e&ClickID={}'
        # Spoofed browser headers to get past basic anti-scraping checks.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            'Host': 'gy.58.com',
            'Upgrade-Insecure-Requests': '1',
        }
        # Last listing URL seen; set by get_data().  (Bug fix: __init__ no
        # longer performs the network fetch itself, which previously made
        # the module-level driver scrape everything twice.)
        self.link_url = None

    def get_data(self):
        """Fetch each index page and print price, location and title per listing."""
        for page in self.page:
            # GET with all parameters in the URL; the original also passed a
            # redundant ``data=`` body, which is meaningless on a GET.
            html = requests.get(url=self.url.format(page),
                                headers=self.headers).text
            soup = BeautifulSoup(html, 'lxml')
            wrap = soup.find('ul', class_='house-list-wrap')
            if wrap is None:
                # Layout changed or the request was blocked; skip this page.
                continue
            for item in wrap.find_all('li'):
                self.link_url = item.find('a')['href']  # detail-page URL of this listing
                name = item.find('span', attrs={'class': 'title_des'}).get_text()
                location = item.find('p', class_='baseinfo').get_text().replace('\n', '')
                try:
                    pricetoday = (item.find('p', class_='unit').get_text()
                                  .replace(' ', '').replace('\n', '').replace('\r', ''))
                except AttributeError:
                    # Some listings have no unit-price element; skip them
                    # (original used a bare ``except: pass`` that hid all errors).
                    continue
                print('{},{},{}'.format(pricetoday, location, name))
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    c = Guiyang()
    c.get_data()