Week 1 Assignment: Scraping 58同城 (58.com)!

import re
import time

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    'Cookie': 'bj58_id58s="SUhPQjZiVFhkZmhRNDMwNA=="; id58=c5/njVdgFfC1OWDkBL3zAg==; als=0; cookieuid=3b144215-0af2-4160-acda-c933c317cf5e; cookieuid1=c5/npldhQe628cUkAwb/Ag==; m58comvp=t15v115.159.229.23; br58_ershou=salev1_ershou; mcity=bj; mcityName=%E5%8C%97%E4%BA%AC; nearCity=%5B%7B%22cityName%22%3A%22%E5%8C%97%E4%BA%AC%22%2C%22city%22%3A%22bj%22%7D%5D; 58home=bj; myfeet_tooltip=end; ipcity=jz%7C%u664B%u4E2D; city=bj; sessionid=e1a173e8-1d0e-4599-ae67-87be5277713d; 58tj_uuid=434e0046-9e40-4619-bee2-da01d1591be7; new_session=0; new_uv=4; utm_source=; spm=; init_refer=; final_history=26266458742210%2C26266609472964%2C26074993844266%2C26147111979322%2C26251928992684; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=4',
    'Referer': 'http://bj.58.com/pbdn/pn2/?PGTID=0d305a36-0000-1084-dd93-0b8620717743&ClickID=3',
}


def get_counter(url):
    """Fetch the page-view count of one listing from 58's counter API."""
    # Detail URLs end in '<info_id>x.shtml'; extract the numeric id.
    # (str.strip('x.shtml') strips characters, not a suffix, so split instead.)
    info_id = url.split('/')[-1].split('x.shtml')[0]
    api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
    js = requests.get(api, headers=headers)
    # The API returns a short JS snippet; the count is the text after the last '='.
    return js.text.split('=')[-1]


def get_links(list_url):
    """Collect the detail-page URLs from one listing page."""
    wb_data = requests.get(list_url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.find_all(href=re.compile('http://bj.58.com/pingbandiannao/2'))
    # Drop the tracking query string, keeping only the clean URL.
    return [link.get('href').split('?')[0] for link in links]


def get_content(link):
    """Scrape title, post time, category, price, location and views from one detail page."""
    wb_data = requests.get(link, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    time.sleep(1)  # be polite: pause between requests
    times = soup.select('.time')
    cates = soup.select('.crb_i > a')
    prices = soup.select('span.price')
    locations = soup.select('.c_25d > a')
    data = {
        'title': soup.title.text,
        'time': times[0].get_text(),
        'cate': cates[1].get_text(),
        'price': prices[0].get_text(),
        'location': ' '.join(loc.get_text() for loc in locations),
        'views': get_counter(link),
    }
    print(data)


urls = ['http://bj.58.com/pbdn/pn{}/'.format(i) for i in range(1, 10)]
for url in urls:
    for link in get_links(url):
        get_content(link)
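The script only prints each record as it is scraped. If you want to keep the results, a minimal sketch of appending each record to a CSV file follows; the save_row helper and the 58_pbdn.csv filename are my own illustration, not part of the original assignment:

import csv
import os

FIELDNAMES = ['title', 'time', 'cate', 'price', 'location', 'views']

def save_row(row, path='58_pbdn.csv'):
    # Append one scraped record, writing the header row only
    # when the file does not exist yet.
    is_new = not os.path.exists(path)
    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        if is_new:
            writer.writeheader()
        writer.writerow(row)

With this in place, get_content could call save_row(data) instead of print(data), so records already scraped survive even if the crawl dies part-way through.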