import requests
from bs4 import BeautifulSoup
# Douban book tag-cloud page; lists the ~120 popular book tags.
baseUrl = "https://book.douban.com/tag/?view=cloud"
# Browser-like request headers so the scraper is not rejected as a bot.
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
def get_books():
    """Scrape Douban's book tag cloud and return the list of tags.

    Returns:
        list[dict]: one dict per tag with keys "title" (tag name) and
        "num" (book-count string); roughly 120 popular tags in total.
        Returns an empty list when the expected table is not found.
    """
    lists = []
    html = requests.get(baseUrl, headers=headers)
    soup = BeautifulSoup(html.content, "html.parser")
    table = soup.find("table", "tagCol")
    if table is None:
        # Request blocked or page layout changed: return empty instead of
        # crashing with "AttributeError: 'NoneType' object has no attribute
        # 'find_all'".
        return lists
    for row in table.find_all("tr"):
        for cell in row.find_all("td"):
            lists.append({"title": cell.a.string, "num": cell.b.string})
    return lists
if __name__ == "__main__":
    # Script entry point: fetch the tag list and display it.
    print(get_books())
# 2017-04-21 按区爬取链家二手房 (crawl Lianjia second-hand listings district by district)
import pymysql
import requests
from bs4 import BeautifulSoup
import time
# Browser-like request headers so the scraper is not rejected as a bot.
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
# Guangzhou district URL slug -> Chinese district name (used as the row tag).
qu_lists = {'tianhe':'天河','yuexiu':'越秀','liwan':'荔湾','haizhu':'海珠','panyu':'番禺','baiyun':'白云','huangpugz':'黄埔','conghua':'从化','zengcheng':'增城','huadu':'花都','luogang':'萝岗','nansha':'南沙'}
def _opt_text(parent, name, cls):
    """Return ``parent.find(name, cls).text`` or "" when the tag is absent."""
    node = parent.find(name, cls)
    return node.text if node else ""

def get_books(start):
    """Scrape page ``start`` of second-hand listings for every district.

    Args:
        start (int): 1-based result-page number, substituted into the %d
            placeholder of the listing URL.

    Returns:
        list[dict]: one dict per listing (tag, title, link, prices, ...).
    """
    # %d is the page-number placeholder; '待选' ("to be chosen") is replaced
    # with each district slug in turn.
    url = "http://gz.lianjia.com/ershoufang/待选/pg%d/"
    lists = []
    for slug in qu_lists:
        full_url = url.replace('待选', slug) % start
        tag = qu_lists[slug]
        html = requests.get(full_url, headers=headers)
        soup = BeautifulSoup(html.content, "html.parser")  # parse the page
        content = soup.find("ul", "sellListContent")
        if content is None:
            # Blocked request or layout change: skip this district instead of
            # raising "AttributeError: 'NoneType' object has no attribute
            # 'find_all'".
            time.sleep(3.5)
            continue
        for item in content.find_all("li"):  # one <li> per house listing
            title_div = item.find("div", "title")
            house_info = item.find("div", "houseInfo")
            position = item.find("div", "flood").find("div", "positionInfo")
            tag_div = item.find("div", "tag")
            books = {
                "tag": tag,                                 # district name, e.g. 天河
                "title": title_div.a.string,                # listing headline
                "tagBlock": _opt_text(title_div, "span", "new tagBlock"),  # "新上" badge, if any
                "link": title_div.a.get('href'),            # detail-page URL
                "address_xiaoqu": house_info.a.string,      # estate / community name
                "address_info": house_info.a.next_sibling,  # rooms | area | orientation | ...
                "flood": position.span.next_sibling,        # floor level + build year
                "area": position.a.string,                  # sub-district, e.g. 华南
                "total_price": item.find("div", "totalPrice").find("span").text,
                "mean_price": item.find("div", "unitPrice").find("span").text,
                "followInfo": item.find("div", "followInfo").span.next_sibling,
                "subway": _opt_text(tag_div, "span", "subway"),    # metro-distance badge
                "taxfree": _opt_text(tag_div, "span", "taxfree"),  # tax-exemption badge
                "haskey": _opt_text(tag_div, "span", "haskey"),    # agent-holds-key badge
            }
            lists.append(books)
            print(books["area"])
        time.sleep(3.5)  # be polite: throttle between districts
    return lists
if __name__ == "__main__":
    # charset must be given explicitly or Chinese text may fail to insert.
    db = pymysql.connect(host="localhost", user="root", password="root",
                         db="new_schema", charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS lianjia170421")  # recreate from scratch
    # DDL for the listings table.
    createTab = """CREATE TABLE lianjia170421(
    id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
    tag VARCHAR(100) NOT NULL,
    title VARCHAR(100) NOT NULL,
    tagBlock VARCHAR(100) NOT NULL,
    link VARCHAR(100) NOT NULL,
    address_xiaoqu VARCHAR(100) NOT NULL,
    address_info VARCHAR(100) NOT NULL,
    flood VARCHAR(100) NOT NULL,
    area VARCHAR(300) NOT NULL,
    total_price VARCHAR(50) NOT NULL,
    mean_price VARCHAR(100) NOT NULL,
    followInfo VARCHAR(200) NOT NULL,
    subway VARCHAR(100) NOT NULL,
    taxfree VARCHAR(100) NOT NULL,
    haskey VARCHAR(100) NOT NULL
    )"""
    cursor.execute(createTab)
    # %s placeholders let pymysql escape the values safely; the statement is
    # loop-invariant, so build it once instead of on every row.
    sql = "INSERT INTO `lianjia170421`(`tag`,`title`,`tagBlock`,`link`,`address_xiaoqu`,`address_info`,`flood`,`area`,`total_price`,`mean_price`,`followInfo`,`subway`,`taxfree`,`haskey`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    try:
        for start in range(1, 100):
            lists = get_books(start)  # scrape one page per district
            for i in lists:
                try:
                    cursor.execute(sql, (i["tag"], i["title"], i["tagBlock"], i["link"],
                                         i["address_xiaoqu"], i["address_info"], i["flood"],
                                         i["area"], i["total_price"], i["mean_price"],
                                         i["followInfo"], i["subway"], i["taxfree"], i["haskey"]))
                    db.commit()
                    # BUG FIX: was i["name"], which raised KeyError (swallowed
                    # by a bare except) so the success message never printed.
                    print(i["title"] + " is success")
                except pymysql.Error:
                    # Roll back only DB errors; don't hide programming errors.
                    db.rollback()
            time.sleep(3.5)  # throttle between result pages
    finally:
        db.close()  # always release the connection, even on crash/interrupt
# 为什么会报这个错。。(Why is this error raised?)
#   AttributeError: 'NoneType' object has no attribute 'find_all'
# Answer: a preceding soup.find(...) returned None — the expected tag was not
# in the page (request blocked, empty page, or the site layout changed) — and
# .find_all() was then called on that None. Guard the find() result before use.
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 22 11:03:17 2018
@author: Rainey
"""
import requests
from bs4 import BeautifulSoup
# Browser-like request headers so the POST is not rejected as a bot.
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
# Form payload for the scoreboard endpoint: stage "复赛" (semi-final),
# sub-stage 8, stage type 1.
data = {
'stageKey': '复赛',
'subKey': 8,
'stageType': 1
}
def get_html(retries=3):
    """POST the scoreboard request and return the response body as text.

    Args:
        retries (int): number of extra attempts after a failed request.
            Added with a default so existing callers are unchanged; the
            original retried forever via unbounded recursion.

    Returns:
        str | None: the response text, or None if every attempt failed.
    """
    url = "http://codecraft.devcloud.huaweicloud.com/Home/TeamScoreDisplays"
    for _ in range(retries + 1):
        try:
            return requests.post(url=url, headers=headers, data=data).text
        except Exception:
            # BUG FIX: the original did `except: get_html()` and discarded the
            # recursive call's return value, so even a successful retry still
            # returned None. Retry in a bounded loop and return the result.
            continue
    return None
if __name__ == "__main__":
    # BUG FIX: the original called get_html() twice, issuing two HTTP POSTs
    # (one printed, a different response parsed). Fetch once and reuse.
    html = get_html()
    print(html)
    # Explicit parser avoids bs4's "no parser specified" warning and makes
    # the result deterministic across machines.
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.find_all("tr"):  # one <tr> per team
        team_cell = row.find("td", "first-td")
        score_cell = row.find("td", "score")
        if team_cell is None or score_cell is None:
            # Header/spacer rows have no team or score cell; the original
            # crashed here with AttributeError on .string of None.
            continue
        books = {"team": team_cell.string, "score": score_cell.string}
        print(books["team"])
        print(books["score"])