Package configuration
import requests
import ast
import json
# (Python 3; the old Python 2 reload(sys) / sys.setdefaultencoding('utf-8') hack is no longer needed)
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/44.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}
When you can find the corresponding JSON data for the page:
url = ""
# In the browser dev tools, scroll to the bottom of the request's Headers panel to find the postdata section
postdata = {
}
# Request helper: fetch a page and return its text
def get_html(url):
    try:
        r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        r.raise_for_status()  # raise HTTPError if the status code signals an error (4xx/5xx)
        r.encoding = r.apparent_encoding  # guess the encoding from the content to avoid mojibake
        return r.text
    except requests.RequestException:
        return "Error: something is wrong!"
result = requests.post(url, data=postdata, headers=headers).text
# Clean the returned string to suit your needs (here: drop newlines and the 4 leading junk characters)
result = result.replace('\n', '')[4:]
result = result.replace('null,', '')
# Convert a string holding a list/dict literal into a real list/dict
clean_result = ast.literal_eval(result)
print(json.dumps(clean_result, ensure_ascii=False))
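When the endpoint returns clean JSON (nothing to strip off the front), requests can decode it directly instead of going through ast.literal_eval; a minimal sketch reusing the url/postdata/headers above:
resp = requests.post(url, data=postdata, headers=headers, timeout=30)
resp.raise_for_status()
data = resp.json()  # parses the body as JSON; raises ValueError if the body is not valid JSON
print(json.dumps(data, ensure_ascii=False, indent=2))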
When you need to parse the HTML itself:
Generally you will use Beautiful Soup: https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
BeautifulSoup is a library for parsing, traversing, and maintaining the "tag tree" (t.name, t.string, t.attrs); see the short demo below.
Another option is regular expressions (re): https://docs.python.org/3/library/re.html
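A minimal sketch of those three tag attributes, run on a made-up snippet of HTML:
from bs4 import BeautifulSoup

t = BeautifulSoup('<div class="feeds-item"><a href="/x.html">hello</a></div>', 'lxml').a
print(t.name)    # 'a' -- the tag's name
print(t.string)  # 'hello' -- the tag's text content
print(t.attrs)   # {'href': '/x.html'} -- dict of the tag's attributes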
import re
from bs4 import BeautifulSoup

def getMobileInfo(base_url, page):
    result = []
    for i in range(page):
        # Build the url for page i
        url = ''.join([base_url, str(i), '.html'])
        html = requests.get(url, headers=headers).text
        beautyHtml = BeautifulSoup(html, 'lxml')  # 'lxml' is the most complete parser and trips up least often
        # Usually you search for the matching container tags
        div_list = beautyHtml.find_all('div', attrs={'class': 'feeds-item'})
        if div_list:
            for item in div_list:
                # Get the value of a tag attribute
                href = item.div.h3.a['href']
                # Get the tag's text
                name = item.div.h3.a.text
                reg = r'201\d年\d+月'
                regResult = re.findall(reg, name, re.S)
                if len(regResult) != 0:
                    print('___Name:', name)
                    print('___ADD:', href)
                    result.append(href)
    return result
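A hedged usage sketch; the base url and page count here are placeholders, not from the original notes:
# hypothetical listing pages named <base>0.html, <base>1.html, ...
links = getMobileInfo('http://example.com/mobile/page', 3)
print(len(links), 'matching article links collected')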
## get all links in the page (assumes soup = BeautifulSoup(html, 'lxml') from above)
link_list = [link.get('href') for link in soup.find_all('a')]
external_links = []
for link in link_list:
    if link is not None and link[:4] == 'http':  # note: `and` short-circuits, so None never reaches the slice
        external_links.append(link)
# the same filter as a list comprehension
external_links = [link for link in link_list if link is not None and link.startswith('http')]
HTML structure and regex documentation
What the various tags mean: https://www.w3schools.com/tags/
Most browsers parse HTML into a DOM: https://www.w3.org/DOM/
Regular expressions: https://regexone.com/ , https://docs.python.org/3/howto/regex.html (quick demo below)
Regex testing: http://regexr.com/
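A quick check of the date pattern used in getMobileInfo above, run against a made-up string:
import re

sample = '发布于2017年5月的新机'  # made-up title containing a year/month date
print(re.findall(r'201\d年\d+月', sample))  # ['2017年5月']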
Proxy pool usage: pull in the /wenshu_scrach/ folder and smilerequest.py (first cd into the ip folder, then get it running); a generic requests proxy sketch is below.
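That proxy pool is project-specific; with plain requests, routing a request through a proxy looks like this (the proxy address is a placeholder, not one from the notes):
proxies = {
    'http': 'http://127.0.0.1:8080',   # placeholder proxy address
    'https': 'http://127.0.0.1:8080',
}
r = requests.get('http://httpbin.org/ip', headers=headers, proxies=proxies, timeout=30)
print(r.text)  # the reported origin ip should be the proxy's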
Ready-made functions
List scraping
def getListData(outputfile, *args):
    url = ''.join(args)
    print(url)
    html = requests.get(url, headers=headers).text
    beautyHtml = BeautifulSoup(html, 'lxml')
    # Step 3: the custom extraction rules -- the most important part
    # Search for tags
    tag = beautyHtml.find('div', attrs={'class': 'bottom'})
    tags = beautyHtml.find_all('div')
    tags = beautyHtml.find_all(id='html')
    sub_tags = tag.find_all('div')
    # Get a tag's value
    tag_value = tag.div.string  # type: bs4.element.NavigableString, a str subclass; convert with str() if needed
    tag_value = tag.text        # all the text nested under the tag, as str
    # Get an attribute's value; multi-valued attributes like 'class' come back as a list
    property_value = tag['class']
if __name__ == '__main__':
    # Step 1: set up the output file
    outputFilePath = '/Users/i309929/Desktop/output.txt'
    outputfile = open(outputFilePath, 'w')
    # Step 2: build the url
    url = 'http://www.aqistudy.cn/historydata/index.php'
    getListData(outputfile, url)
    outputfile.close()
Table scraping
def getExcelFromWebTable(outputSheet, *keyword):
    global start_row
    url = ''.join(keyword)
    print(url)
    html = requests.get(url, headers=headers).text
    beautyHtml = BeautifulSoup(html, 'lxml')
    tables = beautyHtml.find_all('table')
    if tables:
        print("Found " + str(len(tables)) + " table(s)")
        print("Taking the first table by default.......")
        first_table = tables[0]
        trs = first_table.find_all('tr')
        if trs:
            row_count = len(trs)
            # print("row count: " + str(row_count))
            for i in range(row_count):
                tr = trs[i]
                tds = tr.find_all('td')
                if not tds:  # header rows use <th> instead of <td>
                    tds = tr.find_all('th')
                col_count = len(tds)
                # print("column count: " + str(col_count))
                for j in range(col_count):
                    td = tds[j]
                    text = td.text
                    outputSheet.write(start_row, j + 1, label=text)
                start_row = start_row + 1
        else:
            print("The table's tr rows contain no data")
    else:
        print("No table data at " + url)
if __name__ == '__main__':
    '''
    ------------------------------------------------------------------------------------------------
    Read one query keyword per line from a text file and iterate over them
    '''
    # inputFilePath = '/Users/i309929/Desktop/cities.txt'
    # queryKeywords = open(inputFilePath, 'r')
    #
    # outputFile = ExcelWrite.Workbook(encoding='utf-8')
    # outputSheet = outputFile.add_sheet("output_sheet", cell_overwrite_ok=True)
    #
    # for keyword in queryKeywords:
    #     start_row = start_row + 1
    #     outputSheet.write(start_row, 0, label=keyword)
    #     keyword = keyword.rstrip()
    #     print('-------------keyword: ' + str(keyword) + ' ----: ')
    #     baseURL = 'http://www.aqistudy.cn/historydata/monthdata.php?city='
    #     getExcelFromWebTable(outputSheet, baseURL, str(keyword))
    #     sleep(1)
    #
    # queryKeywords.close()
    # outputFile.save('/Users/i309929/Desktop/output.xls')
    '''
    ------------------------------------------------------------------------------------------------
    Iterate directly
    '''
    start_row = 0  # the table writer above advances this module-level row cursor
    # ExcelWrite is presumably `import xlwt as ExcelWrite` -- xlwt's Workbook/add_sheet/write API matches
    outputFile = ExcelWrite.Workbook(encoding='utf-8')
    outputSheet = outputFile.add_sheet("output_sheet", cell_overwrite_ok=True)
    getExcelFromWebTable(outputSheet, 'http://tianqihoubao.com/weather/top/beijing.html')
    outputFile.save('/Users/i309929/Desktop/output.xls')