import requests
from lxml import html
# Default headers for outbound HTTP requests; the desktop-Chrome User-Agent
# helps avoid bot-targeted or mobile responses from the scraped sites.
# NOTE(review): 'Accept': '/' looks like a typo for '*/*' — confirm before changing.
http_headers = {
    'Accept': '/',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36',
}
def get_real_url(url, try_count=1):
    """Follow redirects and return the final URL for *url*.

    Makes up to 3 attempts total (counting from *try_count*); on an error
    status or a network failure it retries, and once attempts are exhausted
    it falls back to returning the original *url* unchanged.
    """
    # Iterative form of the original tail recursion: same attempt budget,
    # same fallback, without growing the call stack.
    while try_count <= 3:
        try:
            rs = requests.get(url, headers=http_headers, timeout=10)
        except requests.RequestException:
            # Narrowed from a bare except: only network/HTTP-layer failures
            # should trigger a retry, not e.g. KeyboardInterrupt.
            try_count += 1
            continue
        if rs.status_code > 400:
            # Original used a strict > 400, so a literal 400 response is
            # treated as success — kept for backward compatibility.
            try_count += 1
            continue
        return rs.url
    return url
def getContent(num):
    """Scrape the postgraduate major catalogue page and print, for each
    major, a short description fetched from Baidu Baike.

    Output format: '#<category>' / '##<subcategory>' / '###<major>' headings
    followed by the first paragraph of the matching Baike article, or "null"
    when the lookup fails.
    """
    # `num` is unused by the body but kept so existing callers (getContent(0))
    # keep working.
    catalogue_url = 'http://kaoyan.eol.cn/html/ky/zyml/'
    page = html.fromstring(requests.get(catalogue_url).content)

    # The page never changes between iterations, so evaluate each xpath once
    # instead of re-running it inside the O(n^3) loop nest as before.
    categories = page.xpath("//span[@class='font10']/strong/text()")
    subcategories = page.xpath("//span[@class='font10']/text()")
    majors = page.xpath("//div[@align='left']/a/text()")

    baike_search = "http://baike.baidu.com/search/word?word="

    for category in categories:
        print('#' + category)
        category_code = category[0:2]  # leading 2-char category code
        for sub in subcategories:
            if sub[0:2] != category_code:
                continue
            print('##' + sub)
            sub_code = sub[0:4]  # leading 4-char subcategory code
            for major in majors:
                if major[0:4] != sub_code:
                    continue
                print('###' + major)
                # Entries look like a 6-char code followed by the major name;
                # the name (the Baike query term) starts at index 6.
                # (Renamed from `str`, which shadowed the builtin.)
                term = major[6:].strip()
                try:
                    resolved = get_real_url(baike_search + term)
                    body = requests.get(resolved, headers=http_headers,
                                        allow_redirects=False).content
                    doc = html.fromstring(body)
                    paragraphs = doc.xpath("//div[@class='para']/text()")
                    # Re-encode/decode to repair mojibake: the text was
                    # decoded as latin-1 but is actually UTF-8.
                    print(paragraphs[0].encode("latin1").decode("utf8"))
                except Exception:
                    # Best-effort per major: network errors, an empty
                    # paragraph list (IndexError) or codec errors all fall
                    # back to the "null" placeholder, as before — but no
                    # longer via a bare `except:`.
                    print("null")
if __name__ == "__main__":
    # Run the scraper only when executed as a script; importing this module
    # no longer triggers network requests as a side effect.
    getContent(0)