# Scrape article titles and dates from a website (exercise 005)
import urllib
from bs4 import BeautifulSoup
# Autohome article listing page. Nothing in the visible code passes this to
# get_content_from_autohome(); presumably it is the intended argument -- confirm.
url = "http://www.autohome.com.cn/list/c70-1.html"
def get_content_from_autohome(url_address):
    """Fetch an article listing page and map each article title to its date.

    The page at ``url_address`` is downloaded and parsed. Every
    ``<div id="ArticlesTitlesLeft">`` is treated as a title container (its
    first ``<a>`` supplies the link and the title text) and every
    ``<div id="ArticlesTitlesRigth">`` as a date container. Titles and dates
    are paired by position.

    As a side effect, every article link and title, followed by every date,
    is printed to standard output.

    Args:
        url_address: URL of the listing page to download.

    Returns:
        A dict mapping title text to date text. If the same title appears
        more than once, the last occurrence wins. Title containers without
        an ``<a>`` element are skipped together with their paired date.
    """
    # Function-scope imports keep this block runnable on both Python 3
    # (urllib.request.urlopen) and Python 2 (urllib.urlopen).
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen
    from contextlib import closing

    # closing() guarantees the response is released even if read() raises,
    # and also works on Python 2 where the response is not a context manager.
    with closing(urlopen(url_address)) as response:
        content = response.read()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(content, "html.parser")

    # Locate the title and date containers once and reuse them below.
    # NOTE(review): the "Rigth" spelling is kept exactly as in the original
    # lookup; confirm it matches the id used in the page markup.
    title_divs = soup.find_all("div", id="ArticlesTitlesLeft")
    date_divs = soup.find_all("div", id="ArticlesTitlesRigth")

    # Print every article link and title.
    for title_div in title_divs:
        anchor = title_div.a
        if anchor is None:
            # No link inside this container; nothing to report.
            continue
        print(anchor.get("href"))
        print(anchor.string)

    # Print every article date.
    for date_div in date_divs:
        print(date_div.get_text())

    # Pair titles with dates by position and collect them into a dict.
    title_to_date = {}
    for title_div, date_div in zip(title_divs, date_divs):
        anchor = title_div.a
        if anchor is None:
            # Skip the pair so the remaining titles stay aligned with
            # their own dates.
            continue
        title_to_date[anchor.string] = date_div.get_text()
    return title_to_date