Scrape the midday announcements from 中国证券网 (cnstock.com).
中国证券网 is run by an authoritative organization, so the announcements it publishes are fairly reliable; grabbing them promptly gives you a chance to position yourself in a stock ahead of the crowd.
While working on this I also solved an encoding problem that had bothered me for a long time, which was an unexpected bonus.
A walkthrough of the code:
- Get the locale's default encoding (i.e. the console encoding)
import locale
print locale.getdefaultlocale()[1]
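On my machine this prints the console encoding; on a Chinese Windows install it is typically cp936 (GBK), and the mismatch between that and the site's UTF-8 is the root of the garbled output. A minimal sketch of the round-trip, assuming the locale reports an encoding (the sample string is made up):
# -*- coding: utf-8 -*-
import locale
console_enc = locale.getdefaultlocale()[1]   # e.g. 'cp936' on Chinese Windows
utf8_bytes = u'午间公告'.encode('utf-8')     # bytes as fetched from the page
# decode the bytes to unicode, then re-encode for the console
print utf8_bytes.decode('utf-8').encode(console_enc)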
- Create a directory to store the downloaded files
sub_folder = os.path.join(os.getcwd(), "stock")
if not os.path.exists(sub_folder):
    os.mkdir(sub_folder)
os.chdir(sub_folder)
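As an aside, the exists-then-mkdir pair can race if two runs start at the same moment; a tolerant variant (a sketch reusing the sub_folder variable above):
import errno
try:
    os.makedirs(sub_folder)
except OSError as e:
    if e.errno != errno.EEXIST:   # ignore "already exists", re-raise anything else
        raise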
- Build the name of the output file, e.g. StockNews-[2016-07-19]-[13-05].log
temp_time = time.strftime("[%Y-%m-%d]-[%H-%M]", time.localtime())
store_filename = "StockNews-%s.log" % temp_time
fopen = codecs.open(store_filename, 'w', 'utf-8')
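codecs.open returns a writer that encodes unicode to UTF-8 on the way out, so the loop later in the script can write BeautifulSoup's unicode strings directly. A self-contained sketch (the file name and sample line are made up):
import codecs
f = codecs.open("demo.log", "w", "utf-8")
f.write(u"07-19 13:00\tsample title\n")   # unicode in, UTF-8 bytes on disk
f.close()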
- Set up a proxy
This depends on your environment: some setups do not need a proxy at all, but at my company the script cannot read the page unless one is defined.
proxy_support = urllib2.ProxyHandler({"http":"http://your_proxy:8080/"})
opener = urllib2.build_opener(proxy_support)
urllib2.install_opener(opener)
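A small refinement, if useful: read the proxy from the conventional http_proxy environment variable and fall back to a direct connection instead of hard-coding it (a sketch; adjust to your network):
import os
import urllib2
proxy_url = os.environ.get("http_proxy")   # e.g. "http://your_proxy:8080/"
handlers = [urllib2.ProxyHandler({"http": proxy_url})] if proxy_url else []
opener = urllib2.build_opener(*handlers)
urllib2.install_opener(opener)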
- Parse the page
Outer HTML: <span class="time">07-19 13:00</span>
req = urllib2.Request(url=company_news_site, headers=headers)
resp = urllib2.urlopen(req)
html = resp.read()
#print chardet.detect(html)
soup = BS(html, "html.parser")
all_content = soup.find_all("span", "time")
for i in all_content:
    news_time = i.string
    node = i.next_sibling   # the <a> tag that directly follows the time span
    # decode with the (patched) default encoding, re-encode for the console
    title = node["title"].decode(sys.getdefaultencoding()).encode(locale.getdefaultlocale()[1])
    print news_time, " ", title, " ", node["href"]
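To see why next_sibling lands on the link, here is the same logic run against a hardcoded fragment shaped like the page's list items (the markup is a simplified guess at the real page):
from bs4 import BeautifulSoup as BS
sample = ('<li><span class="time">07-19 13:00</span>'
          '<a href="http://example.com/a.html" title="sample title">...</a></li>')
soup = BS(sample, "html.parser")
for span in soup.find_all("span", "time"):
    link = span.next_sibling   # the <a> directly after the time span
    print span.string, link["title"], link["href"]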
The full script:
#! /usr/bin/env python
#coding=utf-8
from bs4 import BeautifulSoup as BS
import random
import urllib2
import sys
import chardet
import time
import os
import codecs
import locale
print locale.getdefaultlocale()[1]
print sys.getdefaultencoding()
# reload() restores sys.setdefaultencoding, which site.py deletes at startup;
# making utf-8 the default avoids implicit ASCII decode errors later on
reload(sys)
sys.setdefaultencoding('utf-8')
print sys.getdefaultencoding()
sub_folder = os.path.join(os.getcwd(), "stock")
if not os.path.exists(sub_folder):
    os.mkdir(sub_folder)
os.chdir(sub_folder)
temp_time = time.strftime("[%Y-%m-%d]-[%H-%M]", time.localtime())
store_filename = "StockNews-%s.log" % temp_time
fopen = codecs.open(store_filename, 'w', 'utf-8')
stock_news_site = "http://ggjd.cnstock.com/gglist/search/ggkx/"
my_userAgent = [
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)']
user_agent = random.choice(my_userAgent)
headers = {'User-Agent': user_agent, 'Host': "ggjd.cnstock.com",
'DNT': '1',
'Accept': 'text/html, application/xhtml+xml, */*', }
company_news_site = stock_news_site + "0"
print company_news_site
proxy_support = urllib2.ProxyHandler({"http":"your_proxy"})
opener = urllib2.build_opener(proxy_support)
urllib2.install_opener(opener)
req = urllib2.Request(url=company_news_site, headers=headers)
resp = urllib2.urlopen(req)
html = resp.read()
#print chardet.detect(html)
soup = BS(html, "html.parser")
all_content = soup.find_all("span", "time")
#print all_content
for i in all_content:
    news_time = i.string
    node = i.next_sibling   # the <a> tag that directly follows the time span
    # decode with the (patched) default encoding, re-encode for the console
    title = node["title"].decode(sys.getdefaultencoding()).encode(locale.getdefaultlocale()[1])
    print news_time, " ", title, " ", node["href"]
    str_temp = "\n%s\t%s\n---> %s \n\n" % (news_time, node['title'], node['href'])
    fopen.write(str_temp)
fopen.close()
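One thing the script does not guard against is a slow or flaky connection around midday. A retry wrapper sketch (the fetch name and its parameters are illustrative, not part of the script above); the request lines would become html = fetch(company_news_site, headers):
import time
import urllib2

def fetch(url, headers, retries=3, timeout=10):
    # try the request a few times before giving up
    for attempt in range(retries):
        try:
            req = urllib2.Request(url=url, headers=headers)
            return urllib2.urlopen(req, timeout=timeout).read()
        except (urllib2.URLError, IOError):
            if attempt == retries - 1:
                raise
            time.sleep(2)   # brief pause before retrying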