The code is written in Python 2.7. It scrapes the free proxies from xici (xicidaili), tests each one, and stores the usable ones in a database in preparation for later crawlers. The code follows directly below.
```
# -*- encoding=utf-8 -*-
import requests
from lxml import etree
import time
import pymongo
from multiprocessing import Pool


class Getproxy(object):
    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
        self.url = 'http://www.xicidaili.com/wt/'
        self.client = pymongo.MongoClient('localhost', 27017)
        self.xici = self.client['xici']
        self.xiciipinfo = self.xici['xiciipinfo']
        # self.removeip = '127.0.0.1'  # would be checked on the first run; it is only assigned below when a proxy fails the test

    def getip(self, num):
        # Scrape one page of xici proxies and upsert them into the database
        url = self.url + str(num)
        wb_data = requests.get(url, headers=self.headers)
        html = etree.HTML(wb_data.text)
        # htmls = etree.tostring(html)
        ips = html.xpath('//tr[@class="odd"]/td[2]/text()')
        ports = html.xpath('//tr[@class="odd"]/td[3]/text()')
        protocols = html.xpath('//tr[@class="odd"]/td[6]/text()')
        areas = html.xpath('//tr[@class="odd"]/td[4]/a/text()')
        for ip, port, protocol, area in zip(ips, ports, protocols, areas):
            data = {
                'ip': ip,
                'port': port,
                'protocol': protocol,
                'area': area,
            }
            print data
            # self.xiciipinfo.insert_one(data)
            # if self.removeip != ip:  # optional check: skip an ip that already failed the test below, to save test time
            self.xiciipinfo.update({'ip': ip}, {'$set': data}, True)  # upsert keyed on ip

    def count(self, num):
        for i in range(1, num):
            self.getip(i)
            time.sleep(2)

    def dbclose(self):
        self.client.close()

    def getiplist(self):
        # Read the stored proxies and build a list of requests-style proxy dicts
        ips = self.xiciipinfo.find()
        proxylist = []
        for i in ips:
            b = "http" + "://" + i['ip'] + ":" + i['port']
            proxies = {"http": b}
            # print proxies
            proxylist.append(proxies)
        # print proxylist
        return proxylist

    def iptest(self, proxy):
        # Test a proxy; delete it from the database if it is unusable
        ip = proxy['http'][7:].split(':')[0]
        try:
            requests.get('http://wenshu.court.gov.cn/', proxies=proxy, timeout=6)
        except:
            print 'failed...............>>>>>>>>>>>>>>>>>>>>>>>>'
            # self.removeip = ip  # remember the failed ip on the instance
            self.xiciipinfo.remove({'ip': ip})  # remove() deletes the matching document
            print 'remove it now.....{}'.format(ip)
        else:
            print '<<<<<<<<<<<<<<<<<.............success'
            print proxy


if __name__ == '__main__':
    pool = Pool()  # note: this Pool is never used; bound methods cannot be pickled in Python 2, so the built-in map below tests the proxies sequentially
    proxy = Getproxy()
    proxy.count(2)
    iplist = proxy.getiplist()
    map(proxy.iptest, iplist)
    proxy.dbclose()
```
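
With the proxies stored in MongoDB, a later crawler only needs to read the `xiciipinfo` collection and hand one record to `requests`. Below is a minimal sketch of that usage, assuming the same local MongoDB setup as above; the target URL is a placeholder, not part of the original post.

```
# -*- encoding=utf-8 -*-
# Minimal sketch: pick a stored proxy at random and use it for a request.
import random
import requests
import pymongo

client = pymongo.MongoClient('localhost', 27017)
collection = client['xici']['xiciipinfo']

records = list(collection.find())  # proxies that survived iptest (assumes the collection is non-empty)
record = random.choice(records)
proxies = {'http': 'http://{}:{}'.format(record['ip'], record['port'])}

resp = requests.get('http://example.com/', proxies=proxies, timeout=6)  # placeholder URL
print resp.status_code

client.close()
```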