作用及类型
1、适用于需要 Web 客户端用户名/密码认证的网站
2、requests.get() 的 auth 参数,类型为元组:auth = ('username','password')
import requests
from lxml import etree
class NoteSpider(object):
    """Fetch a page that requires web-client authentication and list its links.

    The username/password pair is handed to ``requests.get`` through its
    ``auth`` parameter as a ``(username, password)`` tuple.
    """

    def __init__(self, timeout=10):
        """Set up the target URL, request headers and credentials.

        Args:
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests`` may block indefinitely.
        """
        self.url = "http://code.tarena.com.cn/"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        # Web-client authentication parameter: (username, password).
        self.auth = ("tarenacode", "code_2013")
        self.timeout = timeout

    def get_code(self):
        """Download the page, extract every ``<a href>`` and print them.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status
                (for example 401 when the credentials are rejected).
        """
        response = requests.get(
            url=self.url,
            auth=self.auth,
            headers=self.headers,
            timeout=self.timeout,
        )
        # Fail loudly instead of parsing an error page as if it were the listing.
        response.raise_for_status()
        html = response.content.decode("utf-8")

        # Parse the document and pull out the link targets.
        parse_html = etree.HTML(html)
        r_list = parse_html.xpath("//a/@href")
        # The first link is skipped -- presumably the parent-directory entry;
        # TODO confirm against the actual listing.
        print(r_list[1:])
# Script entry point: build the spider and print the links it finds.
if __name__ == '__main__':
    NoteSpider().get_code()
把课件下载到本地
import requests
from lxml import etree
import os
class NoteSpider(object):
    """Download every ``.zip`` / ``.rar`` archive linked from a listing page.

    The page requires web-client authentication; the credentials are passed
    to ``requests.get`` through its ``auth`` parameter.
    """

    def __init__(self,
                 url='http://code.tarena.com.cn/AIDCode/aid1909/16_spider/',
                 base_dir='/home/tarena/code/',
                 timeout=30):
        """Configure the spider and make sure the local target directory exists.

        Args:
            url: Listing page to scan. A trailing ``/`` is appended when
                missing, because file names are joined directly onto it.
            base_dir: Local root under which the URL's path is mirrored.
            timeout: Seconds to wait for the server on each request.
        """
        self.url = url if url.endswith('/') else url + '/'
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}
        self.auth = ('tarenacode','code_2013')
        self.timeout = timeout

        # Mirror the URL path (everything after scheme and host) below base_dir.
        # The trailing '' keeps a trailing separator on the result, matching
        # the value the attribute has always had.
        segments = [seg for seg in self.url.split('/')[3:] if seg]
        self.directory = os.path.join(base_dir, *segments, '')
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.directory, exist_ok=True)

    def get_html(self):
        """Fetch the listing page and download each archive it links to.

        Raises:
            requests.HTTPError: If the listing request returns 4xx/5xx.
        """
        response = requests.get(url=self.url, auth=self.auth,
                                headers=self.headers, timeout=self.timeout)
        response.raise_for_status()
        page = etree.HTML(response.text)
        # The link text is used as the file name.
        for name in page.xpath('//a/text()'):
            if name.endswith(('.zip', '.rar')):
                self.download_file(name)

    def download_file(self, r):
        """Download one file named ``r`` from the listing into ``self.directory``.

        Args:
            r: File name as it appears in the listing (relative to ``self.url``).

        Raises:
            requests.HTTPError: If the download request returns 4xx/5xx, so an
                error page is never written to disk under an archive name.
        """
        file_url = self.url + r
        response = requests.get(url=file_url, auth=self.auth,
                                headers=self.headers, timeout=self.timeout)
        response.raise_for_status()
        # basename() stops link text containing path separators from writing
        # outside the target directory.
        filename = os.path.join(self.directory, os.path.basename(r))
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(filename,'下载成功')
# Script entry point: scan the listing page and download its archives.
if __name__ == '__main__':
    NoteSpider().get_html()