python版本:3.5
爬取目标网址:
https://www.hellobi.com/
源代码
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class TianshanItem(scrapy.Item):
    """Container for one course scraped from edu.hellobi.com."""
    # define the fields for your item here like:
    # course name
    name = scrapy.Field()
    # course link (href of the active nav tab on the course page)
    link = scrapy.Field()
    # number of enrolled students
    stu = scrapy.Field()
lessons.py
# -*- coding: utf-8 -*-
import scrapy
from tianshan.items import TianshanItem
class LessionSpider(scrapy.Spider):
    """Crawl course name, link and student count from edu.hellobi.com course pages."""

    name = "lesson"
    allowed_domains = ["hellobi.com"]
    start_urls = ['https://edu.hellobi.com/course/1']

    def parse(self, response):
        """Yield one TianshanItem for this course page, then enqueue all course pages.

        Uses ``extract_first(default="")`` instead of ``extract()[0]`` so a page
        that lacks one of the elements yields an empty field instead of raising
        IndexError and aborting the whole crawl.
        """
        item = TianshanItem()
        item["name"] = response.xpath(
            "//div[@class='course-info']/h1/text()").extract_first(default="")
        item["link"] = response.xpath(
            "//ul[@class='nav nav-tabs' and @role='tablist']"
            "/li[@class='active']/a/@href").extract_first(default="")
        item["stu"] = response.xpath(
            "//span[@class='course-view']/text()").extract_first(default="")
        yield item
        # Enqueue every course page. Scrapy's duplicate filter drops repeats, so
        # re-yielding these from every parse() call is harmless (though wasteful).
        # NOTE(review): range(1, 142) covers course IDs 1..141 only — confirm
        # whether course 142 was meant to be included.
        for i in range(1, 142):
            url = "https://edu.hellobi.com/course/" + str(i)
            yield scrapy.Request(url=url, callback=self.parse)
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class TianshanPipeline(object):
    """Echo each scraped item to stdout and append it to a local text file."""

    def __init__(self):
        # NOTE(review): hard-coded Windows path kept from the original —
        # consider making it configurable via settings instead.
        path = "E:\\test\\tianshan\\lessons.txt"
        # Explicit utf-8 so Chinese course names don't depend on the
        # platform's default encoding.
        self.fh = open(path, "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Print the three item fields and write one record to the file.

        Returns the item unchanged so later pipelines still receive it.
        """
        print(item["name"])
        print(item["link"])
        print(item["stu"])
        print("")
        # Trailing "\n" after the separator keeps successive records on
        # separate lines (the original ran them together).
        self.fh.write(item["name"] + "\n" + item["link"] + "\n"
                      + item["stu"] + "\n" + "******************\n")
        return item

    def close_spider(self, spider):
        # Scrapy invokes close_spider(spider); the original signature took only
        # self, so the hook raised TypeError and the file was never closed.
        self.fh.close()