1、简单抓取一个网页的数据信息
var http = require('http'),
cheerio = require('cheerio'),
url = 'http://www.imooc.com/learn/348';
// Filter: pull chapter and video information out of the course page HTML.
/**
 * Parse a course page and collect its chapters together with their videos.
 *
 * @param {string} html - Raw HTML of the course page.
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}
 *   One entry per `.chapter` element, in the order `each` visits them.
 */
function filterChapter(html) {
  const $ = cheerio.load(html);
  const courseData = [];
  $('.chapter').each(function () {
    const chapter = $(this);
    const chapterData = {
      chapterTitle: chapter.find('strong').text(),
      videos: []
    };
    chapter.find('.video').children('li').each(function () {
      const video = $(this).find('.J-media-item');
      // Strip spaces and line breaks, then remove the literal phrase "开始学习".
      // The phrase must be matched as a whole: a character class would delete
      // each of those four characters wherever they occur in the real title.
      const videoTitle = video.text()
        .replace(/[ ]/g, "")
        .replace(/[\r\n]/g, "")
        .replace(/开始学习/g, "");
      // attr() yields undefined when the item carries no link; do not call
      // split() on it, just leave the id undefined.
      const href = video.attr('href');
      const id = href ? href.split('video/')[1] : undefined;
      chapterData.videos.push({
        title: videoTitle,
        id: id
      });
    });
    courseData.push(chapterData);
  });
  return courseData;
}
// Print: write one line per video to the console.
/**
 * Log every video of every chapter as "【id】title".
 *
 * @param {Array<{videos: Array<{id: *, title: *}>}>} courseData - Chapters to print.
 */
function printCourseInfo(courseData) {
  // The trailing '\n' leaves an empty line after each entry, on top of the
  // newline console.log already emits.
  const formatVideo = (entry) => '【' + entry.id + '】' + entry.title + '\n';
  courseData.forEach((chapter) => {
    chapter.videos.forEach((entry) => {
      console.log(formatVideo(entry));
    });
  });
}
// Fetch the course page, then parse and print it once the whole body has arrived.
http.get(url, (res) => {
  // Decode at the stream level: appending raw Buffer chunks to a string
  // converts each chunk on its own, which corrupts any multi-byte UTF-8
  // character that happens to be split across two chunks.
  res.setEncoding('utf8');
  let html = '';
  res.on('data', (chunk) => {
    html += chunk;
  });
  res.on('end', () => {
    printCourseInfo(filterChapter(html));
  });
}).on('error', (err) => {
  console.log('获取课程信息出错');
  // Surface the underlying cause instead of discarding it.
  console.error(err);
});
2、利用 Promise 同时异步请求多个页面
var http = require('http'),
cheerio = require('cheerio'),
Promise = require('Promise'),
baseUrl = 'http://www.imooc.com/learn/',
videoIds = [348, 637];
// Filter: pull the course title, chapters and videos out of one course page.
/**
 * Parse a course page into its title and a list of chapters with their videos.
 *
 * @param {string} html - Raw HTML of the course page.
 * @returns {{title: string, videos: Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}}
 *   `title` is the text of `.course-infos h2`; `videos` holds one entry per
 *   `.chapter` element, in the order `each` visits them.
 */
function filterChapter(html) {
  const $ = cheerio.load(html);
  const courseData = {
    title: $('.course-infos h2').text().trim().replace(/[\r\n]/g, ""),
    videos: []
  };
  $('.chapter').each(function () {
    const chapter = $(this);
    const chapterData = {
      chapterTitle: chapter.find('strong').text().replace(/[ ]/g, "").replace(/[\r\n]/g, ""),
      videos: []
    };
    chapter.find('.video').children('li').each(function () {
      const video = $(this).find('.J-media-item');
      // Strip spaces and line breaks, then remove the literal phrase "开始学习".
      // The phrase must be matched as a whole: a character class would delete
      // each of those four characters wherever they occur in the real title.
      const videoTitle = video.text()
        .replace(/[ ]/g, "")
        .replace(/[\r\n]/g, "")
        .replace(/开始学习/g, "");
      // attr() yields undefined when the item carries no link; do not call
      // split() on it, just leave the id undefined.
      const href = video.attr('href');
      const id = href ? href.split('video/')[1] : undefined;
      chapterData.videos.push({
        title: videoTitle,
        id: id
      });
    });
    courseData.videos.push(chapterData);
  });
  return courseData;
}
// Print: write every course, chapter and video to the console.
/**
 * Log each course title, then each chapter title followed by its videos
 * formatted as "【id】title".
 *
 * @param {Array<{title: *, videos: Array<{chapterTitle: *, videos: Array<{id: *, title: *}>}>}>} coursesData
 *   Parsed courses to print.
 */
function printCourseInfo(coursesData) {
  coursesData.forEach((course) => {
    console.log('\n **' + course.title + '**\n ');
    course.videos.forEach((chapter) => {
      console.log(chapter.chapterTitle);
      chapter.videos.forEach((entry) => {
        console.log('【' + entry.id + '】' + entry.title);
      });
    });
  });
}
// Use a Promise to request the HTML of a single URL asynchronously.
/**
 * Fetch one page over HTTP and deliver its body through a promise.
 *
 * @param {string} url - Address of the page to request.
 * @returns {Promise<string>} Resolves with the complete response body once the
 *   response ends; rejects with the error emitted by the request or response.
 */
function asyncAllPages(url) {
  return new Promise(function (resolve, reject) {
    console.log('启动爬虫1号:' + url);
    http.get(url, (res) => {
      // Decode at the stream level so a multi-byte UTF-8 character split
      // across two chunks is not corrupted by per-chunk string conversion.
      res.setEncoding('utf8');
      let html = '';
      res.on('data', (chunk) => {
        html += chunk;
      });
      res.on('end', () => {
        resolve(html);
      });
      res.on('error', reject);
    }).on('error', (err) => {
      // The handler must receive the error as a parameter: referencing an
      // undeclared name here would throw instead of rejecting the promise.
      console.log('获取课程信息出错');
      reject(err);
    });
  });
}
// Start one request per course id and keep the resulting promises, in id order.
var fetchPageArray = videoIds.map(function (id) {
  return asyncAllPages(baseUrl + id);
});
// Wait for every page request. `pages` is the array of values the individual
// requests resolved with (one HTML string per page).
Promise
  .all(fetchPageArray)
  .then(function (pages) {
    var coursesData = pages.map(function (html) {
      return filterChapter(html);
    });
    printCourseInfo(coursesData);
  })
  .catch(function (err) {
    // Without a rejection handler, one failed request or a parsing exception
    // would surface only as an unhandled rejection.
    console.error('Failed to fetch or parse course pages:', err);
  });
3、模拟提交请求
var http=require('http')
var querystring = require('querystring')
// URL-encoded form body of the comment to submit.
var postData = querystring.stringify({
  'content': 'node learning srart now ! let\'s go !',
  'mid': 8837
});
// Request options for the POST, including the headers sent with it.
var options = {
  hostname: 'www.imooc.com',
  port: 80,
  path: '/course/docomment',
  method: 'POST',
  headers: {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    // Content-Length counts bytes, not UTF-16 code units, so measure the
    // encoded body rather than using the string's length.
    'Content-Length': Buffer.byteLength(postData),
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    // NOTE(review): the cookie value is a placeholder here — presumably a real
    // session cookie has to be supplied; confirm before running.
    'Cookie': '...',
    'Host': 'www.imooc.com',
    'Origin': 'http://www.imooc.com',
    'Pragma': 'no-cache',
    'Referer': 'http://www.imooc.com/video/8837',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
  }
};
// Send the POST request and report what comes back.
var req = http.request(options, function (res) {
  console.log('status:' + res.statusCode);
  console.log('headers:' + JSON.stringify(res.headers));
  // Only the type of each chunk is inspected; the body itself is not collected.
  res.on('data', function (chunk) {
    console.log(Buffer.isBuffer(chunk));
    console.log(typeof chunk);
  });
  res.on('end', function () {
    console.log('评论完毕');
  });
  res.on('error', function (e) {
    console.log('Error:' + e.message);
  });
});
// Connection-level failures are emitted on the request object, not on the
// response. Without a listener here they are thrown as an unhandled 'error'
// event and terminate the process.
req.on('error', function (e) {
  console.log('Error:' + e.message);
});
req.write(postData);
req.end();