今天看了一下beautifulsoup库的用法,把昨天用python爬取百度贴吧图片的代码更新成使用beautifulsoup库的函数来实现。用得还是不太熟练,但是感觉比正则表达式写起来容易了一些。
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import urllib
import re
class imgTest:
def __init__(self, baseUrl, seeLZ):
self.baseUrl = baseUrl
self.seeLZ = '?see_lz='+str(seeLZ)
#print to log.txt
def printToLog(self,mystr):
f = open('txt/log.txt', 'a')
f.write(mystr+"\n")
f.close()
#get the html source code
def getPage(self, pageNum):
try:
url = self.baseUrl+self.seeLZ +'&pn='+str(pageNum)
request = urllib2.Request(url)
response = urllib2.urlopen(request)
content = response.read()
return content
except urllib2.URLError, e:
if hasattr(e, "reason"):
print "failed to connect baidutieba.",e.reason
return None
def getPageNum(self):
page = self.getPage(1)
soup = BeautifulSoup(page,'html.parser')
pageNum = soup.find_all("span",class_='red')[1].string
return pageNum
def getTitle(self):
page = self.getPage(1)
soup = BeautifulSoup(page,'html.parser')
return soup.h3.string
def getAllImageURLs(self,pageNum):
page = self.getPage(pageNum)
soup = BeautifulSoup(page,'html.parser')
imgTags = soup.find_all("img",class_="BDE_Image")
imgURLs = []
for item in imgTags:
imgURLs.append(item.get('src'))
print imgURLs
return imgURLs
#save a single img
def saveImg(self,imageURL,filename):
u = urllib.urlopen(imageURL)
data = u.read()
f = open(filename,'wb')
f.write(data)
f.close()
#download images
def saveImgs(self, images, name, num):
number = num
for imageURL in images:
splitPath = imageURL.split('.')
fTail = splitPath.pop()
if len(fTail)>3:
fTail = "jpg"
fileName = name+"/"+str(number)+"."+fTail
self.saveImg(imageURL,fileName)
number += 1
baseURL = 'http://tieba.baidu.com/p/3925387672'
imgtest = imgTest(baseURL,1)
totalnum = int(imgtest.getPageNum())
imageCount = 0
for i in range(1, totalnum+1):
imageURLs = imgtest.getAllImageURLs(i)
imgtest.saveImgs(imageURLs,"pic",imageCount)
imageCount += len(imageURLs)
print imageCount
附上beautifulsoup的文档看看吧,就酱。