The idea behind the program is simple: fetch the web page, extract the a tags, and filter out the href attributes.
Method 1:

import re
import urllib2
from BeautifulSoup import BeautifulSoup as BS  # BeautifulSoup 3; with bs4: from bs4 import BeautifulSoup as BS

html = urllib2.urlopen(url).read()
# html = unicode(html, 'gb2312', 'ignore').encode('utf-8', 'ignore')  # decode gb2312 pages if needed
content = BS(html).findAll('a')
myfile = open(localfile, 'w')  # url and localfile are assumed to be defined by the caller
pat = re.compile(r'href="([^"]*)"')
pat2 = re.compile(r'http')
for item in content:
    h = pat.search(str(item))
    if not h:  # skip anchors without an href attribute
        continue
    href = h.group(1)
    if pat2.search(href):
        ans = href  # already an absolute URL
    else:
        ans = url + href  # naively prepend the base URL to a relative link
    myfile.write(ans + '\n')
myfile.close()
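Prepending url to a relative path only works in the simplest cases; it mangles links such as ../foo, or /bar when url itself ends in a path. A safer alternative, shown here as a minimal sketch rather than part of the snippet above, is the standard library's urljoin (url and href are the variables from the loop):

from urlparse import urljoin  # Python 2; in Python 3 this is urllib.parse.urljoin

# urljoin resolves a relative reference against a base URL,
# handling root-relative paths and ../ segments correctly
ans = urljoin(url, href)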
Method 2:

from BeautifulSoup import BeautifulSoup as BS

def extractlinks(html):
    soup = BS(html)
    anchors = soup.findAll('a')
    links = []
    for a in anchors:
        href = a.get('href')  # get() returns None instead of raising KeyError
        if href:
            links.append(href)
    return links
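Since extractlinks takes raw HTML, it pairs directly with urllib2. A minimal usage sketch, assuming the hao123 page from Method 3 as input:

import urllib2

html = urllib2.urlopen('http://www.hao123.com').read()
for link in extractlinks(html):
    print link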
Method 3:

import urllib2
from BeautifulSoup import BeautifulSoup as BS

base_url = "http://www.hao123.com"
html = urllib2.urlopen(base_url).read()
soup = BS(html)
anchors = soup.findAll('a')
links = []
for a in anchors:
    href = a.get('href')
    if href:  # skip anchors without an href attribute
        links.append(href)
for link in links:
    print link
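The hrefs come back exactly as written in the markup, so a portal page like hao123 typically yields a mix of absolute and relative links plus duplicates. A minimal follow-up sketch that resolves and deduplicates them, assuming links and base_url from Method 3 above:

from urlparse import urljoin

seen = set()
for link in links:
    absolute = urljoin(base_url, link)  # resolve relative references against the base
    if absolute not in seen:
        seen.add(absolute)
        print absolute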