Code
# -*- coding:utf8 -*-
'''
sudo -H pip install requests beautifulsoup4 lxml
'''
import os                        # path, makedirs
import requests                  # HTTP requests
import urllib                    # file download (urlretrieve)
from bs4 import BeautifulSoup    # HTML parsing
import re                        # regular expressions

CURR_DIR = os.path.dirname(os.path.abspath(__file__))
FOLDRE = 'xeba'                  # name of the download folder
BASE_URL = ''                    # base URL of the comic site (the address was omitted from the original post)
def downloadUrl(url, path):
    print url, path
    if os.path.exists(path): return   # skip files that were already downloaded
    mkdir(os.path.dirname(path))
    urllib.urlretrieve(url, path)

def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)

def fixUrl(url):
    # prepend the site's base URL to a relative link
    return BASE_URL + url
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
url = ''   # entry page of the comic series (the address was omitted from the original post)

html = requests.get(url, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')
item = soup.find('div', class_='kg')
count = int(item.span.text)
print '%d comic chapters available so far.' % count
title = item.a.get('title')
url = fixUrl(item.a.get('href'))
print title, url
def parsePage(url, subpage=True):
    print 'url=', url
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')
    item = soup.find('li', id='imgshow')
    try:
        title, img_url = item.img.get('alt'), item.img.get('src')
        print title, img_url
        downloadUrl(img_url, os.path.join(CURR_DIR, FOLDRE, os.path.basename(img_url)))
    except Exception as e:
        print 'failed to parse image: %s' % e
    if subpage:
        pagelist = soup.find('ul', class_='pagelist').find_all('li')
        if len(pagelist):
            # use a regular expression to get the number of sub-pages
            pagecount = re.findall(r"\d+", pagelist[0].a.text)[0]
            pagecount = int(pagecount)
            print 'number of sub-pages:', pagecount
            baseurl = url.replace('.html', '')
            for index in xrange(2, pagecount + 1):
                nexturl = '%s_%d.html' % (baseurl, index)
                print nexturl
                parsePage(nexturl, subpage=False)
        # the "next chapter" link is written by JavaScript, so match it in the raw HTML
        url = re.findall(r"var str = \S+<a href='(\S+)'", html.text)[1]
        url = fixUrl(url)
        print 'next page is', url
        return url
    else:
        return ''
# test case
#
#
# count=5
for x in xrange(1, count + 1):
    url = parsePage(url)
Postscript
Because the "next image" link is generated by JavaScript, a regular expression is used to match it against the raw page source.
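A minimal sketch of that regex match, using the same pattern as the script above; the html_text fragment is made up purely for illustration, since the real page address was omitted from the post:

import re

# made-up fragment of the kind of JavaScript the page embeds (illustrative only)
html_text = """
<script>var str = "<a href='/comic/100/2.html'>2</a>";document.write(str);</script>
<script>var str = "<a href='/comic/101.html'>next</a>";document.write(str);</script>
"""

links = re.findall(r"var str = \S+<a href='(\S+)'", html_text)
print links      # ['/comic/100/2.html', '/comic/101.html']
print links[1]   # the script above takes the second match as the next-chapter link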
Some pages have no dynamic image at all, so be careful to handle that exceptional case.
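For completeness, a small sketch of the failure mode that the try/except in parsePage guards against; page_html is a made-up fragment with no <img> tag:

from bs4 import BeautifulSoup

# made-up page fragment without the expected <img> tag (illustrative only)
page_html = "<ul><li id='imgshow'><span>no image on this page</span></li></ul>"
soup = BeautifulSoup(page_html, 'lxml')
item = soup.find('li', id='imgshow')
try:
    title, img_url = item.img.get('alt'), item.img.get('src')
except Exception as e:
    # item.img is None here, so .get() raises AttributeError; the script just logs it and moves on
    print 'failed to parse image: %s' % e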