搜索
您的当前位置:首页正文

[python 爬虫04] 邪恶动态图

来源:二三娱乐

代码

# -*- coding:utf8 -*-

'''


sudo -H pip install requests beautifulsoup4 lxml
'''

import os               # path handling, makedirs
import re               # regular expressions (page-count / next-link scraping)
import urllib           # kept from the original Python 2 version
import urllib.request   # urlretrieve lives here on Python 3

import requests         # HTTP requests
from bs4 import BeautifulSoup  # HTML parsing

# Absolute directory containing this script; downloads are stored beneath it.
CURR_DIR = os.path.dirname(os.path.abspath(__file__))
# Sub-folder name for the downloaded images.
# NOTE(review): "FOLDRE" looks like a typo for "FOLDER", but it is used
# consistently below (see downloadUrl call in parsePage), so it is kept as-is.
FOLDRE = 'xeba'

def downloadUrl(url, path):
    """Download *url* to the local file *path*.

    Skips the download entirely if *path* already exists, so the crawl can
    be re-run without re-fetching images.  Parent directories are created
    on demand.
    """
    print(url, path)
    if os.path.exists(path):
        return
    # exist_ok avoids the check-then-create race of a separate mkdir helper.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # Python 3 moved urlretrieve into urllib.request.
    urllib.request.urlretrieve(url, path)

def mkdir(path):
    """Create *path* (including parents) if it does not already exist.

    Uses ``exist_ok=True`` instead of the original exists-then-create
    check, which was race-prone between the test and the makedirs call.
    """
    os.makedirs(path, exist_ok=True)

def fixUrl(url, base=''):
    """Turn a site-relative href into an absolute URL by prefixing *base*.

    NOTE(review): the original line read ``return  + url`` — the site's
    base-URL string literal was stripped when this post was scraped, which
    is a syntax error.  Until the real constant is recovered, pass the
    site root via *base* (default '' keeps the call sites syntactically
    valid).
    """
    return base + url

# Pretend to be a desktop browser; some sites block default library UAs.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}

# TODO(review): the start-URL string literal was lost when this post was
# scraped (the line read ``url = ``, a syntax error).  Fill in the comic
# index page of the target site before running.
url = ''
html = requests.get(url, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')
# The div with class 'kg' holds the episode counter and the latest episode link.
item = soup.find('div', class_='kg')
count = int(item.span.text)
print('已有%d集漫画.' % count)


title = item.a.get('title')
url = fixUrl(item.a.get('href'))
print(title, url)

def parsePage(url, subpage=True):
    """Scrape one comic page: download its animated image and, for a
    top-level page (``subpage=True``), recurse into its numbered
    sub-pages.

    Returns the URL of the next comic for a top-level page, '' for a
    sub-page.
    """
    print('url=', url)
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')
    item = soup.find('li', id='imgshow')
    try:
        title, img_url = item.img.get('alt'), item.img.get('src')
        print(title, img_url)
        downloadUrl(img_url, os.path.join(CURR_DIR, FOLDRE, os.path.basename(img_url)))
    except AttributeError as e:
        # Some pages carry no animated image at all, so item (or item.img)
        # is None; the old ``except Exception,e`` + ``e.message`` was both
        # Python-2-only and over-broad.
        print('解析图片失败 %s' % e)

    if not subpage:
        return ''

    pagelist = soup.find('ul', class_='pagelist').find_all('li')
    if pagelist:
        # First <li> text contains the total page count, e.g. "共5页".
        pagecount = int(re.findall(r"\d+", pagelist[0].a.text)[0])
        print('子页面数量:', pagecount)
        # Sub-pages follow the pattern <base>_2.html, <base>_3.html, ...
        baseurl = url.replace('.html', '')
        for index in range(2, pagecount + 1):
            nexturl = '%s_%d.html' % (baseurl, index)
            # (The original did ``pagelist += nexturl`` here, which extended
            # the list with the individual *characters* of the URL and was
            # never read again — removed.)
            print(nexturl)
            parsePage(nexturl, subpage=False)
    # The "next comic" link is emitted by inline JS, so scrape it with a
    # regex.  NOTE(review): index [1] takes the *second* match — presumably
    # [0] is the "previous" link; confirm against the live markup.
    url = re.findall(r"var str = \S+<a href='(\S+)'", html.text)[1]
    url = fixUrl(url)
    print('下一页是', url)
    return url


# Walk the "next comic" chain starting from the latest episode, once per
# known episode.
# count = 5  # uncomment to limit the crawl while testing
for _ in range(1, count + 1):
    url = parsePage(url)

后记

因为「下一张」的链接是用 js 生成的,所以使用正则表达式进行匹配。
有些页面是没有动态图片的,注意处理异常情况。

Top