再也不用花钱买漫画!Python爬取某漫画的脚本及源码_Python

一、工具

python3
第三方类库requests
python3-pyqt5(gui依赖，不用gui可不装)

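ubuntu系列系统可使用以下命令安装上述依赖（python3、requests；python3-pyqt5 仅在使用gui时需要）：

sudo apt-get install python3 python3-pip python3-pyqt5
sudo pip3 install requests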

URL格式: 漫画首页的URL，如http://m.ac.qq.com/Comic/view/id/518333(移动版) 或 http://ac.qq.com/Comic/comicInfo/id/17114, http://ac.qq.com/naruto(PC版)

注意: 火影忍者彩漫需要访问m.ac.qq.com搜索火影忍者，因为pc端页面火影忍者彩漫和黑白漫画是一个id一个url。

二、命令行帮助

				?

									usage: getcomic.py [-h] [-u url] [-p path] [-d] [-l list]

									*下载腾讯漫画，仅供学习交流，请勿用于非法用途*

									空参运行进入交互式模式运行。

									optional arguments:

									  -h, --help            show this help message and exit

									  -u url, --url url     要下载的漫画的首页，可以下载以下类型的url: 

									                        http://ac.qq.com/comic/comicinfo/id/511915

									                        http://m.ac.qq.com/comic/comicinfo/id/505430

									                        http://pad.ac.qq.com/comic/comicinfo/id/505430

									                        http://ac.qq.com/naruto

									  -p path, --path path  漫画下载路径。 默认: /home/fengyu/tencent_comic

									  -d, --dir             将所有图片下载到一个目录(适合腾讯漫画等软件连看使用)

									  -l list, --list list  要下载的漫画章节列表，不指定则下载所有章节。格式范例: 

									                        n - 下载具体某一章节，如-l 1, 下载第1章

									                        n,n... - 下载某几个不连续的章节，如 "-l 1,3,5", 下载1,3,5章

									                        n-n... - 下载某一段连续的章节，如 "-l 10-50", 下载[10,50]章

									                        杂合型 - 结合上面所有的规则，如 "-l 1,3,5-7,11-111"

三、gui预览效果

支持不连续的章节选择下载

windows预览效果：

（此处原为 windows 下的界面截图）

deepin/linux 预览效果：

（此处原为 deepin/linux 下的界面截图）

四、全部源码

				?

									import requests

									import re

									import json

									import os

									import argparse

									# One HTTP session shared by every request in this module, so the headers
									# set here (and the ones getcontent() adds later) apply to all calls.
									requestsession = requests.session()

									# iPad user agent sent with every request. The literal is assembled from
									# adjacent strings rather than a backslash continuation inside the quotes,
									# so no source indentation leaks into the header value.
									# NOTE(review): the text is all lowercase in this copy of the script;
									# confirm the intended casing of the UA string against the upstream source.
									ua = ('mozilla/5.0 (ipad; cpu os 5_1 like mac os x; en-us) '
									      'applewebkit/534.46 (khtml, like gecko) version/5.1 '
									      'mobile/9b176 safari/7534.48.3')  # ipad ua

									requestsession.headers.update({'user-agent': ua})

									class errorcode(Exception):
									    '''Exception that carries a numeric error code.

									    Codes raised in this module:
									        1: the URL is not in a supported format
									        2: the URL could not be redirected to a mobile-site URL
									        3: the download was interrupted
									    '''

									    def __init__(self, code):
									        # Forward the code to Exception so that e.args is populated as well.
									        super().__init__(code)
									        self.code = code

									    def __str__(self):
									        return repr(self.code)

									def islegelurl(url):
									    '''Return True when *url* has one of the supported comic home-page shapes.

									    Supported shapes (matched case-insensitively, optional trailing slash):
									        http://ac.qq.com/comic/comicinfo/id/<digits>
									        http://m.ac.qq.com/comic/comicinfo/id/<digits>
									        http://pad.ac.qq.com/comic/comicinfo/id/<digits>
									        http://ac.qq.com/<word>

									    Matching ignores case so that both the lowercase form and the
									    mixed-case form (.../Comic/comicInfo/id/...) are accepted.
									    '''
									    legal_url_list = (
									        r'^http://ac\.qq\.com/comic/comicinfo/id/\d+/?$',
									        r'^http://m\.ac\.qq\.com/comic/comicinfo/id/\d+/?$',
									        r'^http://ac\.qq\.com/\w+/?$',
									        r'^http://pad\.ac\.qq\.com/comic/comicinfo/id/\d+/?$',
									    )
									    return any(re.match(legal_url, url, re.IGNORECASE)
									               for legal_url in legal_url_list)

									def getid(url):
									    '''Return the numeric comic id, as a string, for a comic home-page URL.

									    The id is read from the trailing digits of *url*. When the URL has no
									    trailing digits, the URL is requested and the id is read from the final
									    URL of that response instead.

									    Raises:
									        errorcode: code 1 when *url* is rejected by islegelurl(); code 2 when
									            the final URL of the response is not a supported URL or still
									            carries no id.
									    '''
									    if not islegelurl(url):
									        print('请输入正确的url！具体支持的url请在命令行输入-h|--help参数查看帮助文档。')
									        raise errorcode(1)

									    # islegelurl() accepts an optional trailing slash, so tolerate it here
									    # too; otherwise ".../id/123/" would never yield its id.
									    idre = re.compile(r'(\d+)/?$')
									    found = idre.search(url)
									    if not found:
									        # No id in the URL itself: request it and inspect the URL we end up at.
									        url = requestsession.get(url).url
									        found = idre.search(url)
									        if not islegelurl(url) or not found:
									            print('无法自动跳转移动端url，请进入http://m.ac.qq.com，找到'
									                  '该漫画地址。\n'
									                  '地址应该像这样: '
									                  'http://m.ac.qq.com/comic/comicinfo/id/xxxxx (xxxxx为整数)')
									            raise errorcode(2)

									    return found.group(1)

									def getcontent(id):
									    '''Fetch a comic's details and its chapter list ordered by sequence number.

									    Two JSON endpoints are requested through the shared session: one for the
									    comic details and one for the chapter list. The cookie and referer
									    headers are set on the session first.

									    Returns:
									        A tuple (title, brief introduction, chapter count, chapters), where
									        chapters is a list of single-entry dicts {key: chapter info} in
									        ascending 'seq' order.
									    '''
									    # NOTE(review): the endpoint paths and JSON keys below are all lowercase
									    # in this copy of the script; confirm them against the live service.
									    requestsession.headers.update({
									        'cookie': 'ac_refer=http://pad.ac.qq.com',
									        'referer': 'http://pad.ac.qq.com',
									    })

									    infourl = 'http://pad.ac.qq.com/getdata/getcomicinfo?id={}'.format(id)
									    info = json.loads(requestsession.get(infourl).text)
									    title = info['title']
									    intro = info['brief_intrd']

									    listurl = 'http://pad.ac.qq.com/getdata/getchapterlist?id={}'.format(id)
									    chapters = json.loads(requestsession.get(listurl).text)
									    total = chapters['length']

									    # For each sequence number keep the first dict-valued entry whose 'seq'
									    # matches; non-dict entries (such as 'length') are skipped.
									    ordered = []
									    for seq in range(total + 1):
									        for key, value in chapters.items():
									            if isinstance(value, dict) and value.get('seq') == seq:
									                ordered.append({key: value})
									                break

									    return (title, intro, total, ordered)

									def getimglist(contentjson, id):
									    '''Build the list of image download URLs for one chapter.

									    Args:
									        contentjson: single-entry dict whose first key is the chapter id.
									        id: the comic id; converted with int() when building each URL.

									    Returns:
									        A list of image URLs ordered by each picture's 'seq' value.
									    '''
									    # NOTE(review): the endpoint path and JSON keys below are all lowercase
									    # in this copy of the script; confirm them against the live service.
									    chapterid = list(contentjson)[0]
									    hashurl = 'http://pad.ac.qq.com/view/mgetpichash?id={}&cid={}'.format(id, chapterid)
									    picjson = json.loads(requestsession.get(hashurl).text)
									    total = picjson['pcount']    # number of pictures in the chapter
									    phash = picjson['phash']

									    # Order the picture records by 'seq' (1..total), first match wins.
									    ordered = []
									    for seq in range(1, total + 1):
									        for key in phash:
									            if phash[key]['seq'] == seq:
									                ordered.append(phash[key])
									                break

									    imglist = []
									    for record in ordered:
									        cid = record['cid']
									        pid = record['pid']
									        comicid = int(id)
									        uin = max(comicid + cid + pid, 10001)
									        # Directory path is built from pieces of the comic id plus the chapter id.
									        dirparts = [comicid % 1000 // 100, comicid % 100, comicid, cid]
									        dirpath = '/mif800/' + '/'.join(map(str, dirparts)) + '/'
									        filename = str(pid) + '.mif2'
									        imglist.append(
									            "http://ac.tc.qq.com/store_file_download?buid=15017&uin=" + str(uin)
									            + "&dir_path=" + dirpath + "&name=" + filename
									        )

									    return imglist

									def downloadimg(imgurllist, contentpath, one_folder=False):
									    '''Download the images of one chapter to disk.

									    Images are saved as 001.jpg, 002.jpg, ... in list order. A target file
									    that already exists is skipped without any request being made.

									    Args:
									        imgurllist: image URLs in page order.
									        contentpath: the chapter directory; when *one_folder* is true it is
									            used as a file-name prefix instead (contentpath + '001.jpg').
									        one_folder: whether all chapters share a single directory.

									    Raises:
									        errorcode: code 3 when the download is interrupted by
									            KeyboardInterrupt or SystemExit.
									    '''
									    count = len(imgurllist)
									    print('该集漫画共计{}张图片'.format(count))

									    for i, imgurl in enumerate(imgurllist, start=1):
									        print('\r正在下载第{}张图片...'.format(i), end='')
									        filename = '{0:0>3}.jpg'.format(i)
									        if not one_folder:
									            imgpath = os.path.join(contentpath, filename)
									        else:
									            imgpath = contentpath + filename

									        # Skip the download when the target file already exists.
									        if os.path.isfile(imgpath):
									            continue

									        try:
									            downloadrequest = requestsession.get(imgurl, stream=True)
									            with open(imgpath, 'wb') as f:
									                for chunk in downloadrequest.iter_content(chunk_size=1024):
									                    if chunk:  # filter out keep-alive new chunks
									                        f.write(chunk)
									        except (KeyboardInterrupt, SystemExit):
									            print('\n\n中断下载，删除未下载完的文件！')
									            if os.path.isfile(imgpath):
									                os.remove(imgpath)
									            raise errorcode(3)
									        except Exception:
									            # A half-written file would be mistaken for a finished one by
									            # the "already exists" check on the next run, so remove it
									            # before letting the error propagate.
									            if os.path.isfile(imgpath):
									                os.remove(imgpath)
									            raise

									    print('完毕!\n')

									class listformaterror(ValueError):
									    '''Raised by parselist() when the chapter list string is malformed.'''


									def parselist(lst):
									    '''Parse the -l|--list argument into a sorted list of chapter numbers.

									    *lst* is a comma-separated list of numbers and dash-separated ranges,
									    e.g. "1,3,5-7,11-111". A range may be written in either direction.
									    Chapter 0 does not exist: it is dropped with a warning.

									    Returns:
									        The chapter numbers in ascending order, without duplicates.

									    Raises:
									        listformaterror: when *lst* does not match the expected format.
									    '''
									    pattern = r'^\d+([,-]\d+)*$'
									    if not re.match(pattern, lst):
									        raise listformaterror(lst + ' 不匹配正则: ' + pattern)

									    # Split on commas first, then split each piece on dashes. A plain number
									    # is handled as a range whose two ends coincide.
									    chapters = set()
									    for sub in lst.split(','):
									        bounds = [int(part) for part in sub.split('-')]
									        low, high = min(bounds), max(bounds)
									        if low == 0:
									            low = 1  # ignore chapter 0
									            print('警告: 参数中包括不存在的章节0，自动忽略')
									        chapters.update(range(low, high + 1))

									    return sorted(chapters)

									def main(url, path, lst=None, one_folder=False):
									    '''Download a comic: print its details, then fetch the chosen chapters.

									    Args:
									        url: home page of the comic to download.
									        path: download directory; the comic gets a sub-directory named
									            after its title.
									        lst: chapter list string (the value of -l|--list); all chapters are
									            downloaded when it is empty or None.
									        one_folder: store every image directly in the comic directory
									            instead of one directory per chapter.

									    An errorcode raised along the way ends the program with that code as
									    the exit status.
									    '''
									    try:
									        if not os.path.isdir(path):
									            os.makedirs(path)

									        comicid = getid(url)
									        comicname, comicintrd, count, contentlist = getcontent(comicid)
									        contentnamelist = [item[k]['t'] for item in contentlist for k in item]

									        print('漫画名: {}'.format(comicname))
									        print('简介: {}'.format(comicintrd))
									        print('章节数: {}'.format(count))
									        print('章节列表:')
									        try:
									            print('\n'.join(contentnamelist))
									        except Exception:
									            # Best effort: keep going when the names cannot be printed.
									            print('章节列表包含无法解析的特殊字符\n')

									        # Characters that are illegal in Windows file names: \ / : * ? " < > |
									        forbiddenre = re.compile(r'[\\/":*?<>|]')
									        comicname = forbiddenre.sub('_', comicname)
									        comicpath = os.path.join(path, comicname)
									        if not os.path.isdir(comicpath):
									            os.makedirs(comicpath)
									        print()

									        if not lst:
									            contentrange = range(1, len(contentlist) + 1)
									        else:
									            contentrange = parselist(lst)

									        for i in contentrange:
									            # Chapter numbers arrive in ascending order, so the first one
									            # past the end means every remaining one is too.
									            if i > len(contentlist):
									                print('警告: 章节总数 {} ,'
									                      '参数中包含过大数值,'
									                      '自动忽略'.format(len(contentlist)))
									                break

									            chaptername = forbiddenre.sub('_', contentnamelist[i - 1])
									            contentpath = os.path.join(comicpath, '第{0:0>4}话-{1}'.format(i, chaptername))

									            try:
									                print('正在下载第{0:0>4}话: {1}'.format(i, chaptername))
									            except Exception:
									                # Fallback without the chapter name. The format string must
									                # not reference {1} here, or format() itself would raise.
									                print('正在下载第{0:0>4}话'.format(i))

									            if not one_folder and not os.path.isdir(contentpath):
									                os.mkdir(contentpath)

									            imglist = getimglist(contentlist[i - 1], comicid)
									            downloadimg(imglist, contentpath, one_folder)

									    except errorcode as e:
									        # Equivalent to exit(e.code) without depending on the site module.
									        raise SystemExit(e.code)

									if __name__ == '__main__':
									    # Command-line entry point: parse the options, ask for the URL and the
									    # path interactively when no URL was given, then run main().
									    defaultpath = os.path.join(os.path.expanduser('~'), 'tencent_comic')

									    parser = argparse.ArgumentParser(
									        formatter_class=argparse.RawTextHelpFormatter,
									        description='*下载腾讯漫画，仅供学习交流，请勿用于非法用途*\n'
									                    '空参运行进入交互式模式运行。')
									    parser.add_argument('-u', '--url',
									                        help='要下载的漫画的首页，可以下载以下类型的url: \n'
									                             'http://ac.qq.com/comic/comicinfo/id/511915\n'
									                             'http://m.ac.qq.com/comic/comicinfo/id/505430\n'
									                             'http://pad.ac.qq.com/comic/comicinfo/id/505430\n'
									                             'http://ac.qq.com/naruto')
									    parser.add_argument('-p', '--path',
									                        help='漫画下载路径。 默认: {}'.format(defaultpath),
									                        default=defaultpath)
									    parser.add_argument('-d', '--dir', action='store_true',
									                        help='将所有图片下载到一个目录(适合腾讯漫画等软件连看使用)')
									    parser.add_argument('-l', '--list',
									                        help=("要下载的漫画章节列表，不指定则下载所有章节。格式范例: \n"
									                              "n - 下载具体某一章节，如-l 1, 下载第1章\n"
									                              'n,n... - 下载某几个不连续的章节，如 "-l 1,3,5", 下载1,3,5章\n'
									                              'n-n... - 下载某一段连续的章节，如 "-l 10-50", 下载[10,50]章\n'
									                              '杂合型 - 结合上面所有的规则，如 "-l 1,3,5-7,11-111"'))

									    args = parser.parse_args()
									    url = args.url
									    path = args.path
									    lst = args.list
									    one_folder = args.dir

									    # Reject a malformed chapter list before any network access happens.
									    if lst:
									        legallistre = re.compile(r'^\d+([,-]\d+)*$')
									        if not legallistre.match(lst):
									            print('list参数不合法，请参考--help键入合法参数！')
									            raise SystemExit(1)

									    # Interactive mode: no URL on the command line.
									    if not url:
									        url = input('请输入漫画首页地址: ')
									        path = input('请输入漫画保存路径(默认: {}): '.format(defaultpath))
									        if not path:
									            path = defaultpath

									    main(url, path, lst, one_folder)

五、下载源码

				?

									import os
									import re
									import sys

									from PyQt5.QtCore import *
									from PyQt5.QtWidgets import *
									from PyQt5.QtGui import *

									import getcomic

									class tencentcomicdownloader(QWidget):
									    '''Main window of the GUI downloader.

									    The user enters a comic home-page URL and clicks the analyse button to
									    load the comic's title, introduction and chapter list; the selected
									    chapters are then downloaded by a background downloader thread.
									    '''

									    def __init__(self, parent=None):
									        super(tencentcomicdownloader, self).__init__(parent)

									        # Row 0: comic URL input and the analyse button.
									        namelabel = QLabel("漫画首页:")
									        self.nameline = QLineEdit()
									        self.analysisbutton = QPushButton("分析")
									        self.analysisbutton.clicked.connect(self.anaysisurl)
									        # Pressing Enter in the URL field acts like clicking the button.
									        self.nameline.returnPressed.connect(self.analysisbutton.click)

									        # Row 1: download directory and the browse button.
									        pathlinelabel = QLabel("下载路径:")
									        self.pathline = QLineEdit()
									        defaultpath = os.path.join(os.path.expanduser('~'), 'tencent_comic')
									        self.pathline.setText(defaultpath)
									        self.browsebutton = QPushButton("浏览")
									        self.browsebutton.clicked.connect(self.getpath)

									        # Row 2: comic title and the single-directory option.
									        comicnamelabel = QLabel("漫画名: ")
									        self.comicnamelabel = QLabel("暂无")
									        self.one_folder_checkbox = QCheckBox("单目录")

									        # Row 3: comic introduction.
									        comicintrolabel = QLabel("简介: ")
									        self.comicintro = QLabel("暂无")
									        self.comicintro.setWordWrap(True)

									        # Row 4: chapter list, multi-selectable, disabled until analysed.
									        chaptergroupbox = QGroupBox("章节列表:")
									        self.chapterlistview = QListWidget(chaptergroupbox)
									        self.chapterlistview.setSelectionMode(QAbstractItemView.ExtendedSelection)
									        self.chapterlistview.setEnabled(False)
									        groupboxlayout = QHBoxLayout(chaptergroupbox)
									        groupboxlayout.addWidget(self.chapterlistview)

									        # Row 5: status text and the download button.
									        self.downloadbutton = QPushButton("下载选中")
									        self.statuslabel = QLabel("输入要下载的漫画的首页，然后点分析")
									        self.statuslabel.setWordWrap(True)
									        self.downloadbutton.setEnabled(False)
									        self.downloadbutton.clicked.connect(self.download)

									        mainlayout = QGridLayout()
									        mainlayout.addWidget(namelabel, 0, 0)
									        mainlayout.addWidget(self.nameline, 0, 1)
									        mainlayout.addWidget(self.analysisbutton, 0, 2)
									        mainlayout.addWidget(pathlinelabel, 1, 0)
									        mainlayout.addWidget(self.pathline, 1, 1)
									        mainlayout.addWidget(self.browsebutton, 1, 2)
									        mainlayout.addWidget(comicnamelabel, 2, 0)
									        mainlayout.addWidget(self.comicnamelabel, 2, 1, 1, 2)
									        mainlayout.addWidget(self.one_folder_checkbox, 2, 2)
									        mainlayout.addWidget(comicintrolabel, 3, 0)
									        mainlayout.addWidget(self.comicintro, 3, 1, 1, 2)
									        mainlayout.addWidget(chaptergroupbox, 4, 0, 1, 3)
									        mainlayout.addWidget(self.downloadbutton, 5, 2)
									        mainlayout.addWidget(self.statuslabel, 5, 0, 1, 2)

									        self.setLayout(mainlayout)
									        self.setWindowTitle("腾讯漫画下载")
									        self.setGeometry(400, 300, 800, 500)

									    def setstatus(self, status):
									        '''Show *status* in the status label.'''
									        self.statuslabel.setText(status)

									    def enablewidget(self, enable):
									        '''Enable or disable every input widget at once.

									        When enabling, the download button text is reset and the chapter
									        list receives the focus.
									        '''
									        widgets_list = [
									            self.downloadbutton,
									            self.nameline,
									            self.pathline,
									            self.chapterlistview,
									            self.analysisbutton,
									            self.browsebutton,
									            self.one_folder_checkbox,
									        ]
									        for widget in widgets_list:
									            widget.setEnabled(enable)

									        if enable:
									            self.downloadbutton.setText('下载选中')
									            self.chapterlistview.setFocus()

									    def getpath(self):
									        '''Let the user pick the download directory with a dialog.'''
									        path = str(QFileDialog.getExistingDirectory(self, "选择下载目录"))
									        if path:
									            self.pathline.setText(path)

									    def anaysisurl(self):
									        '''Load the comic behind the entered URL and fill the chapter list.

									        Errors are reported in the status label.
									        '''
									        url = self.nameline.text()

									        # Reset everything a previous analysis may have filled in.
									        self.downloadbutton.setEnabled(False)
									        self.comicnamelabel.setText("暂无")
									        self.comicintro.setText("暂无")
									        self.chapterlistview.clear()
									        self.chapterlistview.setEnabled(False)

									        try:
									            if not getcomic.islegelurl(url):
									                self.statuslabel.setText('<font color="red">错误的url格式！请输入正确的漫画首页地址！</font>')
									                return

									            self.id = getcomic.getid(url)
									            self.comicname, self.comicintrd, self.count, self.contentlist = getcomic.getcontent(self.id)

									            self.contentnamelist = []
									            for item in self.contentlist:
									                for k in item:
									                    self.contentnamelist.append(item[k]['t'])

									            self.comicnamelabel.setText(self.comicname)
									            self.comicintro.setText(self.comicintrd)
									            self.chapterlistview.setEnabled(True)
									            self.downloadbutton.setEnabled(True)
									            self.chapterlistview.setFocus()
									            self.statuslabel.setText('选择要下载的章节后点击右侧按钮')

									            # Every chapter starts out selected.
									            for i, name in enumerate(self.contentnamelist):
									                self.chapterlistview.addItem('第{0:0>4}话-{1}'.format(i + 1, name))
									                self.chapterlistview.item(i).setSelected(True)

									            self.downloadbutton.setEnabled(True)

									        except getcomic.errorcode as e:
									            if e.code == 2:
									                self.statuslabel.setText('<font color="red">无法跳转为移动端url,请进入http://m.ac.qq.com找到该漫画地址</font>')
									        except KeyError:
									            self.statuslabel.setText('<font color="red">不存在的地址</font>')
									        except Exception as e:
									            # Network or JSON failures must not escape from a Qt slot;
									            # report them to the user instead.
									            self.statuslabel.setText('<font color="red">{}</font>'.format(e))

									    def download(self):
									        '''Start a background thread that downloads the selected chapters.'''
									        self.downloadbutton.setText("下载中...")
									        one_folder = self.one_folder_checkbox.isChecked()
									        self.enablewidget(False)

									        # Sorted so that chapters are downloaded in ascending order
									        # regardless of the order in which they were selected.
									        selectedchapterlist = sorted(item.row() for item in self.chapterlistview.selectedIndexes())

									        path = self.pathline.text()
									        # Characters that are illegal in Windows file names: \ / : * ? " < > |
									        forbiddenre = re.compile(r'[\\/":*?<>|]')
									        comicname = re.sub(forbiddenre, '_', self.comicname)
									        comicpath = os.path.join(path, comicname)

									        try:
									            if not os.path.isdir(comicpath):
									                os.makedirs(comicpath)
									        except OSError as e:
									            # Without this the widgets would stay disabled for good.
									            self.statuslabel.setText('<font color="red">{}</font>'.format(e))
									            self.enablewidget(True)
									            return

									        self.downloadthread = downloader(selectedchapterlist, comicpath, self.contentlist, self.contentnamelist, self.id, one_folder)
									        self.downloadthread.output.connect(self.setstatus)
									        self.downloadthread.finished.connect(lambda: self.enablewidget(True))
									        self.downloadthread.start()

									class downloader(QThread):
									    '''Background thread that downloads the selected chapters one by one.

									    Signals:
									        output: carries a status message for the user interface.
									        finished: emitted once when run() ends, whether or not it failed.
									    '''

									    output = pyqtSignal(['QString'])
									    finished = pyqtSignal()

									    def __init__(self, selectedchapterlist, comicpath, contentlist, contentnamelist, id, one_folder=False, parent=None):
									        super(downloader, self).__init__(parent)

									        self.selectedchapterlist = selectedchapterlist  # zero-based chapter indexes
									        self.comicpath = comicpath
									        self.contentlist = contentlist
									        self.contentnamelist = contentnamelist
									        self.id = id
									        self.one_folder = one_folder

									    def run(self):
									        '''Download every selected chapter, reporting progress via output.'''
									        # Characters that are illegal in Windows file names: \ / : * ? " < > |
									        forbiddenre = re.compile(r'[\\/":*?<>|]')

									        try:
									            for i in self.selectedchapterlist:
									                outputstring = '正在下载第{0:0>4}话: {1}...'.format(i + 1, self.contentnamelist[i])
									                print(outputstring)
									                self.output.emit(outputstring)

									                # Sanitise into a local name: the list is shared with the
									                # window that created this thread and is left untouched.
									                chaptername = forbiddenre.sub('_', self.contentnamelist[i])
									                contentpath = os.path.join(self.comicpath, '第{0:0>4}话-{1}'.format(i + 1, chaptername))

									                if not self.one_folder and not os.path.isdir(contentpath):
									                    os.mkdir(contentpath)

									                imglist = getcomic.getimglist(self.contentlist[i], self.id)
									                getcomic.downloadimg(imglist, contentpath, self.one_folder)

									                self.output.emit('完毕!')

									        except Exception as e:
									            import traceback

									            self.output.emit('<font color="red">{}</font>\n'
									                             '遇到异常!请尝试重新点击下载按钮重试'.format(e))
									            # Print the traceback rather than re-raising: PyQt5 5.5 and
									            # later treat an unhandled exception in a Qt virtual as fatal.
									            traceback.print_exc()

									        finally:
									            self.finished.emit()

									if __name__ == '__main__':
									    # GUI entry point: show the main window and run the Qt event loop.
									    app = QApplication(sys.argv)

									    main = tencentcomicdownloader()
									    main.show()

									    # Hand the event loop's return value back as the process exit status.
									    sys.exit(app.exec_())