I wrote this crawler because I like reading comics, and it let me practice my coding skills. If you see room for improvement, please leave a message in the comment area.

Project address

https://github.com/iicey/pufei

The crawler code:

import os
import re
import threading
import time
from urllib.parse import quote

import execjs
import requests
from lxml import etree
import logging


# Configure the root logger once at import time.
# NOTE: `filemode` is only honoured when a `filename` is supplied; the original
# value 'r' (read-only) would be invalid for a log file, so use 'a' (append),
# matching the commented-out file-logging example below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S', filemode='a')
# To log to a file instead of stderr, add:
# filename=os.path.join(os.getcwd(), 'log', 'mh.log')


class PuFei:
    """Crawler for m.pufei.net: searches for a comic by name and downloads
    every chapter's page images into per-chapter directories under cwd."""

    def __init__(self, keyboard):
        # `keyboard` is the search keyword (the comic title); the odd name is
        # the site's own search-form field name, kept for interface stability.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                                      "Chrome/75.0.3770.80 Safari/537.36"}
        self.search_url = 'http://m.pufei.net/e/search/?'
        self.keyboard = keyboard

    def search(self):
        """Search for the comic, take the first hit, then walk its chapter
        list (oldest first) and download each chapter.

        Returns None; failures are logged rather than raised.
        """
        # The site expects the keyword percent-encoded in GB2312, not UTF-8.
        url = self.search_url + 'searchget=1&tbname=mh&show=title,player,playadmin,bieming,pinyin,playadmin&' \
                                'tempid=4&keyboard=' + quote(self.keyboard, encoding='gb2312')
        result_search = requests.get(url=url, headers=self.headers)
        try:
            tree_search = etree.HTML(result_search.text)
            # Join with NO separator — a space inside the URL would break it.
            mh_url = ''.join(('http://m.pufei.net', tree_search.xpath('//*[@id="detail"]/li[1]/a/@href')[0]))
        except IndexError as e:
            # No search results: dump the page for diagnosis and bail out.
            logging.error(result_search.text)
            logging.error(e)
            return
        result_chapter = requests.get(url=mh_url, headers=self.headers)
        result_chapter.encoding = 'gb2312'
        try:
            tree_chapter = etree.HTML(result_chapter.text)
            # The chapter list is newest-first on the page; reverse both lists
            # so directories are numbered in reading (oldest-first) order.
            hrefs = tree_chapter.xpath('//*[@id="chapterList2"]/ul/li/a/@href')[::-1]
            titles = tree_chapter.xpath('//*[@id="chapterList2"]/ul/li/a/@title')[::-1]
            for index, (href, title) in enumerate(zip(hrefs, titles)):
                chapter_url = ''.join(('http://m.pufei.net', href))
                # Strip characters that are illegal in Windows file names.
                title = re.sub(r'[\\/:*?"<>|]', "", title)
                dir_path = os.path.join(os.getcwd(), self.keyboard, f"{index}_{title}")
                os.makedirs(dir_path, exist_ok=True)
                self.chapter(chapter_url, dir_path)
                logging.info(f"{dir_path}, {chapter_url}")
        except IndexError as e:
            logging.error(result_chapter.text)
            logging.error(e)
            return

    def chapter(self, chapter_url, dir_path):
        """Download every page image of one chapter into dir_path, spawning
        one background thread per image."""
        result_chapter = requests.get(url=chapter_url, headers=self.headers)
        result_chapter.encoding = 'gb2312'
        # The chapter page embeds its image list as an obfuscated cp="..." blob.
        cp = re.findall(r'cp="\w+.*"', result_chapter.text)[0][4:-1]
        for index, value in enumerate(self.page_url_list(cp)):
            page_url = "http://res.img.pufei.net/" + value
            file_path = os.path.join(dir_path, f"{index}.jpg")
            # Skip images already downloaded on a previous run (resume support).
            if not os.path.exists(file_path):
                threading.Thread(target=self.page, args=(page_url, file_path)).start()

    def page(self, page_url, file_path):
        """Fetch a single image and write it to file_path.

        Runs in a worker thread, so any failure is logged instead of raised —
        an unhandled exception here would die silently anyway.
        """
        try:
            result_page = requests.get(url=page_url, headers=self.headers)
            with open(file_path, 'wb') as fp:
                fp.write(result_page.content)
            logging.info(file_path)
        except Exception as e:
            logging.error(f"{page_url} - {e}")

    @staticmethod
    def page_url_list(cp):
        """Decode the obfuscated ``cp`` blob into the list of image paths.

        The site base64-encodes a JavaScript snippet which, when eval'd,
        yields the image URL list; we replay the site's own decoder with
        execjs.  NOTE(review): this eval's remote content inside the JS
        runtime — it is only as trustworthy as the site itself.
        """
        js_code = '''
        function base64decode(str) {
            var base64DecodeChars = new Array(
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
                52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
                -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
                15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
                -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
                41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1);
            var c1, c2, c3, c4;
            var i = 0, len = str.length, out = "";
            while (i < len) {
                do { c1 = base64DecodeChars[str.charCodeAt(i++) & 255]; } while (i < len && c1 == -1);
                if (c1 == -1) break;
                do { c2 = base64DecodeChars[str.charCodeAt(i++) & 255]; } while (i < len && c2 == -1);
                if (c2 == -1) break;
                out += String.fromCharCode((c1 << 2) | ((c2 & 48) >> 4));
                do {
                    c3 = str.charCodeAt(i++) & 255;
                    if (c3 == 61) return out;
                    c3 = base64DecodeChars[c3];
                } while (i < len && c3 == -1);
                if (c3 == -1) break;
                out += String.fromCharCode(((c2 & 15) << 4) | ((c3 & 60) >> 2));
                do {
                    c4 = str.charCodeAt(i++) & 255;
                    if (c4 == 61) return out;
                    c4 = base64DecodeChars[c4];
                } while (i < len && c4 == -1);
                if (c4 == -1) break;
                out += String.fromCharCode(((c3 & 3) << 6) | c4);
            }
            return out;
        }
        function geturl(cp) {
            return eval(eval(base64decode(cp).slice(4)));
        }
        '''
        return execjs.compile(js_code).call('geturl', cp)


if __name__ == '__main__':
    # Prompt for a comic title and kick off the crawl.
    comic_name = input('Please enter comic name:')
    crawler = PuFei(comic_name)
    crawler.search()
