I wrote this crawler because I like reading comics, and it let me practice my coding skills. If you see room for improvement, please leave a message in the comment area.

Project address

https://github.com/iicey/pufei

The crawler code:

import os
import re
import threading
import time
from urllib.parse import quote

import execjs
import requests
from lxml import etree
import logging


# Configure the root logger once at import time.
# NOTE: `filemode` is only honoured when a `filename` is supplied; the original
# value 'r' (read-only) would be invalid for a log file, so use 'a' (append),
# matching the commented-out file-logging example below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S', filemode='a')
# To log to a file instead of stderr, add:
# filename=os.path.join(os.getcwd(), 'log', 'mh.log')


class PuFei:
    """Crawler for m.pufei.net: searches for a comic by name and downloads
    every chapter's page images into per-chapter directories under cwd."""

    def __init__(self, keyboard):
        # `keyboard` is the search keyword (the comic title); the odd name is
        # the site's own search-form field name, kept for interface stability.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                                      "Chrome/75.0.3770.80 Safari/537.36"}
        self.search_url = 'http://m.pufei.net/e/search/?'
        self.keyboard = keyboard

    def search(self):
        """Search for the comic, take the first hit, then walk its chapter
        list (oldest first) and download each chapter.

        Returns None; failures are logged rather than raised.
        """
        # The site expects the keyword percent-encoded in GB2312, not UTF-8.
        url = self.search_url + 'searchget=1&tbname=mh&show=title,player,playadmin,bieming,pinyin,playadmin&' \
                                'tempid=4&keyboard=' + quote(self.keyboard, encoding='gb2312')
        result_search = requests.get(url=url, headers=self.headers)
        try:
            tree_search = etree.HTML(result_search.text)
            # Join with NO separator — a space inside the URL would break it.
            mh_url = ''.join(('http://m.pufei.net', tree_search.xpath('//*[@id="detail"]/li[1]/a/@href')[0]))
        except IndexError as e:
            # No search results: dump the page for diagnosis and bail out.
            logging.error(result_search.text)
            logging.error(e)
            return
        result_chapter = requests.get(url=mh_url, headers=self.headers)
        result_chapter.encoding = 'gb2312'
        try:
            tree_chapter = etree.HTML(result_chapter.text)
            # The chapter list is newest-first on the page; reverse both lists
            # so directories are numbered in reading (oldest-first) order.
            hrefs = tree_chapter.xpath('//*[@id="chapterList2"]/ul/li/a/@href')[::-1]
            titles = tree_chapter.xpath('//*[@id="chapterList2"]/ul/li/a/@title')[::-1]
            for index, (href, title) in enumerate(zip(hrefs, titles)):
                chapter_url = ''.join(('http://m.pufei.net', href))
                # Strip characters that are illegal in Windows file names.
                title = re.sub(r'[\\/:*?"<>|]', "", title)
                dir_path = os.path.join(os.getcwd(), self.keyboard, f"{index}_{title}")
                os.makedirs(dir_path, exist_ok=True)
                self.chapter(chapter_url, dir_path)
                logging.info(f"{dir_path}, {chapter_url}")
        except IndexError as e:
            logging.error(result_chapter.text)
            logging.error(e)
            return

    def chapter(self, chapter_url, dir_path):
        """Download every page image of one chapter into dir_path, spawning
        one background thread per image."""
        result_chapter = requests.get(url=chapter_url, headers=self.headers)
        result_chapter.encoding = 'gb2312'
        # The chapter page embeds its image list as an obfuscated cp="..." blob.
        cp = re.findall(r'cp="\w+.*"', result_chapter.text)[0][4:-1]
        for index, value in enumerate(self.page_url_list(cp)):
            page_url = "http://res.img.pufei.net/" + value
            file_path = os.path.join(dir_path, f"{index}.jpg")
            # Skip images already downloaded on a previous run (resume support).
            if not os.path.exists(file_path):
                threading.Thread(target=self.page, args=(page_url, file_path)).start()

    def page(self, page_url, file_path):
        """Fetch a single image and write it to file_path.

        Runs in a worker thread, so any failure is logged instead of raised —
        an unhandled exception here would die silently anyway.
        """
        try:
            result_page = requests.get(url=page_url, headers=self.headers)
            with open(file_path, 'wb') as fp:
                fp.write(result_page.content)
            logging.info(file_path)
        except Exception as e:
            logging.error(f"{page_url} - {e}")

    @staticmethod
    def page_url_list(cp):
        """Decode the obfuscated ``cp`` blob into the list of image paths.

        The site base64-encodes a JavaScript snippet which, when eval'd,
        yields the image URL list; we replay the site's own decoder with
        execjs.  NOTE(review): this eval's remote content inside the JS
        runtime — it is only as trustworthy as the site itself.
        """
        js_code = '''
        function base64decode(str) {
            var base64DecodeChars = new Array(
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
                52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
                -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
                15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
                -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
                41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1);
            var c1, c2, c3, c4;
            var i = 0, len = str.length, out = "";
            while (i < len) {
                do { c1 = base64DecodeChars[str.charCodeAt(i++) & 255]; } while (i < len && c1 == -1);
                if (c1 == -1) break;
                do { c2 = base64DecodeChars[str.charCodeAt(i++) & 255]; } while (i < len && c2 == -1);
                if (c2 == -1) break;
                out += String.fromCharCode((c1 << 2) | ((c2 & 48) >> 4));
                do {
                    c3 = str.charCodeAt(i++) & 255;
                    if (c3 == 61) return out;
                    c3 = base64DecodeChars[c3];
                } while (i < len && c3 == -1);
                if (c3 == -1) break;
                out += String.fromCharCode(((c2 & 15) << 4) | ((c3 & 60) >> 2));
                do {
                    c4 = str.charCodeAt(i++) & 255;
                    if (c4 == 61) return out;
                    c4 = base64DecodeChars[c4];
                } while (i < len && c4 == -1);
                if (c4 == -1) break;
                out += String.fromCharCode(((c3 & 3) << 6) | c4);
            }
            return out;
        }
        function geturl(cp) {
            return eval(eval(base64decode(cp).slice(4)));
        }
        '''
        return execjs.compile(js_code).call('geturl', cp)


if __name__ == '__main__':
    # Prompt for a comic title and kick off the crawl.
    comic_name = input('Please enter comic name:')
    crawler = PuFei(comic_name)
    crawler.search()
