I recently helped a friend write a simple crawler and, while I was at it, tidied it up into a novel-crawling tool with a GUI. It is used to collect novels from the Biquge (笔趣阁, literally "pen fun pavilion") site.

The finished interface

The interface during collection

Saved files after collection

Main functions implemented

  1. Multi-threaded collection: one thread collects one novel

  2. Proxy support: with multi-threaded collection in particular, your IP may be blocked if you do not use a proxy

  3. Collection results are output in real time

threading.BoundedSemaphore(), together with pool_sema.acquire() and pool_sema.release(), is used to limit the number of threads so that concurrency does not get out of hand. The limit can be entered in the software interface; the default is 5 threads.

# Before any thread task starts
pool_sema = threading.BoundedSemaphore(5)
# Take a slot before each thread starts its work
pool_sema.acquire()
...
# Give the slot back when the thread's work ends
pool_sema.release()

Third party modules used

pip install requests
pip install pysimplegui
pip install lxml
pip install pyinstaller

The GUI uses PySimpleGUI, a wrapper library around Tkinter. It is very convenient to use: the interface is not pretty, but it wins on simplicity and is well suited to developing small tools. Documentation: https://pysimplegui.readthedocs.io/en/latest/ . For example, the layout of this interface is just a few simple lists:

# Window layout: each inner list is one row of widgets, top to bottom.
layout = [
    # Clickable caption; enable_events=True makes a click emit the "openwebsite" event.
    [sg.Text('Enter the url of the novel you want to climb, click here to open biquge site copy', font=("Microsoft YaHei", 12),
             key="openwebsite", enable_events=True, tooltip="Click open in browser")],
    [sg.Text("Fiction table of Contents page URL, one in a row :")],
    # URL input box; the right-click menu entry text ("Paste") is also the event it emits.
    [sg.Multiline('', key="url", size=(120, 6), autoscroll=True, expand_x=True,
                  right_click_menu=['&Right', ['Paste']])],
    # Hidden error line, made visible when an address fails validation.
    [sg.Text(visible=False, text_color="#ff0000", key="error")],
    [sg.Button(button_text='Start collecting', key="start", size=(20, 1)),
     sg.Button(button_text='Open download directory', key="opendir",
               size=(20, 1), button_color="#999999")],
    # NOTE(review): the example proxy string was garbled in this copy; the IP below is a placeholder.
    [sg.Text('Enter the IP proxy. With password: username:password@ip:port, without password: ip:port. Example: demo:123456@127.0.0.1:8580')],
    [sg.Input('', key="proxy"),
     sg.Text('Thread count :'),
     sg.Input('5', key="threadnum")],
    # Read-only log area that receives the collection results.
    [sg.Multiline('Waiting for collection', key="res", disabled=True, border_width=0, background_color="#ffffff",
                  size=(120, 6), no_scrollbar=False, autoscroll=True, expand_x=True, expand_y=True,
                  font=("宋体", 10), text_color="#999999")],
]

Package as exe command

pyinstaller -Fw <your_script>.py

All the source code

import time
import requests
import os
import sys
import re
import random
from lxml import etree
import webbrowser
import PySimpleGUI as sg
import threading

# user-agent
# Headers sent with every HTTP request (a desktop Chrome User-Agent string).
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
}
# Proxy mapping handed to requests; empty means "no proxy".
proxies = {}
# Biquge base address.
# NOTE(review): the address is empty in this copy of the source. Set it to the
# site root (with trailing slash) before running; main() uses it to validate
# the entered URLs and to open the site in the browser.
baseurl = ''
# number of threads
threadNum = 6
# Semaphore limiting concurrent downloads; created in main() when collection starts.
pool_sema = None
# True while a collection run is active; worker threads stop when it turns False.
cjstatus = False

# TXT Store directory
filePath = os.path.abspath(os.path.join(os.getcwd(), 'txt'))
# Create the output directory up front so worker threads can open files in it.
os.makedirs(filePath, exist_ok=True)

# Delete special characters
# Characters stripped from book titles before they are used as file names:
# ASCII punctuation that is illegal or awkward in file names, plus common
# full-width CJK punctuation.
# NOTE(review): the full-width characters were garbled in this copy of the
# source and are reconstructed here; confirm against the original list.
_TITLE_STRIP_RE = re.compile(
    r"""[\[\]#/\\:*,;?"'<>|()《》&^!~=%{}@！：。·￥…（）]"""
)


def deletetag(text):
    """Return *text* with every special character removed.

    Used to turn a book title into a safe file name. Matching characters
    are deleted, not replaced; all other characters (including spaces)
    are kept unchanged.
    """
    return _TITLE_STRIP_RE.sub('', text)

# entry
def main():
    """Build the GUI window and run its event loop until the window is closed.

    Handles these events:
      * "openwebsite" - open ``baseurl`` in the default browser.
      * "opendir"     - open the download directory in Windows Explorer.
      * "start"       - toggle collection: stop a running collection, or
                        validate the entered URLs and start one daemon thread
                        per novel running ``downloadbybook``.
      * "Paste"       - right-click menu entry of the URL box.
      * "-THREAD-"    - progress message posted by a worker thread; its text
                        is appended to the result box.

    Updates the module globals ``cjstatus``, ``proxies``, ``threadNum`` and
    ``pool_sema``.
    """
    global cjstatus, proxies, threadNum, pool_sema
    # Event key the worker threads post progress under; must match the key
    # used by downloadbybook's write_event_value calls.
    thread_event = '-THREAD-'
    layout = [
        [sg.Text('Enter the url of the novel you want to climb, click here to open biquge site copy', font=("Microsoft YaHei", 12),
                 key="openwebsite", enable_events=True, tooltip="Click open in browser")],
        [sg.Text("Fiction table of Contents page URL, one in a row :")],
        [sg.Multiline('', key="url", size=(120, 6), autoscroll=True, expand_x=True,
                      right_click_menu=['&Right', ['Paste']])],
        [sg.Text(visible=False, text_color="#ff0000", key="error")],
        [sg.Button(button_text='Start collecting', key="start", size=(20, 1)),
         sg.Button(button_text='Open download directory', key="opendir",
                   size=(20, 1), button_color="#999999")],
        # NOTE(review): the example proxy string was garbled in this copy; the IP is a placeholder.
        [sg.Text('Enter the IP proxy. With password: username:password@ip:port, without password: ip:port. Example: demo:123456@127.0.0.1:8580')],
        [sg.Input('', key="proxy"),
         sg.Text('Thread count :'),
         sg.Input('5', key="threadnum")],
        [sg.Multiline('Waiting for collection', key="res", disabled=True, border_width=0, background_color="#ffffff",
                      size=(120, 6), no_scrollbar=False, autoscroll=True, expand_x=True, expand_y=True,
                      font=("宋体", 10), text_color="#999999")],
    ]
    window = sg.Window('Collection of Biquge novels', layout, size=(800, 500), resizable=True)
    while True:
        event, values = window.read()
        if event == sg.WIN_CLOSED or event == 'close':  # if user closes window or clicks cancel
            break
        if event == "openwebsite":
            webbrowser.open('%s' % baseurl)
        elif event == 'opendir':
            # Quote the path so directories containing spaces still open.
            os.system('start explorer "%s"' % filePath)
        elif event == 'start':
            if cjstatus:
                # A run is active: this click means "stop". Workers poll cjstatus.
                cjstatus = False
                window['start'].update('Has stopped... Click Restart ')
                continue
            window['error'].update("", visible=False)
            # Escape baseurl so its dots are matched literally.
            url_pattern = r'%s\d+_\d+/' % re.escape(baseurl)
            # Build a new list instead of deleting from the list being iterated,
            # which would skip the entry following each removed one.
            urls = []
            for line in values['url'].strip().split("\n"):
                url = line.strip()
                if re.match(url_pattern, url):
                    urls.append(url)
                elif url:
                    window['error'].update("Address error :%s" % url, visible=True)

            if not urls:
                window['error'].update(
                    "Each line of address must conform to %s84_84370/ form" % baseurl, visible=True)
                continue
            # agent
            proxy = values['proxy'].strip()
            if len(proxy) > 8:
                proxies = {
                    "http": "http://%s" % proxy,
                    "https": "http://%s" % proxy,
                }
            else:
                # Field cleared: do not keep using a proxy from an earlier run.
                proxies = {}
            # number of threads; non-numeric input keeps the previous value.
            try:
                requested_threads = int(values['threadnum'])
            except ValueError:
                requested_threads = 0
            if requested_threads > 0:
                threadNum = requested_threads
            pool_sema = threading.BoundedSemaphore(threadNum)
            cjstatus = True
            window['start'].update('In collection... Click Stop ')
            window['res'].update('Start collecting')

            for url in urls:
                threading.Thread(target=downloadbybook, args=(
                    url, window,), daemon=True).start()
        elif event == "Paste":
            # NOTE(review): this handler's body is missing from this copy of the
            # source; filling the URL box from the clipboard is a reconstruction.
            window['url'].update(sg.clipboard_get())

        print("event", event)
        if event == thread_event:
            # Value is (thread name, message); show the message in the log box.
            strtext = values[thread_event][1]
            window['res'].update(strtext, append=True)
    # Window closed: tell any remaining worker threads to stop.
    cjstatus = False
    window.close()

# download
def _report(window, text):
    """Post one progress line to the GUI thread under the '-THREAD-' event."""
    window.write_event_value(
        '-THREAD-', (threading.current_thread().name, text))


def downloadbybook(page_url, window):
    """Download every chapter of one novel and append it to a TXT file.

    Runs in a worker thread. Fetches the table-of-contents page at
    ``page_url``, then fetches each chapter page and appends its text to
    ``<filePath>/<sanitised book title>.txt``. Progress and errors are
    reported to the GUI through ``window.write_event_value``.

    Args:
        page_url: URL of the novel's table-of-contents page; chapter hrefs
            are appended to it.
        window: the PySimpleGUI window that receives '-THREAD-' events.

    The function stops early when the global ``cjstatus`` becomes False.
    A chapter that fails to download is reported and skipped.
    """
    # thread lock: hold one semaphore slot for the whole book. Using `with`
    # guarantees the slot is released on every exit path, so a failed book
    # cannot permanently use up the thread limit.
    with pool_sema:
        if not cjstatus:
            return
        try:
            # A timeout keeps a dead connection from blocking this slot forever.
            bookpage = requests.get(
                url=page_url, headers=header, proxies=proxies, timeout=30)
        except Exception as e:
            _report(window, '\n request %s error, cause :%s' % (page_url, e))
            return

        if bookpage.status_code != 200:
            _report(window, '\n request %s error, cause :%s' %
                    (page_url, bookpage.reason))
            return

        bookpage.encoding = 'utf-8'
        page_tree = etree.HTML(bookpage.text)
        titles = page_tree.xpath('//div[@id="info"]/h1/text()')
        if not titles:
            _report(window, '\n request %s error, cause :%s' %
                    (page_url, 'book title not found'))
            return
        bookname = titles[0]
        bookfilename = filePath + '/' + deletetag(bookname) + '.txt'
        # NOTE(review): the chapter-list XPath is missing from this copy of the
        # source. This expression assumes each chapter is a <dd> holding an <a>
        # inside div#list - confirm against the actual page markup.
        zj_list = page_tree.xpath('//div[@id="list"]/dl/dd')
        for chapter in zj_list:
            if not cjstatus:
                break
            hrefs = chapter.xpath('./a/@href')
            if not hrefs:
                # <dd> without a link (e.g. a spacer): nothing to download.
                continue
            zjurl = page_url + hrefs[0]
            names = chapter.xpath('./a/@title')
            zjname = names[0] if names else hrefs[0]
            try:
                zjpage = requests.get(
                    zjurl, headers=header, proxies=proxies, timeout=30)
            except Exception as e:
                # No response object exists here, so report the exception itself.
                _report(window, '\n Request %s:%s error, cause :%s' %
                        (zjname, zjurl, e))
                continue

            if zjpage.status_code != 200:
                _report(window, '\n Request %s:%s error, cause :%s' %
                        (zjname, zjurl, zjpage.reason))
                continue
            zjpage.encoding = 'utf-8'
            zjpage_content = etree.HTML(zjpage.text).xpath(
                '//div[@id="content"]/text()')
            # Chapter heading followed by one stripped text node per line.
            content = "\n【" + zjname + "】\n"
            content += ''.join(part.strip() + '\n' for part in zjpage_content)
            with open(bookfilename, 'a+', encoding='utf-8') as fs:
                fs.write(content)
            _report(window, '\n%s:%s Collection succeeded ' % (bookname, zjname))

        # Download complete
        _report(window, '\n Request %s end ' % page_url)

# Run the GUI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

