Recently I helped a friend write a simple crawler and, while I was at it, tidied it up into a GUI tool for collecting novels from the Biquge (笔趣阁) site.

Screenshots (images omitted from this text version): the finished interface, the collection in progress, and the saved files after collection.

Main features implemented

  1. Multi-threaded collection, one thread per novel
  2. Proxy support; multi-threaded collection in particular risks an IP ban without one (see the sketch after this list)
  3. Collection results are output in real time
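
For the proxy support, here is a minimal sketch of how a proxy in the tool's expected format is passed to requests; the address and credentials are placeholders, not real values:

import requests

proxy = "user:password@123.1.2.8:8580"   # placeholder, username:password@ip:port
proxies = {
    "http": "http://%s" % proxy,
    "https": "http://%s" % proxy,        # https is tunnelled through the same http proxy
}
resp = requests.get("https://www.xbiquwx.la/", proxies=proxies, timeout=10)
print(resp.status_code)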

threading.BoundedSemaphore() together with pool_sema.acquire() / pool_sema.release() limits the number of threads, so that too many don't run concurrently. The exact limit can be entered in the software interface; the default is 5 threads.

# before any threads start
pool_sema = threading.BoundedSemaphore(5)

# around the work of each thread
pool_sema.acquire()
...
pool_sema.release()
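
A self-contained sketch of the same throttling pattern, with a sleep standing in for the download of one novel:

import threading
import time

pool_sema = threading.BoundedSemaphore(5)      # at most 5 workers at a time

def worker(n):
    pool_sema.acquire()                        # blocks while all 5 slots are taken
    try:
        time.sleep(0.1)                        # stand-in for collecting one novel
        print("novel", n, "done")
    finally:
        pool_sema.release()                    # always free the slot

threads = [threading.Thread(target=worker, args=(i,)) for i in range(20)]
for t in threads:
    t.start()
for t in threads:
    t.join()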

Third-party modules used

pip install requests
pip install pysimplegui
pip install lxml
pip install pyinstaller


The GUI uses PySimpleGUI, a library that wraps Tkinter. It is very convenient to use: the interface is not beautiful, but it wins on simplicity, which makes it a good fit for small tools (docs: pysimplegui.readthedocs.io/en/latest/). For example, the whole interface layout is just a few simple lists:

layout = [
        [sg.Text('Enter the URL of the novel you want to crawl; click here to open the Biquge site and copy it', font=("Microsoft Yahei", 12),
                 key="openwebsite", enable_events=True, tooltip="Click to open in browser")],
        [sg.Text("Novel table-of-contents page URLs, one per line:")],
        [
            sg.Multiline('', key="url", size=(120, 6), autoscroll=True, expand_x=True,
                         right_click_menu=['&Right', ['paste']])
        ],
        [sg.Text(visible=False, text_color="#ff0000", key="error")],
        [
            sg.Button(button_text='Start collecting', key="start", size=(20, 1)),
            sg.Button(button_text='Open download directory', key="opendir",
                      size=(20, 1), button_color="#999999")
        ],
        [sg.Text('Enter the IP proxy as username:password@ip:port (with auth) or ip:port (no auth), e.g. 123456:123456@123.1.2.8:8580')],
        [
            sg.Input('', key="proxy"),
            sg.Text('Thread count:'),
            sg.Input('5', key="threadnum"),
        ],
        [
            sg.Multiline('Waiting for collection', key="res", disabled=True, border_width=0, background_color="#ffffff",
                         size=(120, 6), no_scrollbar=False, autoscroll=True, expand_x=True, expand_y=True,
                         font=("宋体", 10), text_color="#999999")
        ],
]

Command to package it as an exe:

pyinstaller -Fw start.py
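
Here -F bundles everything into a single exe and -w hides the console window, which is what a GUI tool wants. If you also want a custom icon, PyInstaller accepts -i (icon.ico below is a placeholder file you would supply yourself):

pyinstaller -F -w -i icon.ico start.py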


Full source code

import time
import requests
import os
import sys
import re
import random
from lxml import etree
import webbrowser
import PySimpleGUI as sg
import threading


# user-agent
header = {
    "User-Agent": "Mozilla / 5.0 (Windows NT 10.0; Win64; X64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
}
# proxy
proxies = {}
# Biquge base address
baseurl = 'https://www.xbiquwx.la/'
# number of threads
threadNum = 6
pool_sema = None
THREAD_EVENT = '-THREAD-'
cjstatus = False

# TXT Store directory
filePath = os.path.abspath(os.path.join(os.getcwd(), 'txt'))
if not os.path.exists(filePath):
    os.mkdir(filePath)

# Delete special characters (illegal in filenames) from the title
def deletetag(text):
    return re.sub(r'[\[\]#/\\:*,;?"\'<>|(){}&^!~=%@！：。·￥……（）]', '', text)
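
# Illustrative example (not from the original source): with the reconstructed
# pattern above, deletetag('我的书名?:第1章*') returns '我的书名第1章',
# i.e. the title becomes safe to use as a Windows filename.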

# entry
def main():
    global cjstatus, proxies, threadNum, pool_sema
    sg.theme("reddit")
    layout = [
        [sg.Text('Enter the URL of the novel you want to crawl; click here to open the Biquge site and copy it', font=("Microsoft Yahei", 12),
                 key="openwebsite", enable_events=True, tooltip="Click to open in browser")],
        [sg.Text("Novel table-of-contents page URLs, one per line:")],
        [
            sg.Multiline('', key="url", size=(120, 6), autoscroll=True, expand_x=True,
                         right_click_menu=['&Right', ['paste']])
        ],
        [sg.Text(visible=False, text_color="#ff0000", key="error")],
        [
            sg.Button(button_text='Start collecting', key="start", size=(20, 1)),
            sg.Button(button_text='Open download directory', key="opendir",
                      size=(20, 1), button_color="#999999")
        ],
        [sg.Text('Enter the IP proxy as username:password@ip:port (with auth) or ip:port (no auth), e.g. 123456:123456@123.1.2.8:8580')],
        [
            sg.Input('', key="proxy"),
            sg.Text('Thread count:'),
            sg.Input('5', key="threadnum"),
        ],
        [
            sg.Multiline('Waiting for collection', key="res", disabled=True, border_width=0, background_color="#ffffff",
                         size=(120, 6), no_scrollbar=False, autoscroll=True, expand_x=True, expand_y=True,
                         font=("宋体", 10), text_color="#999999")
        ],
    ]
    window = sg.Window('Collection of Biquge novels', layout, size=(800, 500), resizable=True)
    while True:
        event, values = window.read()
        if event == sg.WIN_CLOSED or event == 'close':  # if user closes window or clicks cancel
            break
        if event == "openwebsite":
            webbrowser.open('%s' % baseurl)
        elif event == 'opendir':
            os.system('start explorer ' + filePath)
        elif event == 'start':
            if cjstatus:
                cjstatus = False
                window['start'].update('Stopped... click to restart')
                continue
            window['error'].update("", visible=False)
            # keep only well-formed catalogue URLs; report the bad ones
            urls = []
            for url in values['url'].strip().split("\n"):
                url = url.strip()
                if re.match(r'%s\d+_\d+/' % baseurl, url):
                    urls.append(url)
                elif len(url) > 0:
                    window['error'].update("Address error: %s" % url, visible=True)

            if len(urls) < 1:
                window['error'].update(
                    "Each address line must match the form %s84_84370/" % baseurl, visible=True)
                continue
            # proxy
            if len(values['proxy']) > 8:
                proxies = {
                    "http": "http://%s" % values['proxy'],
                    "https": "http://%s" % values['proxy']
                }
            # number of threads
            if values['threadnum'] and int(values['threadnum']) > 0:
                threadNum = int(values['threadnum'])
            pool_sema = threading.BoundedSemaphore(threadNum)
            cjstatus = True
            window['start'].update('Collecting... click to stop')
            window['res'].update('Start collecting')

            for url in urls:
                threading.Thread(target=downloadbybook, args=(
                    url.strip(), window,), daemon=True).start()
        elif event == "Paste":
            window['url'].update(sg.clipboard_get())

        print("event", event)
        if event == THREAD_EVENT:
            strtext = values[THREAD_EVENT][1]
            window['res'].update(window['res'].get()+"\n"+strtext)
    cjstatus = False
    window.close()

# download
def downloadbybook(page_url, window):
    try:
        bookpage = requests.get(url=page_url, headers=header, proxies=proxies)
    except Exception as e:
        window.write_event_value(
            '-THREAD-', (threading.current_thread().name, '\n request %s error, cause :%s' % (page_url, e)))
        return
    if not cjstatus:
        return
    # throttle: acquire a slot from the semaphore
    pool_sema.acquire()

    if bookpage.status_code != 200:
        window.write_event_value(
            '-THREAD-', (threading.current_thread().name, '\n request %s error, cause: %s' % (page_url, bookpage.reason)))
        pool_sema.release()  # free the slot before bailing out
        return

    bookpage.encoding = 'utf-8'
    page_tree = etree.HTML(bookpage.text)
    bookname = page_tree.xpath('//div[@id="info"]/h1/text()')[0]
    bookfilename = filePath + '/' + deletetag(bookname) + '.txt'
    zj_list = page_tree.xpath(
        '//div[@class="box_con"]/div[@id="list"]/dl/dd')
    for dd in zj_list:
        if not cjstatus:
            break
        zjurl = page_url + dd.xpath('./a/@href')[0]
        zjname = dd.xpath('./a/@title')[0]
        try:
            zjpage = requests.get(
                zjurl, headers=header, proxies=proxies)
        except Exception as e:
            window.write_event_value('-THREAD-', (threading.current_thread(
            ).name, '\n Request %s:%s error, cause: %s' % (zjname, zjurl, e)))
            continue

        if zjpage.status_code != 200:
            window.write_event_value('-THREAD-', (threading.current_thread(
            ).name, '\n Request %s:%s error, cause: %s' % (zjname, zjurl, zjpage.reason)))
            pool_sema.release()  # free the slot before bailing out
            return
        
        zjpage.encoding = 'utf-8'
        zjpage_content = etree.HTML(zjpage.text).xpath('//div[@id="content"]/text()')
        content = "\n【" + zjname + "】\n"
        for line in zjpage_content:
            content += line.strip() + '\n'
        with open(bookfilename, 'a+', encoding='utf-8') as fs:
            fs.write(content)
            window.write_event_value(
                '-THREAD-', (threading.current_thread().name, '\n%s:%s Collection succeeded ' % (bookname, zjname)))
        time.sleep(random.uniform(0.05, 0.2))

    # Download complete
    window.write_event_value('-THREAD-', (threading.current_thread(
    ).name, '\n Request %s finished' % page_url))
    pool_sema.release()


if __name__ == '__main__':
    main()

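One detail worth pointing out: the worker threads never touch the widgets directly. They call window.write_event_value('-THREAD-', ...), which queues a message; the main loop then receives it as event == THREAD_EVENT and appends it to the 'res' box. A minimal sketch of that pattern on its own (the one-element layout here is a stand-in, not the tool's real window):

import threading
import time
import PySimpleGUI as sg

THREAD_EVENT = '-THREAD-'

def worker(window):
    for i in range(3):
        time.sleep(0.5)
        # safe cross-thread messaging: queue an event for the GUI loop to pick up
        window.write_event_value(THREAD_EVENT, (threading.current_thread().name, 'step %d' % i))

window = sg.Window('demo', [[sg.Multiline('', key='res', size=(60, 6))]], finalize=True)
threading.Thread(target=worker, args=(window,), daemon=True).start()
while True:
    event, values = window.read()
    if event == sg.WIN_CLOSED:
        break
    if event == THREAD_EVENT:
        name, text = values[THREAD_EVENT]
        window['res'].update(window['res'].get() + '\n' + name + ': ' + text)
window.close()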

References

  1. Multi-threaded concurrency docs.python.org/zh-cn/3/lib…
  2. PySimpleGUI pysimplegui.readthedocs.io/en/latest/
  3. Requests docs.python-requests.org/zh_CN/lates…