I recently helped a friend write a simple crawler and, while tidying it up, turned it into a novel-crawler tool with a GUI, used to download novels from the Biquge (笔趣阁) site.
The finished interface
The interface during collection
Stored files after collection
Main functions implemented
- Multi-threaded collection, with one thread per novel
- Proxy support; with multi-threaded collection in particular, the site may block your IP if no proxy is used
- Output the collection result in real time
The tool uses threading.BoundedSemaphore() together with pool_sema.acquire() and pool_sema.release() to limit the number of threads, so that too many threads do not run concurrently. The limit can be entered in the software interface; the default is 5 threads.
# Before any thread task starts, create the semaphore
pool_sema = threading.BoundedSemaphore(5)
# At the start of each thread's work, take a slot
pool_sema.acquire()
...
# When the thread's work is finished, give the slot back
pool_sema.release()
Third party modules used
pip install requests
pip install pysimplegui
pip install lxml
pip install pyinstaller
Copy the code
The GUI uses PySimpleGUI, a wrapper library around Tkinter. It is very convenient to use; the interface is not beautiful, but it wins on simplicity and is well suited to developing small tools. Documentation: pysimplegui.readthedocs.io/en/latest/. For example, the layout of the interface is just a few simple lists:
# Window layout: each inner list is one row of the window.
layout = [
    # Banner text; enable_events=True makes a click emit the "openwebsite" event.
    [sg.Text('Enter the url of the novel you want to climb, click here to open biquge site copy',
             font=("Microsoft Yahei", 12), key="openwebsite", enable_events=True,
             tooltip="Click open in browser")],
    [sg.Text("Fiction table of Contents page URL, one in a row :")],
    [
        # The right-click menu entry text is also the event name ("Paste").
        sg.Multiline('', key="url", size=(120, 6), autoscroll=True, expand_x=True,
                     right_click_menu=['&Right', ['Paste']])
    ],
    # Hidden until a validation error has to be shown.
    [sg.Text(visible=False, text_color="#ff0000", key="error")],
    [
        sg.Button(button_text='Start collecting', key="start", size=(20, 1)),
        sg.Button(button_text='Open download directory', key="opendir",
                  size=(20, 1), button_color="#999999"),
    ],
    [sg.Text('Enter the IP proxy. With password: username:password@ip:port, '
             'without password: ip:port. Example: demo:123456@123.1.2.8:8580')],
    [
        sg.Input('', key="proxy"),
        sg.Text('Thread count :'),
        sg.Input('5', key="threadnum"),
    ],
    [
        # Read-only log area that receives the collection results.
        sg.Multiline('Waiting for collection', key="res", disabled=True, border_width=0,
                     background_color="#ffffff", size=(120, 6), no_scrollbar=False,
                     autoscroll=True, expand_x=True, expand_y=True,
                     font=("宋体", 10), text_color="#999999")
    ],
]
Package as exe command
pyinstaller -Fw start.py
Copy the code
All the source code
import time
import requests
import os
import sys
import re
import random
from lxml import etree
import webbrowser
import PySimpleGUI as sg
import threading
# Request headers: present a desktop Chrome User-Agent to the site.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
}
# Proxy mapping handed to requests; main() fills it in from the GUI proxy field.
proxies = {}
# Biquge base address
baseurl = 'https://www.xbiquwx.la/'
# Fallback number of concurrent download threads; main() overrides it with
# the value typed into the GUI whenever that value is a positive integer.
threadNum = 6
# Semaphore limiting concurrent downloads; created by main() when collection starts.
pool_sema = None
# Event key used by worker threads to post messages to the GUI event loop.
THREAD_EVENT = '-THREAD-'
# True while a collection run is active; workers stop when it becomes False.
cjstatus = False
# TXT store directory (a "txt" folder under the current working directory).
filePath = os.path.abspath(os.path.join(os.getcwd(), 'txt'))
# exist_ok=True avoids the check-then-create race of exists() + mkdir().
os.makedirs(filePath, exist_ok=True)
# Delete special characters
# Characters stripped from titles before they are used as file names:
# ASCII punctuation that is illegal or awkward in paths, plus the common
# full-width (CJK) punctuation marks.
_SPECIAL_CHARS = re.compile(r'[\[\]#/\\:*,;?"\'<>|()《》&^!~=%{}@!:。·¥…()]')


def deletetag(text):
    """Return *text* with special/punctuation characters removed.

    Used to turn a book title into a safe file name. Every character
    matched by ``_SPECIAL_CHARS`` is deleted; all other characters,
    including spaces, are kept unchanged.

    Args:
        text: The raw title string.

    Returns:
        The title with the special characters deleted.
    """
    return _SPECIAL_CHARS.sub('', text)
# entry
def main():
    """Build the GUI window and run its event loop until the window is closed.

    Events handled:
      * ``openwebsite`` - open ``baseurl`` in the web browser.
      * ``opendir``     - open the download directory in Explorer (Windows).
      * ``start``       - toggle collection: stop a running collection, or
                          validate the entered URLs, apply the proxy and
                          thread-count settings, and start one daemon thread
                          per novel URL.
      * ``Paste``       - replace the URL box content with the clipboard text.
      * ``THREAD_EVENT``- append a worker thread's message to the result box.

    Side effects: updates the module globals ``cjstatus``, ``proxies``,
    ``threadNum`` and ``pool_sema``.
    """
    global cjstatus, proxies, threadNum, pool_sema
    sg.theme("reddit")
    layout = [
        # Banner text; enable_events=True makes a click emit "openwebsite".
        [sg.Text('Enter the url of the novel you want to climb, click here to open biquge site copy',
                 font=("Microsoft Yahei", 12), key="openwebsite", enable_events=True,
                 tooltip="Click open in browser")],
        [sg.Text("Fiction table of Contents page URL, one in a row :")],
        [
            # The menu entry text is the event name, so it must be exactly
            # "Paste" to reach the handler in the loop below.
            sg.Multiline('', key="url", size=(120, 6), autoscroll=True, expand_x=True,
                         right_click_menu=['&Right', ['Paste']])
        ],
        # Hidden until a validation error has to be shown.
        [sg.Text(visible=False, text_color="#ff0000", key="error")],
        [
            sg.Button(button_text='Start collecting', key="start", size=(20, 1)),
            sg.Button(button_text='Open download directory', key="opendir",
                      size=(20, 1), button_color="#999999"),
        ],
        [sg.Text('Enter the IP proxy. With password: username:password@ip:port, '
                 'without password: ip:port. Example: demo:123456@123.1.2.8:8580')],
        [
            sg.Input('', key="proxy"),
            sg.Text('Thread count :'),
            sg.Input('5', key="threadnum"),
        ],
        [
            # Read-only log area that receives the worker threads' messages.
            sg.Multiline('Waiting for collection', key="res", disabled=True, border_width=0,
                         background_color="#ffffff", size=(120, 6), no_scrollbar=False,
                         autoscroll=True, expand_x=True, expand_y=True,
                         font=("宋体", 10), text_color="#999999")
        ],
    ]
    window = sg.Window('Collection of Biquge novels', layout, size=(800, 500), resizable=True)

    # Compiled once; baseurl is escaped so that its dots match literally.
    url_pattern = re.compile(r'%s\d+_\d+/' % re.escape(baseurl))

    while True:
        event, values = window.read()
        if event == sg.WIN_CLOSED or event == 'close':  # if user closes window or clicks cancel
            break
        if event == "openwebsite":
            webbrowser.open('%s' % baseurl)
        elif event == 'opendir':
            # Windows-only: relies on the "start" and "explorer" commands.
            os.system('start explorer ' + filePath)
        elif event == 'start':
            if cjstatus:
                # A collection is running: this click means "stop".
                cjstatus = False
                window['start'].update('Has stopped... Click Restart ')
                continue
            window['error'].update("", visible=False)
            # Build a new list instead of deleting from the list being
            # iterated, which would skip the entry after each removal.
            urls = []
            for line in values['url'].strip().split("\n"):
                url = line.strip()
                if url_pattern.match(url):
                    urls.append(url)
                elif url:
                    window['error'].update("Address error :%s" % url, visible=True)
            if not urls:
                window['error'].update(
                    "Each line of address must conform to %s84_84370/ form" % baseurl, visible=True)
                continue
            # Proxy: the same HTTP proxy is used for http and https traffic.
            proxy = values['proxy'].strip()
            if len(proxy) > 8:
                proxies = {
                    "http": "http://%s" % proxy,
                    "https": "http://%s" % proxy,
                }
            else:
                # Field left empty or cleared: do not keep a proxy from an earlier run.
                proxies = {}
            # Number of threads: ignore empty or non-numeric input and keep the previous value.
            try:
                requested_threads = int(values['threadnum'])
            except (TypeError, ValueError):
                requested_threads = 0
            if requested_threads > 0:
                threadNum = requested_threads
            pool_sema = threading.BoundedSemaphore(threadNum)
            cjstatus = True
            window['start'].update('In collection... Click Stop ')
            window['res'].update('Start collecting')
            for url in urls:
                threading.Thread(target=downloadbybook, args=(
                    url, window,), daemon=True).start()
        elif event == "Paste":
            window['url'].update(sg.clipboard_get())
        print("event", event)
        if event == THREAD_EVENT:
            # Worker threads post (thread_name, message) tuples.
            strtext = values[THREAD_EVENT][1]
            window['res'].update(window['res'].get() + "\n" + strtext)
    # Tell any still-running worker threads to stop before the window goes away.
    cjstatus = False
    window.close()
# download
def _notify(window, message):
    """Post *message* to the GUI event loop, tagged with the current thread's name."""
    window.write_event_value(
        THREAD_EVENT, (threading.current_thread().name, message))


def downloadbybook(page_url, window):
    """Download every chapter of one novel and append it to a TXT file.

    Runs in a worker thread. The whole download holds one slot of the
    module-level ``pool_sema`` semaphore, so at most ``threadNum`` novels
    are fetched at the same time. The slot is released on every exit path
    (normal end, early return, or exception) because it is taken with a
    ``with`` statement.

    Progress and errors are reported to the GUI through
    ``window.write_event_value``. The loop stops early when the global
    ``cjstatus`` flag becomes false.

    Args:
        page_url: URL of the novel's table-of-contents page.
        window: The PySimpleGUI window that receives progress events.
    """
    # Without a timeout a stalled connection would block this thread, and
    # its semaphore slot, forever.
    request_timeout = 30

    with pool_sema:
        # The user may have pressed "stop" while this thread was waiting for a slot.
        if not cjstatus:
            return
        try:
            bookpage = requests.get(
                url=page_url, headers=header, proxies=proxies, timeout=request_timeout)
        except Exception as e:  # worker boundary: report any failure, never crash the thread
            _notify(window, '\n request %s error, cause :%s' % (page_url, e))
            return
        if bookpage.status_code != 200:
            _notify(window, '\n request %s error, cause :%s' % (page_url, bookpage.reason))
            return
        bookpage.encoding = 'utf-8'
        page_tree = etree.HTML(bookpage.text)
        book_titles = page_tree.xpath('//div[@id="info"]/h1/text()')
        if not book_titles:
            # Page layout is not what the xpath expects; nothing can be collected.
            _notify(window, '\n request %s error, cause :%s' % (page_url, 'book title not found'))
            return
        bookname = book_titles[0]
        bookfilename = os.path.join(filePath, deletetag(bookname) + '.txt')
        zj_list = page_tree.xpath(
            '//div[@class="box_con"]/div[@id="list"]/dl/dd')
        for chapter in zj_list:
            if not cjstatus:
                break
            hrefs = chapter.xpath('./a/@href')
            titles = chapter.xpath('./a/@title')
            if not hrefs or not titles:
                # <dd> entry without a chapter link: skip it.
                continue
            zjurl = page_url + hrefs[0]
            zjname = titles[0]
            try:
                zjpage = requests.get(
                    zjurl, headers=header, proxies=proxies, timeout=request_timeout)
            except Exception as e:  # report this chapter's failure and move on
                _notify(window, '\n Request %s:%s error, cause :%s' % (zjname, zjurl, e))
                continue
            if zjpage.status_code != 200:
                _notify(window, '\n Request %s:%s error, cause :%s' % (zjname, zjurl, zjpage.reason))
                return
            zjpage.encoding = 'utf-8'
            zjpage_content = etree.HTML(zjpage.text).xpath('//div[@id="content"]/text()')
            # Chapter heading followed by the chapter's text, one stripped line per text node.
            content = "\n【" + zjname + "】\n" + "".join(
                part.strip() + '\n' for part in zjpage_content)
            with open(bookfilename, 'a+', encoding='utf-8') as fs:
                fs.write(content)
            _notify(window, '\n%s:%s Collection succeeded ' % (bookname, zjname))
            # Small random pause between chapter requests.
            time.sleep(random.uniform(0.05, 0.2))
        # Download complete
        _notify(window, '\n Request %s end ' % page_url)
# Start the GUI only when the file is run as a script, not when it is imported.
if __name__ == '__main__':
    main()
Copy the code
References
- Multi-threaded concurrency docs.python.org/zh-cn/3/lib…
- PySimpleGUI pysimplegui.readthedocs.io/en/latest/
- Requests docs.python-requests.org/zh_CN/lates…