Python Crawler Series (5.2- Using multiple threads to download images)

1. Basic steps

1. Import the packages

import os

import time

import random

import shutil

import re

import queue

import threading

import requests

from lxml import etree

2. Define global variables

# Directory the images are saved to: a "doutula" folder next to this script.
path = os.path.join(os.path.dirname(__file__), 'doutula')

# Request headers sent with every HTTP request. The key must be the real
# header name "User-Agent" for the server to see a browser-like client.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
}

3. Define a producer to collect the image addresses

class Procuder(threading.Thread):
    """Producer thread: collect image URLs from listing pages.

    Each producer repeatedly takes a page URL from ``page_queue``, downloads
    that page and puts one ``(img_url, filename)`` tuple per image onto
    ``img_queue``.
    """

    # Characters stripped from an image's alt text before it becomes a file name.
    _NAME_CLEANER = re.compile(r'[,.\.\+]')

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the two shared queues.

        :param page_queue: queue of page URLs still to be parsed
        :param img_queue: queue receiving ``(img_url, filename)`` tuples
        """
        super(Procuder, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Parse pages until ``page_queue`` is exhausted."""
        while True:
            # A non-blocking get avoids the race where another producer takes
            # the last URL between an empty() check and a blocking get(),
            # which would leave this thread waiting forever.
            try:
                url = self.page_queue.get_nowait()
            except queue.Empty:
                break
            self.parse_page(url)

    @classmethod
    def _build_filename(cls, alt, img_url):
        """Return the cleaned alt text plus the file extension of ``img_url``."""
        name = cls._NAME_CLEANER.sub('', alt)
        return name + os.path.splitext(img_url)[1]

    def parse_page(self, url):
        """Download one listing page and queue every non-gif image found on it.

        :param url: address of the listing page
        :return: None
        """
        response = requests.get(url=url, headers=headers)
        time.sleep(random.randrange(2))
        if response.status_code != 200:
            return
        html = etree.HTML(response.text)
        imgs = html.xpath('//div[@class="random_article"]/div[@class="col-xs-6 col-sm-3"]/img')
        for img in imgs:
            if img.get('class') == 'gif':
                continue
            img_url = img.get('data-original')
            alt = img.get('alt')
            # Skip images lacking either attribute instead of crashing the thread.
            if img_url is None or alt is None:
                continue
            self.img_queue.put((img_url, self._build_filename(alt, img_url)))

4. Define a consumer to download the images

class Consumer(threading.Thread):
    """Consumer thread: download the images queued by the producers.

    Each consumer repeatedly takes an ``(img_url, filename)`` tuple from
    ``img_queue``, downloads the image and writes it into the ``path``
    directory.
    """

    # Seconds to wait for a new image before re-checking whether work remains.
    idle_timeout = 3

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the two shared queues.

        :param page_queue: queue of page URLs still to be parsed
        :param img_queue: queue of ``(img_url, filename)`` tuples to download
        """
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download images until both queues have been drained.

        The thread exits once no image arrives within ``idle_timeout`` seconds
        and ``page_queue`` is empty. A producer still fetching its last page
        for longer than ``idle_timeout`` can therefore still be missed.
        """
        while True:
            # A timed get never blocks forever: a plain get() after an empty()
            # check hangs if another consumer takes the last item in between.
            try:
                img_url, filename = self.img_queue.get(timeout=self.idle_timeout)
            except queue.Empty:
                if self.page_queue.empty():
                    break
                continue
            time.sleep(random.randrange(2))
            response = requests.get(url=img_url, headers=headers)
            # Do not save an error page under an image file name.
            if response.status_code != 200:
                print('download failed ==>', img_url, response.status_code)
                continue
            print('downloading ==>', img_url, '====', filename)
            with open(os.path.join(path, filename), 'wb') as fp:
                fp.write(response.content)

5. Define the main() entry-point function

def create_dir():
    """Create an empty download directory at ``path``.

    If the directory already exists it is removed together with its contents
    first, so every run starts from an empty folder.

    :return: None
    """
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)

def main():
    """Prepare the download folder, queue the page URLs and start the threads.

    :return: None
    """
    # Create the folder first.
    create_dir()
    page_queue = queue.Queue(100)
    img_queue = queue.Queue(1000)

    # Download images for pages 1-5. range() excludes its stop value, so the
    # stop must be 6 for page 5 to be included.
    for x in range(1, 6):
        url = 'http://www.doutula.com/article/list/?page={0}'.format(x)
        page_queue.put(url)

    # Start 5 producer and 5 consumer threads.
    for x in range(5):
        c = Consumer(page_queue, img_queue)
        p = Procuder(page_queue, img_queue)
        c.start()
        p.start()

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Part two: the complete code

import os

import time

import random

import shutil

import re

import queue

import threading

import requests

from lxml import etree

# Directory the images are saved to: a "doutula" folder next to this script.
path = os.path.join(os.path.dirname(__file__), 'doutula')

# Request headers sent with every HTTP request. The key must be the real
# header name "User-Agent" for the server to see a browser-like client.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
}

class Procuder(threading.Thread):
    """Producer thread: collect image URLs from listing pages.

    Each producer repeatedly takes a page URL from ``page_queue``, downloads
    that page and puts one ``(img_url, filename)`` tuple per image onto
    ``img_queue``.
    """

    # Characters stripped from an image's alt text before it becomes a file name.
    _NAME_CLEANER = re.compile(r'[,.\.\+]')

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the two shared queues.

        :param page_queue: queue of page URLs still to be parsed
        :param img_queue: queue receiving ``(img_url, filename)`` tuples
        """
        super(Procuder, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Parse pages until ``page_queue`` is exhausted."""
        while True:
            # A non-blocking get avoids the race where another producer takes
            # the last URL between an empty() check and a blocking get(),
            # which would leave this thread waiting forever.
            try:
                url = self.page_queue.get_nowait()
            except queue.Empty:
                break
            self.parse_page(url)

    @classmethod
    def _build_filename(cls, alt, img_url):
        """Return the cleaned alt text plus the file extension of ``img_url``."""
        name = cls._NAME_CLEANER.sub('', alt)
        return name + os.path.splitext(img_url)[1]

    def parse_page(self, url):
        """Download one listing page and queue every non-gif image found on it.

        :param url: address of the listing page
        :return: None
        """
        response = requests.get(url=url, headers=headers)
        time.sleep(random.randrange(2))
        if response.status_code != 200:
            return
        html = etree.HTML(response.text)
        imgs = html.xpath('//div[@class="random_article"]/div[@class="col-xs-6 col-sm-3"]/img')
        for img in imgs:
            if img.get('class') == 'gif':
                continue
            img_url = img.get('data-original')
            alt = img.get('alt')
            # Skip images lacking either attribute instead of crashing the thread.
            if img_url is None or alt is None:
                continue
            self.img_queue.put((img_url, self._build_filename(alt, img_url)))

class Consumer(threading.Thread):
    """Consumer thread: download the images queued by the producers.

    Each consumer repeatedly takes an ``(img_url, filename)`` tuple from
    ``img_queue``, downloads the image and writes it into the ``path``
    directory.
    """

    # Seconds to wait for a new image before re-checking whether work remains.
    idle_timeout = 3

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the two shared queues.

        :param page_queue: queue of page URLs still to be parsed
        :param img_queue: queue of ``(img_url, filename)`` tuples to download
        """
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download images until both queues have been drained.

        The thread exits once no image arrives within ``idle_timeout`` seconds
        and ``page_queue`` is empty. A producer still fetching its last page
        for longer than ``idle_timeout`` can therefore still be missed.
        """
        while True:
            # A timed get never blocks forever: a plain get() after an empty()
            # check hangs if another consumer takes the last item in between.
            try:
                img_url, filename = self.img_queue.get(timeout=self.idle_timeout)
            except queue.Empty:
                if self.page_queue.empty():
                    break
                continue
            time.sleep(random.randrange(2))
            response = requests.get(url=img_url, headers=headers)
            # Do not save an error page under an image file name.
            if response.status_code != 200:
                print('download failed ==>', img_url, response.status_code)
                continue
            print('downloading ==>', img_url, '====', filename)
            with open(os.path.join(path, filename), 'wb') as fp:
                fp.write(response.content)

def create_dir():
    """Create an empty download directory at ``path``.

    If the directory already exists it is removed together with its contents
    first, so every run starts from an empty folder.

    :return: None
    """
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)

def main():
    """Prepare the download folder, queue the page URLs and start the threads.

    :return: None
    """
    # Create the folder first.
    create_dir()
    page_queue = queue.Queue(100)
    img_queue = queue.Queue(1000)

    # Download images for pages 1-5. range() excludes its stop value, so the
    # stop must be 6 for page 5 to be included.
    for x in range(1, 6):
        url = 'http://www.doutula.com/article/list/?page={0}'.format(x)
        page_queue.put(url)

    # Start 5 producer and 5 consumer threads.
    for x in range(5):
        c = Consumer(page_queue, img_queue)
        p = Procuder(page_queue, img_queue)
        c.start()
        p.start()

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Python Crawler Series (5.2- Using multiple threads to download images)

Related Posts

Singly linked list data structure of the delete list node | the fifth set of a given location

Getting started with MQ

Why can [this] call the current instance in Java?