I. Basic steps

1. Import the packages

import os

import time

import random

import shutil

import re

import queue

import threading

import requests

from lxml import etree

2. Define global variables

# Folder that receives the downloaded images; it sits in the script's directory.
path = os.path.join(os.path.dirname(__file__), 'doutula')

# Request headers passed to every requests.get() call so the scraper
# identifies itself as a desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
}

3. Define a producer to get the image addresses

class Procuder(threading.Thread):
    """Producer thread: scrape listing pages and queue the image URLs found.

    The class name is spelled as in the original code because callers
    refer to it by this name.
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared work queues.

        :param page_queue: queue of listing-page URLs still to be scraped.
        :param img_queue: queue that receives ``(img_url, filename)`` tuples.
        """
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Take page URLs off the page queue and parse them until it is empty."""
        while True:
            # A non-blocking get avoids the empty()-then-get() race: with
            # several producers another thread can take the last URL between
            # the check and the get, leaving this thread blocked forever.
            try:
                url = self.page_queue.get_nowait()
            except queue.Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Fetch one listing page and queue every non-GIF image found on it.

        :param url: address of the listing page to scrape.
        :return: None
        """
        response = requests.get(url=url, headers=headers)
        time.sleep(random.randrange(2))
        if response.status_code != 200:
            return

        html = etree.HTML(response.text)
        imgs = html.xpath('//div[@class="random_article"]/div[@class="col-xs-6 col-sm-3"]/img')
        for img in imgs:
            if img.get('class') == 'gif':
                continue
            img_url = img.get('data-original')
            alt = img.get('alt')
            # Skip tags lacking either attribute instead of letting an
            # IndexError terminate the whole thread.
            if img_url is None or alt is None:
                continue
            # Strip punctuation from the alt text before using it as a name.
            name = re.sub(r'[,.\.\+]', '', alt)
            # Reuse the extension of the image URL as the file suffix.
            filename = name + os.path.splitext(img_url)[1]
            self.img_queue.put((img_url, filename))

4. Define a consumer to download the images

class Consumer(threading.Thread):
    """Consumer thread: download the images queued by the producers."""

    # Seconds to wait for a queued image before re-checking the exit condition.
    POLL_TIMEOUT = 1

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared work queues.

        :param page_queue: queue of listing-page URLs (only checked for emptiness).
        :param img_queue: queue of ``(img_url, filename)`` tuples to download.
        """
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download queued images until both queues are empty.

        NOTE: a consumer can still stop early if both queues happen to be
        empty while a producer is in the middle of parsing a page; closing
        that gap needs an explicit completion signal from the producers.
        """
        while True:
            if self.img_queue.empty() and self.page_queue.empty():
                break
            # A bounded wait replaces the blocking get(): if another consumer
            # takes the last item after the check above, this thread loops
            # back to the exit test instead of waiting forever.
            try:
                img_url, filename = self.img_queue.get(timeout=self.POLL_TIMEOUT)
            except queue.Empty:
                continue
            time.sleep(random.randrange(2))
            response = requests.get(url=img_url, headers=headers)
            # Same status check as the producer: never save an error page
            # under an image file name.
            if response.status_code != 200:
                continue
            print(' downloading ==>', img_url, '=== =', filename)
            with open(os.path.join(path, filename), 'wb') as fp:
                fp.write(response.content)

5. Define the main() entry-point function

def create_dir():
    """Create an empty output folder at ``path``.

    An existing folder is removed together with its contents before the
    new one is created.

    :return: None
    """
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)

def main():
    """Prepare the output folder, fill the page queue and start the threads.

    :return: None
    """
    # Create the output folder first.
    create_dir()

    page_queue = queue.Queue(100)
    img_queue = queue.Queue(1000)

    # Queue listing pages 1-5; range() excludes its stop value, so the
    # stop must be 6 for page 5 to be included.
    for page in range(1, 6):
        url = 'http://www.doutula.com/article/list/?page={0}'.format(page)
        page_queue.put(url)

    # Start 5 producer and 5 consumer threads.
    for _ in range(5):
        c = Consumer(page_queue, img_queue)
        p = Procuder(page_queue, img_queue)
        c.start()
        p.start()

# Run the scraper only when the file is executed as a script.
if __name__ == '__main__':
    main()

II. Complete code

import os

import time

import random

import shutil

import re

import queue

import threading

import requests

from lxml import etree

# Folder that receives the downloaded images; it sits in the script's directory.
path = os.path.join(os.path.dirname(__file__), 'doutula')

# Request headers passed to every requests.get() call so the scraper
# identifies itself as a desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
}

class Procuder(threading.Thread):
    """Producer thread: scrape listing pages and queue the image URLs found.

    The class name is spelled as in the original code because callers
    refer to it by this name.
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared work queues.

        :param page_queue: queue of listing-page URLs still to be scraped.
        :param img_queue: queue that receives ``(img_url, filename)`` tuples.
        """
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Take page URLs off the page queue and parse them until it is empty."""
        while True:
            # A non-blocking get avoids the empty()-then-get() race: with
            # several producers another thread can take the last URL between
            # the check and the get, leaving this thread blocked forever.
            try:
                url = self.page_queue.get_nowait()
            except queue.Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Fetch one listing page and queue every non-GIF image found on it.

        :param url: address of the listing page to scrape.
        :return: None
        """
        response = requests.get(url=url, headers=headers)
        time.sleep(random.randrange(2))
        if response.status_code != 200:
            return

        html = etree.HTML(response.text)
        imgs = html.xpath('//div[@class="random_article"]/div[@class="col-xs-6 col-sm-3"]/img')
        for img in imgs:
            if img.get('class') == 'gif':
                continue
            img_url = img.get('data-original')
            alt = img.get('alt')
            # Skip tags lacking either attribute instead of letting an
            # IndexError terminate the whole thread.
            if img_url is None or alt is None:
                continue
            # Strip punctuation from the alt text before using it as a name.
            name = re.sub(r'[,.\.\+]', '', alt)
            # Reuse the extension of the image URL as the file suffix.
            filename = name + os.path.splitext(img_url)[1]
            self.img_queue.put((img_url, filename))

class Consumer(threading.Thread):
    """Consumer thread: download the images queued by the producers."""

    # Seconds to wait for a queued image before re-checking the exit condition.
    POLL_TIMEOUT = 1

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared work queues.

        :param page_queue: queue of listing-page URLs (only checked for emptiness).
        :param img_queue: queue of ``(img_url, filename)`` tuples to download.
        """
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download queued images until both queues are empty.

        NOTE: a consumer can still stop early if both queues happen to be
        empty while a producer is in the middle of parsing a page; closing
        that gap needs an explicit completion signal from the producers.
        """
        while True:
            if self.img_queue.empty() and self.page_queue.empty():
                break
            # A bounded wait replaces the blocking get(): if another consumer
            # takes the last item after the check above, this thread loops
            # back to the exit test instead of waiting forever.
            try:
                img_url, filename = self.img_queue.get(timeout=self.POLL_TIMEOUT)
            except queue.Empty:
                continue
            time.sleep(random.randrange(2))
            response = requests.get(url=img_url, headers=headers)
            # Same status check as the producer: never save an error page
            # under an image file name.
            if response.status_code != 200:
                continue
            print(' downloading ==>', img_url, '=== =', filename)
            with open(os.path.join(path, filename), 'wb') as fp:
                fp.write(response.content)

def create_dir():
    """Create an empty output folder at ``path``.

    An existing folder is removed together with its contents before the
    new one is created.

    :return: None
    """
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)

def main():
    """Prepare the output folder, fill the page queue and start the threads.

    :return: None
    """
    # Create the output folder first.
    create_dir()

    page_queue = queue.Queue(100)
    img_queue = queue.Queue(1000)

    # Queue listing pages 1-5; range() excludes its stop value, so the
    # stop must be 6 for page 5 to be included.
    for page in range(1, 6):
        url = 'http://www.doutula.com/article/list/?page={0}'.format(page)
        page_queue.put(url)

    # Start 5 producer and 5 consumer threads.
    for _ in range(5):
        c = Consumer(page_queue, img_queue)
        p = Procuder(page_queue, img_queue)
        c.start()
        p.start()

# Run the scraper only when the file is executed as a script.
if __name__ == '__main__':
    main()