Preface

I’d better get some solid work done today; I also haven’t practiced any algorithm problems this week yet.

The target site

Analysis

OK, now that the target site is identified, how do we analyze it for crawling? Our goal is to crawl N pages of the image list, follow each image's detail page, and download the full-size picture.

Paging

Clicking to the next page, we notice that the URL changes to https://pic.netbian.com/index_2.html, and the pattern is regular, so we can extract the template https://pic.netbian.com/index_{}.html directly.
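As a quick sanity check (not part of the project code), formatting that template with the page number reproduces the URLs we see in the browser. Note that page 1 is plain index.html, which is why the spider below starts there and only uses the template from page 2 onward:

# quick check of the paging pattern: page 1 is index.html, later pages are index_N.html
base_url = "https://pic.netbian.com/index_{}.html"
for page in range(2, 5):
    print(base_url.format(page))
# https://pic.netbian.com/index_2.html
# https://pic.netbian.com/index_3.html
# https://pic.netbian.com/index_4.html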

Getting the page images

We find all the image entries on the list page and click into one of them.

Inside there is another layer: a detail page.

And that detail page holds the full-size picture we want.
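Before writing the spider, you can verify the relevant XPaths in a scrapy shell session. This is only a hypothetical check, but the XPaths are the same ones used in the spider below:

# started with:  scrapy shell "https://pic.netbian.com/index.html"
lis = response.xpath('//*[@id="main"]/div[3]/ul/li')    # one <li> per thumbnail
lis[0].xpath('./a/@href').extract_first()               # relative link to the detail page
lis[0].xpath('./a/img/@alt').extract_first()            # the image title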

Coding

Now that we understand the process, we can just code it.

The project structure

That’s it; it’s very simple. First up is our spider.

Writing the crawler

import scrapy
from dome.items import DomeItem

class FirstSpider(scrapy.Spider):
    name = 'first'
    baseUrl = "https://pic.netbian.com/index_{}.html"
    path_index = 2
    max_page = 10  # we crawl at most 10 pages of data
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://pic.netbian.com/index.html']  # page 1 has no index suffix
    domains = "https://pic.netbian.com"

    def parse(self, response):
        # every <li> in the list is one thumbnail entry
        li_images = response.xpath('//*[@id="main"]/div[3]/ul/li')

        for li in li_images:
            link_image = self.domains + li.xpath("./a/@href").extract_first()
            image_name = li.xpath("./a/@title | ./a/img/@alt").extract_first()
            print(image_name)
            # follow the detail (sub) page of this image
            item = DomeItem()
            item['image_name'] = image_name
            headers = {'referer': link_image}
            yield scrapy.Request(link_image, callback=self.parse_image, meta={'item': item}, headers=headers)

        # request the next list page until max_page is reached
        if self.path_index <= self.max_page:
            url = self.baseUrl.format(self.path_index)
            self.path_index += 1
            print("Requesting next page:", url)
            yield scrapy.Request(url, callback=self.parse)

    def parse_image(self, response):
        # the detail page holds the full-size image
        item = response.meta['item']
        image_src = self.domains + response.xpath('//*[@id="img"]/img/@src').extract_first()
        item['image_src'] = image_src

        yield item
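You can run it with scrapy crawl first from the project root. If you prefer launching it from a script instead, a minimal sketch would look like this (the dome.spiders.first module path is an assumption about where the spider file lives):

# run.py - hypothetical launcher script, placed next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from dome.spiders.first import FirstSpider  # assumes the spider file is dome/spiders/first.py

process = CrawlerProcess(get_project_settings())  # loads the project's settings.py
process.crawl(FirstSpider)
process.start()  # blocks until the crawl is finished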

Writing the item

There are only two fields in this one: the image name and the image URL.

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class DomeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_name = scrapy.Field()
    image_src = scrapy.Field()

Saving and setting the UA

The item is then handed over to the pipeline, which downloads the image.

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

# open_spider / close_spider hooks are also available here; returning the item
# from item_completed passes it on to the next pipeline, if one is configured.
import scrapy
from scrapy.pipelines.images import ImagesPipeline
class DomePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # request the image data
        yield scrapy.Request(item['image_src'])

    def file_path(self, request, response=None, info=None, *, item=None):
        # save each image under its title
        return item['image_name'] + '.jpg'

    def item_completed(self, results, item, info):
        """Pass the item on to the next pipeline, if there is one."""
        return item
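To make the remark about open_spider / close_spider and return item concrete, here is a hypothetical second pipeline (not part of this project) that logs every downloaded image name; it would need its own ITEM_PIPELINES entry with a number higher than DomePipeline's so it runs afterwards:

# hypothetical extra pipeline, only to illustrate the open_spider / close_spider hooks
# and how "return item" hands the item to the next pipeline in the chain
class LogNamesPipeline:

    def open_spider(self, spider):
        # called once when the spider starts
        self.fp = open('downloaded.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        # called once when the spider closes
        self.fp.close()

    def process_item(self, item, spider):
        self.fp.write(item['image_name'] + '\n')
        return item  # pass the item on to any later pipeline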

Note that the save path (IMAGES_STORE) is set in the settings file; combined with file_path above, each image is saved as <IMAGES_STORE>/<image_name>.jpg.

As for the headers (the User-Agent in particular), I handle those directly in the downloader middleware, although you could also pass them on each request. That is why the USER_AGENT_LIST further down ends up in the settings file.
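As a minimal sketch of that idea (the project's own middleware file is pointed to at the end of this section; the method below is the standard process_request hook of a Scrapy downloader middleware), picking a random UA per request could look like this:

# middlewares.py - sketch only: pick a random User-Agent for every outgoing request
import random

class DomeDownloaderMiddleware:

    def process_request(self, request, spider):
        ua_list = spider.settings.get('USER_AGENT_LIST')  # the list defined in settings.py
        request.headers['User-Agent'] = random.choice(ua_list)
        return None  # let the request continue through the rest of the chain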

With that said, here is my settings file.

# Scrapy settings for dome project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dome'

SPIDER_MODULES = ['dome.spiders']
NEWSPIDER_MODULE = 'dome.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
LOG_LEVEL = 'ERROR'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
IMAGES_STORE = '/'  # directory where the downloaded images are stored
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]


# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'dome.middlewares.DomeSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'dome.middlewares.DomeDownloaderMiddleware': 543,
#}

DOWNLOADER_MIDDLEWARES = {
   'dome.middlewares.DomeDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'dome.pipelines.DomePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Then find your middleware file; that is where the random User-Agent from USER_AGENT_LIST is applied to each request.