The filtering process

1. When the Engine receives a request and hands it to the Scheduler, it first checks whether the request should be filtered

# Source location: scrapy/core/engine.py - ExecutionEngine
class ExecutionEngine(object):

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request, spider=spider)
        # Call the scheduler's enqueue_request method to make the decision, see step 2
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request, spider=spider)
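As a side note, a spider can opt out of this check per request by setting dont_filter=True when creating the Request. The sketch below is a minimal, hypothetical example (spider name and URLs are made up) assuming a default Scrapy project:

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://example.com/']

    def parse(self, response):
        # First request for this URL: its fingerprint has not been seen, so it is enqueued
        yield scrapy.Request('http://example.com/page', callback=self.parse_page)
        # Same URL again: request_seen() returns True and the request is dropped
        yield scrapy.Request('http://example.com/page', callback=self.parse_page)
        # dont_filter=True bypasses the duplicate check in enqueue_request,
        # so this one is enqueued even though the fingerprint has been seen
        yield scrapy.Request('http://example.com/page', callback=self.parse_page,
                             dont_filter=True)

    def parse_page(self, response):
        self.logger.info('fetched %s', response.url)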

2. Whether the current request is filtered is decided by the Scheduler's enqueue_request method

# Source location: scrapy/core/scheduler.py - Scheduler
class Scheduler(object):

    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
                 logunser=False, stats=None, pqclass=None):
        # dupefilter is the concrete duplicate filter, see step 3
        self.df = dupefilter
        self.dqdir = self._dqdir(jobdir)
        self.pqclass = pqclass
        self.dqclass = dqclass
        self.mqclass = mqclass
        self.logunser = logunser
        self.stats = stats

    ...

    def enqueue_request(self, request):
        # self.df.request_seen is the filter method that performs the dedup check.
        # If the request does not set dont_filter and the filter has already seen it,
        # log the duplicate and return False (the request is dropped)
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False
        if self.stats:
            self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
        self.queue.push(request)
        return True
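Which filter self.df points to is configurable through the project settings. A minimal sketch of the relevant settings.py entries (these setting names exist in Scrapy; the values shown are only an example configuration):

# settings.py
DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'  # the default filter, see step 3
DUPEFILTER_DEBUG = True     # log every filtered duplicate, not only the first one
JOBDIR = 'crawls/job-1'     # persist fingerprints to <JOBDIR>/requests.seen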

3. Scrapy's duplicate filters are defined in scrapy/dupefilters.py

# Base class for duplicate filters; it defines the methods a filter needs to implement
class BaseDupeFilter(object):

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        return False

    def open(self):  # can return deferred
        pass

    def close(self, reason):  # can return a deferred
        pass

    def log(self, request, spider):  # log that a request has been filtered
        pass


# Scrapy's default duplicate filter
class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None, debug=False):
        self.file = None
        # The set of fingerprints; a set stores each fingerprint only once
        self.fingerprints = set()
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        # If a JOBDIR path is configured, load previously seen fingerprints from requests.seen
        if path:
            self.file = open(os.path.join(path, 'requests.seen'), 'a+')
            self.file.seek(0)
            self.fingerprints.update(x.rstrip() for x in self.file)

    @classmethod
    def from_settings(cls, settings):
        # Setting DUPEFILTER_DEBUG to True in settings.py enables debug logging of duplicates
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(job_dir(settings), debug)

    def request_seen(self, request):
        # Generate a fingerprint for the request
        fp = self.request_fingerprint(request)
        # Check whether the fingerprint is already in the set
        if fp in self.fingerprints:
            # Return True: the request is a duplicate and should be filtered
            return True
        # Otherwise add it to the set
        self.fingerprints.add(fp)
        # If the jobdir file exists, append the fingerprint to it
        if self.file:
            self.file.write(fp + os.linesep)

    # request_fingerprint is defined in scrapy.utils.request;
    # it uses the SHA1 algorithm to generate a fixed-length hash for each request
    def request_fingerprint(self, request):
        return request_fingerprint(request)

    # Close the fingerprint file when the filter is closed
    def close(self, reason):
        if self.file:
            self.file.close()

    # Logging wrapper for filtered (duplicate) requests
    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
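To see what request_seen actually compares, the fingerprints can be computed directly. A minimal sketch, assuming a Scrapy version where scrapy.utils.request.request_fingerprint is available (as referenced above); it canonicalizes the URL, so for example query-parameter order should not change the hash:

from scrapy import Request
from scrapy.utils.request import request_fingerprint

r1 = Request('http://example.com/item?id=1&page=2')
r2 = Request('http://example.com/item?page=2&id=1')  # same URL, parameters reordered

fp1 = request_fingerprint(r1)
fp2 = request_fingerprint(r2)

print(fp1)           # a 40-character SHA1 hex digest
print(fp1 == fp2)    # expected: True, because the canonicalized URLs match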

To summarize: the Engine's schedule method calls the Scheduler's enqueue_request, which in turn calls the filter's request_seen; a request whose fingerprint has already been seen (and which does not set dont_filter) is dropped and never enqueued.
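Since both the fingerprinting and the dedup check live in the filter class, the behaviour can be customised by subclassing RFPDupeFilter and pointing DUPEFILTER_CLASS at the subclass. The sketch below is a hypothetical example (not part of Scrapy) that ignores a utm_source tracking parameter when computing fingerprints:

from scrapy.dupefilters import RFPDupeFilter
from scrapy.utils.request import request_fingerprint
from w3lib.url import url_query_cleaner


class IgnoreUtmSourceDupeFilter(RFPDupeFilter):
    """Hypothetical filter: two requests that differ only in the
    utm_source query parameter are treated as duplicates."""

    def request_fingerprint(self, request):
        # Strip the tracking parameter before fingerprinting
        cleaned_url = url_query_cleaner(request.url, ('utm_source',), remove=True)
        return request_fingerprint(request.replace(url=cleaned_url))

It would then be enabled with DUPEFILTER_CLASS = 'myproject.dupefilters.IgnoreUtmSourceDupeFilter' (a hypothetical module path).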