To change the IP address and User-Agent used by the Scrapy framework, you need to modify the downloader middleware.

1. User-Agent

pip install fake-useragent

Usage:




    from fake_useragent import UserAgent

    ua = UserAgent()
    print(ua.ie)

Running result:


    Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)

The example above uses Internet Explorer; other browsers are also available:

    ua.ie
    # Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)
    ua.msie
    # Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)
    ua['Internet Explorer']
    # Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)
    ua.opera
    # Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11
    ua.chrome
    # Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2
    ua.google
    # Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/537.13 (KHTML, like Gecko) Chrome/24.0.1290.1 Safari/537.13
    ua['google chrome']
    # Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11
    ua.firefox
    # Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1
    ua.ff
    # Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1
    ua.safari
    # Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25

    # and the best one, random via real-world browser usage statistics
    ua.random

    # refresh the cached browser data
    ua.update()
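Note that fake-useragent downloads its browser statistics over the network and can fail at startup if that source is unreachable. A minimal stdlib fallback is to rotate over a hardcoded pool; the pool contents and helper name below are illustrative, not part of the library:

```python
import random

# Illustrative static pool used as a fallback when fake-useragent's
# online data source is unreachable (the strings here are examples only).
USER_AGENT_POOL = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 '
    '(KHTML, like Gecko) Version/14.0 Safari/605.1.15',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0',
]


def random_user_agent():
    """Pick one User-Agent string at random from the static pool."""
    return random.choice(USER_AGENT_POOL)


print(random_user_agent())
```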

(2) To set the User-Agent, add the following code to middlewares.py:




    from fake_useragent import UserAgent


    class UserAgentMiddleware(object):

        def __init__(self, user_agent=''):
            # print('==UserAgentMiddleware init==')
            self.ua = UserAgent()

        def process_request(self, request, spider):
            # print('==UserAgentMiddleware process_request==')
            if self.ua:
                # print('********Current UserAgent************')
                user_agent = self.ua.random  # pick once so the printed value matches the header
                print(user_agent)
                request.headers.setdefault('User-Agent', user_agent)
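Note that `setdefault` only assigns the header when none is present yet, so a User-Agent already set elsewhere (for example by the spider) wins. The plain-dict analogue shows the behaviour:

```python
headers = {}
headers.setdefault('User-Agent', 'first-ua')
headers.setdefault('User-Agent', 'second-ua')  # ignored: key already present
print(headers['User-Agent'])  # first-ua
```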





Then enable the middleware via DOWNLOADER_MIDDLEWARES in the settings.py configuration file:




    # Original SPIDER_MIDDLEWARES
    # DOWNLOADER_MIDDLEWARES = {
    #     'xie.middlewares.XieSpiderMiddleware': 543,
    # }

    DOWNLOADER_MIDDLEWARES = {
        # 'xie.middlewares.XieSpiderMiddleware': 543,
        'xie.middlewares.UserAgentMiddleware': 200,
    }





The code above is adapted from the user-agent middleware shipped with Scrapy; see the source at site-packages/scrapy/downloadermiddlewares/useragent.py.

2. Proxy IP address

Setting a proxy IP works much like setting the User-Agent: define a proxy middleware in middlewares.py.




    import base64
    import random

    from scrapy import signals


    class RandomProxyMiddleware(object):

        def __init__(self):
            self.PROXIES = [
                {'ip_port': '111.8.60.9:8123', 'user_passwd': 'user1:pass1'},
                {'ip_port': '101.71.27.120:80', 'user_passwd': 'user2:pass2'},
                {'ip_port': '122.96.59.104:80', 'user_passwd': 'user3:pass3'},
                {'ip_port': '122.224.249.122:8088', 'user_passwd': 'user4:pass4'},
            ]

        def process_request(self, request, spider):
            proxy = random.choice(self.PROXIES)
            if proxy['user_passwd'] is None:
                # Proxy without account authentication
                request.meta['proxy'] = 'http://' + proxy['ip_port']
            else:
                # Base64-encode the account credentials (b64encode expects bytes)
                base64_userpasswd = base64.b64encode(
                    proxy['user_passwd'].encode('utf-8')).decode('ascii')
                request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
                request.meta['proxy'] = 'http://' + proxy['ip_port']
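The Proxy-Authorization value is simply the Base64 encoding of the `user:password` pair, which can be checked in isolation with the stdlib:

```python
import base64

creds = 'user1:pass1'  # example credentials from the list above
token = base64.b64encode(creds.encode('utf-8')).decode('ascii')
header_value = 'Basic ' + token
print(header_value)  # Basic dXNlcjE6cGFzczE=
```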





Then register it in DOWNLOADER_MIDDLEWARES in the settings.py configuration file:




    DOWNLOADER_MIDDLEWARES = {
        # 'xie.middlewares.XieSpiderMiddleware': 543,
        'xie.middlewares.UserAgentMiddleware': 200,
        'xie.middlewares.RandomProxyMiddleware': 300,
    }





3. Getting dynamic pages




    from scrapy import signals
    from scrapy.http import HtmlResponse
    from selenium import webdriver


    class WebDriverMiddleware(object):

        @classmethod
        def from_crawler(cls, crawler):
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s

        def process_request(self, request, spider):
            # Render the page with a headless browser
            print('================process_request================')
            # Note: PhantomJS is deprecated in recent Selenium releases;
            # headless Chrome or Firefox can be substituted.
            browser = webdriver.PhantomJS()
            browser.get(request.url)        # load the page
            data = browser.page_source      # get the rendered page source
            data = data.encode('utf-8')
            browser.quit()
            return HtmlResponse(request.url, body=data, encoding='utf-8', request=request)

        def process_response(self, request, response, spider):
            return response

        def process_exception(self, request, exception, spider):
            pass

        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)





Then register it in DOWNLOADER_MIDDLEWARES in the settings.py configuration file:




    DOWNLOADER_MIDDLEWARES = {
        # 'xie.middlewares.XieSpiderMiddleware': 543,
        'xie.middlewares.UserAgentMiddleware': 200,
        'xie.middlewares.RandomProxyMiddleware': 300,
        'xie.middlewares.WebDriverMiddleware': 400,
    }





Sina Blog is a dynamic page; to get all of the information on it, you need to scroll the page down…
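A common way to do that scrolling is to execute window.scrollTo through the WebDriver until the document height stops growing. A sketch (the helper name and loop bound are illustrative; `browser` is any Selenium driver):

```python
import time


def scroll_to_bottom(browser, pause=1.0, max_rounds=20):
    """Scroll a page down until its height stops growing (or max_rounds is hit).

    `browser` is any object exposing Selenium's execute_script().
    """
    last_height = browser.execute_script('return document.body.scrollHeight')
    for _ in range(max_rounds):
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(pause)  # give the page time to load new content
        new_height = browser.execute_script('return document.body.scrollHeight')
        if new_height == last_height:
            break
        last_height = new_height
```

This would be called between `browser.get(request.url)` and `browser.page_source` in the middleware above.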

4. Disable cookies

In some cases this prevents sites from detecting and blocking the crawler based on its cookies.

COOKIES_ENABLED = False
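Disabling cookies is often combined with other politeness settings in settings.py; an illustrative fragment (the values are examples, not recommendations):

```python
# settings.py -- illustrative anti-ban settings
COOKIES_ENABLED = False   # do not send or store cookies
DOWNLOAD_DELAY = 2        # seconds to wait between requests
ROBOTSTXT_OBEY = True     # respect robots.txt
```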
