1. Import the extension libraries required by the project
# -*- coding: UTF-8 -*-

# contextlib provides closing() for objects that lack context-manager support
import contextlib
# urllib encodes the query parameters
import urllib
# urllib2 performs the actual page requests
import urllib2

# UserAgent generates random User-Agent strings
from fake_useragent import UserAgent
Copy the code
2. Execute the page request function
 1Execute the web request
 2def req_url(self,full_url):
 3    headers = {
 4        Generate a random user-agent
 5        'User-Agent': self.user_agent.random
 6    }
 7    Construct Request object
 8    request = urllib2.Request(headers=headers, url=full_url)
 9    # execute request
10    response = urllib2.urlopen(request)
11    return response.read()
Copy the code
3. Save the HTML source code downloaded from the web page
def save_doc(self, html_doc, file_name):
    """Save the crawled HTML source to ``file_name``.

    :param html_doc: page content to write (the raw body returned by the
        request).
    :param file_name: path of the file to create or overwrite.
    """
    # Single-argument print() produces the same output on Python 2 and 3.
    print("Start saving file : %s" % (file_name,))
    # Binary mode writes the downloaded bytes unchanged; text mode would
    # rewrite newlines on Windows.
    with open(file_name, 'wb') as f:
        f.write(html_doc)
    print("Complete file : %s Save" % (file_name,))
Copy the code
4. Assemble the complete crawler URL, including its query parameters
 1Construct the crawler environment and execute it
 2def run_spider(self):
 3    for page in range(self.begin, self.end + 1) :4        # Count pages
 5        pn = (page - 1) * 50
 6        # Encode Chinese parameters
 7        name = urllib.urlencode({'kw': self.tieba_name})
 8        # Assemble the complete crawler address
 9        full_url = self.url + name + '&pn=' + str(pn)
10        Define the file name according to the number of pages
11        file_name = str(page) + '.html'
12        Execute crawler web request
13        html_doc = self.req_url(full_url)
14        # save file
15        self.save_doc(html_doc, file_name)
Copy the code
5. User-defined input parameters
# The user enters the relevant data: the base URL is fixed, while the bar
# name and the first/last page numbers are read from standard input.
url = 'http://tieba.baidu.com/f?'
tieba_name = raw_input('Please enter the name of the post bar you want to climb:')
# int() raises ValueError if the entered page number is not numeric
begin = int(raw_input('Please enter the start page number:'))
end = int(raw_input('Please enter closing page number:'))
Copy the code

Improving your own skills is what matters most. To get the post bar crawler source code, follow the public account and reply “post bar crawler source code”, then go and practice with it ~