1 scrapy

docs.scrapy.org/en/latest/

Generate a Scrapy project

scrapy startproject tutorial
Copy the code

2 Generate spiders

The spider name must be unique

3 Modify the spider content

import scrapy

# address https://www.zhihu.com/people/woodenrobot/posts?page=2
# Run with: scrapy crawl douban_ajax -o douban_ajax.csv

import re
import json


from scrapy import Request
from scrapy.spiders import Spider


class DoubanMovieItem(scrapy.Item):
    """Item for one entry of the Douban movie chart scraped by DoubanAJAXSpider."""
    # Rank of the movie within the chart
    # (original line was garbled as `Rank #`, a bare name -> NameError at import)
    ranking = scrapy.Field()
    # Movie title
    movie_name = scrapy.Field()
    # Rating score
    score = scrapy.Field()
    # Number of votes behind the score
    score_num = scrapy.Field()


class DoubanAJAXSpider(Spider):
    """Crawl Douban's AJAX movie-chart endpoint page by page.

    The endpoint returns a JSON list of movies per page. One
    DoubanMovieItem is yielded per movie, then the next page is requested
    by advancing the ``start`` query parameter by the page size (20).
    An empty JSON list marks the end of the chart and stops pagination.
    """

    name = 'douban_ajax'
    # Browser-like User-Agent so Douban serves the AJAX endpoint.
    # (Original value was translation-garbled: "the Mozilla / 5.0 ... X64".)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    }

    def start_requests(self):
        """Start the crawl at the first page (start=0, limit=20)."""
        url = ('https://movie.douban.com/j/chart/top_list'
               '?type=5&interval_id=100%3A90&action=&start=0&limit=20')
        yield Request(url, headers=self.headers)

    def parse(self, response):
        """Parse one JSON page: yield one item per movie, then follow pagination."""
        datas = json.loads(response.body)
        if not datas:
            # Empty list: ran past the last page, stop following.
            return
        for data in datas:
            # Fresh item per movie — the original reused one shared item
            # instance for every yield, so downstream consumers holding a
            # reference would all see the last movie's fields.
            item = DoubanMovieItem()
            item['ranking'] = data['rank']
            item['movie_name'] = data['title']
            item['score'] = data['score']
            item['score_num'] = data['vote_count']
            yield item
        # Bump `start` by the page size to build the next page's URL.
        # (Original had a bare prose sentence here -> SyntaxError.)
        page_num = re.search(r'start=(\d+)', response.url).group(1)
        next_start = 'start=' + str(int(page_num) + 20)
        next_url = re.sub(r'start=\d+', next_start, response.url)
        yield Request(next_url, headers=self.headers)
Copy the code

4 Running Commands

scrapy crawl douban_ajax -o douban_ajax.csv