1. Douban top250

This is the third day of my participation in the Gwen Challenge.

# douban top250
# Get movie titles, reviews, and images
import requests
from lxml import etree


class DouBan:
    """Scraper for a single page of the Douban Top 250 movie list.

    An instance is bound to one page URL. ``run`` downloads that page,
    saves the raw HTML to ``douban.html`` and prints the extracted data.
    """

    def __init__(self, url):
        """Store the page URL and the request headers.

        Args:
            url: Address of the list page to download.
        """
        self.url = url
        # A well-formed browser User-Agent; the header is sent with every
        # request made by ``parse_url``.
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/80.0.3987.149 Safari/537.36'
            ),
        }

    def parse_url(self):
        """Download the page and return its source as text.

        Returns:
            The response body decoded as UTF-8.

        Raises:
            requests.RequestException: On connection failure, timeout, or a
                non-success HTTP status.
        """
        # The timeout keeps a stalled connection from hanging the script.
        response = requests.get(url=self.url, headers=self.headers, timeout=10)
        # Fail loudly on an error status instead of parsing an error page.
        response.raise_for_status()
        return response.content.decode()

    def parse_str(self, sound_code):
        """Extract title, review quote and poster URL for each list entry.

        Args:
            sound_code: HTML source of a list page.

        Returns:
            A flat list. Every kept entry contributes three consecutive
            items, each being the list of strings returned by the matching
            XPath query: titles, review quote, image ``src`` values.
            Entries without a review quote are skipped.
        """
        html = etree.HTML(sound_code)
        li_list = html.xpath('//*[@id="content"]/div/div[1]/ol/li')
        item = []
        for li in li_list:
            film_review = li.xpath('div/div[2]/div[2]/p[2]/span/text()')
            # xpath() returns a list (never None), so a missing review shows
            # up as an empty list.
            if not film_review:
                continue
            title = li.xpath('div/div[2]/div[1]/a/span[1]/text()')
            img = li.xpath('div/div[1]/a/img/@src')
            item.extend((title, film_review, img))
        return item

    def save_html(self, content_list):
        """Save the source code of the page to ``douban.html``.

        Args:
            content_list: Text to write; the file is overwritten.
        """
        with open("douban.html", "w", encoding="utf-8") as f:
            f.write(content_list)

    def run(self):
        """Download the page, save its source, then print the parsed data."""
        sound_code = self.parse_url()
        self.save_html(sound_code)
        content_list = self.parse_str(sound_code)
        print(content_list)


if __name__ == '__main__':
    # Request one page per start offset: 0, 25, ..., 225 (ten pages in total).
    for i in range(0, 226, 25):
        url = "https://movie.douban.com/top250?start=" + str(i)
        spider = DouBan(url)
        spider.run()
Copy the code