1. Douban top250

This is the third day of my participation in the Gwen Challenge.

# douban top250
# Get movie titles, reviews, and images
import requests
from lxml import etree


class DouBan(object):
    """Fetch one page of the Douban Top 250 list and extract its entries.

    For every list item on the page the spider collects the movie title,
    the one-line review and the poster image URL.
    """

    def __init__(self, url):
        """Remember the page address and prepare the request headers.

        Args:
            url: Address of the page to download.
        """
        self.url = url
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/80.0.3987.149 Safari/537.36'}

    def parse_url(self):
        """Download the page and return its source decoded as UTF-8 text."""
        # A timeout keeps a stalled connection from blocking the run forever.
        response = requests.get(
            url=self.url, headers=self.headers, timeout=10)
        return response.content.decode()

    def parse_str(self, sound_code):
        """Extract title, review and image URL for every movie on the page.

        Args:
            sound_code: HTML source of the page as a string.

        Returns:
            A flat list in which each kept movie contributes three
            consecutive elements: the title matches, the review matches and
            the image ``src`` matches (each one a list of strings).
        """
        html = etree.HTML(sound_code)
        li_list = html.xpath('//*[@id="content"]/div/div[1]/ol/li')
        item = []
        for li in li_list:
            title = li.xpath('div/div[2]/div[1]/a/span[1]/text()')
            film_review = li.xpath('div/div[2]/div[2]/p[2]/span/text()')
            # xpath() yields an empty list (never None) when nothing matches,
            # so test for emptiness to skip entries that have no review.
            if not film_review:
                continue
            img = li.xpath('div/div[1]/a/img/@src')
            item.extend([title, film_review, img])
        return item

    def save_html(self, content_list):
        """Write the page source to ``douban.html`` in the working directory.

        Args:
            content_list: Text to write; an existing file is overwritten.
        """
        with open("douban.html", "w", encoding="utf-8") as f:
            f.write(content_list)

    def run(self):
        """Download the page, save its source, then print the parsed data."""
        sound_code = self.parse_url()
        self.save_html(sound_code)
        content_list = self.parse_str(sound_code)
        print(content_list)


if __name__ == '__main__':
    # Walk the "start" offsets 0, 25, ..., 225: ten pages in total
    # (presumably 25 entries per page -- confirm against the site).
    for i in range(0, 226, 25):
        url = "https://movie.douban.com/top250?start=" + str(i)
        spider = DouBan(url)
        spider.run()
Copy the code

mo4tech.com (Moment For Technology) is a global community where thousands of techies from across the globe hang out! Passionate technologists, be they gadget freaks, tech enthusiasts, coders, technopreneurs, or CIOs, you will find them all here.

01 Crawler instance – Douban Top250

1. Douban top250

01 Crawler instance – Douban Top250

1. Douban top250

Related Posts

[Spring] BeanDefinitionRegistry – BeanDefinition registration

Distributed cluster architecture scenario-based solution

Clean up the LeetCode series of circular lists with Java brush questions