Project introduction

Given the URL of a Tencent Video movie listing and the number of pages to crawl, this project scrapes the movie data from each page.

The project address

Making the address

Libraries used

  • Puppeteer – drives a headless browser to fetch the page data
  • Chalk – colors the console.log output
  • File-box – packages images for wechaty to send
  • Fs – writes files

The basic idea

  • Capture Tencent Video movie data
  • Format the printed progress information
  • Open the link you want to crawl
  • Sleep for a few seconds before crawling the next page

Install the modules

# Install puppeteer
npm install puppeteer --save

# Optional: point the Puppeteer browser download at the Taobao mirror, then install again
npm config set puppeteer_download_host https://npm.taobao.org/mirrors
npm install puppeteer --save

# Install chalk
npm install chalk --save

The final code

const puppeteer = require('puppeteer');
const chalk = require('chalk');
const fs = require('fs');

/**
 * Delays execution for a given amount of time.
 *
 * @param {number} time - Delay in milliseconds, passed straight to `setTimeout`.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
const sleep = (time) => new Promise((resolve) => {
    setTimeout(resolve, time);
});

// Shorthand for console.log
const log = console.log;
// The number of pages to crawl
const TOTAL_PAGE = 150;

// The listing page to crawl (the previous listing URL is kept below for reference)
// const url = `https://v.qq.com/x/list/movie?itype=-1&offset=0`;
const url = `https://v.qq.com/channel/movie?listpage=1&channel=movie&itype=100062`;

/**
 * Formats the crawl progress as a fixed-width text progress bar.
 *
 * The bar is 40 characters wide: `=` marks completed work and `-` marks the
 * remainder. Progress is clamped to the 0–100% range so an out-of-range
 * `current` can never produce a negative repeat count.
 *
 * @param {number} current - Number of pages processed so far.
 * @param {number} [total=TOTAL_PAGE] - Total number of pages to process.
 * @returns {string} For example `Current progress: [====----]  50.00%`.
 */
function formatProgress(current, total = TOTAL_PAGE) {
    const BAR_WIDTH = 40;
    // Guard against division by zero and clamp the ratio to [0, 1].
    const ratio = total > 0 ? Math.min(Math.max(current / total, 0), 1) : 0;
    const done = Math.floor(ratio * BAR_WIDTH);
    const left = BAR_WIDTH - done;
    // Two decimals keep the output stable (e.g. 0.67 instead of 0.6666666666666667).
    const percent = (ratio * 100).toFixed(2);
    return `Current progress: [${'='.repeat(done)}${'-'.repeat(left)}]  ${percent}%`;
}

/**
 * Script entry point.
 *
 * Launches a browser, opens `url`, and walks through up to `TOTAL_PAGE`
 * listing pages. For every page it extracts the movie cards and appends them
 * to `./movie.json`, then clicks the `.page_next` button to move on. The
 * browser is always closed and the process always exits: with code 0 after a
 * normal run, with code 1 if anything threw.
 */
(async () => {
    // Pause used after each page turn and after each extraction
    // (the same total as the original 3000 ms sleep + 2500 ms wait).
    const PAGE_SETTLE_MS = 5500;
    const OUTPUT_FILE = './movie.json';

    /**
     * Extracts every `.list_item` card from the page currently displayed and
     * appends the result to OUTPUT_FILE.
     *
     * NOTE(review): each call appends one JSON array, so after several pages
     * the file is a concatenation of arrays rather than a single JSON
     * document — confirm that downstream consumers expect this.
     *
     * @param {object} page - The Puppeteer page to read from.
     * @returns {Promise<void>} Resolves once the data has been written.
     */
    const handleData = async (page) => {
        // Runs inside the browser context; only serializable data comes back.
        const result = await page.evaluate(() => {
            const $ = window.$; // jQuery as exposed by the page itself
            const links = []; // Collected movie records
            $('.list_item').each((index, item) => {
                const card = $(item);
                const figure = card.find('.figure');
                const picture = card.find('.figure_pic');
                links.push({
                    vid: figure.data('float'), // id
                    title: picture.attr('alt'), // Movie title
                    count: card.find('.figure_count').text(), // Play count
                    star: card.find('.figure_desc').attr('title'), // Starring
                    poster: picture.attr('src'), // Cover image
                    link: figure.attr('href'), // Link address
                });
            });
            return links;
        });

        // Awaited so the write is finished before the process is allowed to exit.
        await fs.promises.writeFile(OUTPUT_FILE, JSON.stringify(result, null, '\t'), {
            flag: 'a',
        });
        log(chalk.yellow('Write data complete'));
    };

    let browser;
    let exitCode = 0;

    try {
        // Start the browser environment
        browser = await puppeteer.launch({
            // headless: false,
            // slowMo: 250
        });
        log(chalk.green('Services up and running'));

        const page = await browser.newPage(); // Open a new page

        // Forward console messages emitted inside the page to our own console.
        page.on('console', (message) => {
            log(chalk.blue(message.text()));
        });

        // Open the link to crawl
        await page.goto(url, {
            waitUntil: 'networkidle2', // Treat the load as complete once the network is (almost) idle
        });

        log(chalk.yellow('Page first loaded'));
        await sleep(3000);

        for (let i = 1; i <= TOTAL_PAGE; i++) {
            console.clear();
            // Print the current crawl progress
            log(chalk.yellow(formatProgress(i)));
            log(chalk.yellow('Page data loaded'));

            // Extract the page that is currently displayed *before* moving on,
            // so the first page is not skipped.
            await handleData(page);

            if (i === TOTAL_PAGE) {
                break;
            }

            // Take a short break after a page has been crawled
            await sleep(PAGE_SETTLE_MS);

            const submit = await page.$('.page_next'); // Get the next-page button
            if (!submit) {
                // No further pages: stop early, but still shut down cleanly.
                log(chalk.red('Data acquisition completed'));
                break;
            }

            await submit.click(); // Simulate clicking to jump to the next page
            await sleep(PAGE_SETTLE_MS); // Wait for the next page to load
        }

        log(chalk.green('Normal end of service'));
    } catch (error) {
        exitCode = 1;
        console.log(error);
        log(chalk.red('Unexpected Termination of Service'));
    } finally {
        if (browser) {
            try {
                await browser.close();
            } catch (closeError) {
                // Closing is best-effort; report it but still exit.
                console.log(closeError);
            }
        }
        process.exit(exitCode);
    }
})();

This is my first write-up, so please forgive any rough edges; I will keep working to improve it.