motivation

I hope I can grab the questions I am interested in from each major technical forum.

technology

  • puppeteer
  • dotenv

The detailed design

  • Core functions: Puppeteer is used to simulate user behavior in the browser and manipulate DOM to obtain data
  • Quick start: Use Dotenv to write the differences between forums (such as element selectors, etc.) to environment variables, and then configure startup commands in package.json file for quick start

1. Directory structure

|-- env
     |-- csdn.env
     |-- segmentfault.env
|-- index.js
Copy the code

2. Code design

The code logic is very simple, and the main process is as follows:

  1. Open a browser
  2. Create a new page
  3. Jump to the target site
  4. To get the data
  5. Printing data to the console (or writing it to the database)
  6. Close the browser

Because puppeteer can easily emulate user behavior in a browser, the core of DEMO is how to retrieve data, or how to implement the get_news method.

const puppeteer = require("puppeteer");

// Minimal scraping skeleton: open browser -> open page -> navigate ->
// extract data in the page context -> print -> close.
(async () => {
  // 1. Open browser
  const browser = await puppeteer.launch({});
  // 2. Create a new page
  const page = await browser.newPage();
  // 3. Go to the target website
  // NOTE(review): `url` must be defined/supplied by the caller (e.g. from an
  // env variable) — it is a placeholder in this demo snippet.
  await page.goto(url, { waitUntil: "networkidle2" });
  // 4. Get data — get_news runs inside the page context via page.evaluate
  let data = await page.evaluate(get_news);
  // 5. Print out the data in the console
  console.log(data);
  // 6. Close browser
  await browser.close();
})();

// Getting news (executed in the browser page, not in Node)
function get_news() {
  // To do something to get news
}
Copy the code

Take CSDN as an example. As shown in the figure below, we can easily find elements on the page with the native DOM API `document.querySelector(selector)` or the jQuery equivalent `$(selector)`, and from those elements extract the page information we need. The `page.evaluate` method in Puppeteer runs the supplied function inside the page context, so both native DOM manipulation and jQuery DOM manipulation (if the page loads jQuery) are available, which makes it easy to get page data. The specific code is shown below.

// Collect title/link/date for every forum entry on the current page.
// Runs inside the page context (via page.evaluate) where jQuery's `$` exists.
// Returns an array of { title, link, date } plain objects.
function get_news() {
  let result = [];
  let titles = $(".forums_title");
  let dates = $(".forums_author em");
  // jQuery .each passes (index, element); we only collect side effects,
  // so .each is the right iteration method (not .map).
  titles.each((i, title) => {
    result.push({
      // .text/.href are properties of the matched <a> elements
      title: title.text,
      link: title.href,
      date: dates[i].textContent
    });
  });
  return result;
}
Copy the code

Now run the code to print out captured web page data in the console, as shown below. You can also write data to the database.

3. More details

There are a lot of details in the code, because there are detailed comments I will not expand, interested partners can read the code behind. The main technical details are as follows:

  • How to configure environment variables with Dotenv
  • How to debug code passed to puppeteer's `page.evaluate` via console events
  • Question validity time, keyword check
  • How to configure the quick start command in package.json file

reference

  • Puppeteer API
  • puppeteer-examples

Afterword.

I am still a small white on the front end, the code quality may not be high, if there is any problem I hope you point out in the comment area, help small white grow, grateful!!

The complete code

1. index.js

const puppeteer = require("puppeteer");
const {
  resolve
} = require("path");

// Entry point: loads the forum-specific config from an env file, then walks
// every tag's paginated list, extracting matching questions with get_news.
// PATH_NAME selects the env file (e.g. csdn.env); START_TIME is the lower
// bound for question dates.
(async (path_name, start_time) => {
  // 1. Resolve the env-file path and load its variables into process.env
  let dotenvPath = resolve(__dirname, "env", path_name);
  require("dotenv").config({
    path: dotenvPath
  });

  // 2. Open browser
  const browser = await puppeteer.launch({});

  // 3. Create a new page
  const page = await browser.newPage();

  // Catch the headless browser's console events so that console.log calls
  // inside page.evaluate show up in the Node console (debugging aid)
  page.on("console", msg => {
    for (let i = 0; i < msg.args().length; ++i) {
      console.log(`${i}: ${msg.args()[i]}`);
    }
  });

  // 4. Read forum-specific settings from env variables
  let tags = JSON.parse(process.env.TAGS);
  let titles = process.env.SELECTOR_TITLES;
  let dates = process.env.SELECTOR_DATES;
  let keywords = JSON.parse(process.env.KEYWORDS);
  let time_interval = process.env.TIME_INTERVAL;
  let para = { path_name, start_time, time_interval, titles, dates, keywords };

  // Build the list-page url for a given tag and page index
  const get_news_url = (tag, pageIndex) =>
    process.env.LIST_URL.replace("{tag}", tag).replace("{pageIndex}", pageIndex);

  // 5. Traverse all tags (in parallel) and page through each tag's list
  await Promise.all(tags.map(async tag => {
    let i = 0;
    while (true) {
      // 1) Go to the specified page
      await page.goto(get_news_url(tag, ++i), { waitUntil: "networkidle2" });
      // 2) Get data by function get_news (runs in the page context)
      let _titles = await page.evaluate(get_news, para);
      // 3) Stop paging once a page yields no matching data
      if (_titles.length === 0) break;
      // 4) Output captured data in console
      console.log(i, get_news_url(tag, i));
      console.log(_titles);
    }
  }));

  // 6. Close browser
  await browser.close();
})(process.env.PATH_NAME, process.env.START_TIME);
 
// Getting news. This function is serialized and executed inside the browser
// page by page.evaluate, so it may use the page's `$` (jQuery) but nothing
// from this Node module's scope. `para` arrives as the evaluate argument and
// carries the env-derived settings. Returns [{ title, link, date }] for
// entries that pass the time-window and keyword checks.
async function get_news(para) {
  // Get the release time of an issue; the DOM shape differs per forum,
  // so we branch on which env file configured this run
  const get_release_time = dom => {
    if (path_name === "csdn.env") return dom.textContent;
    if (path_name === "segmentfault.env") return new Date(dom.dataset.created * 1000);
  };

  // Check whether the issue release time falls inside
  // [start_time, start_time + time_interval)
  const validate_time = (time, start_time) => {
    let time_diff = (new Date(time)) - (new Date(start_time));
    return (time_diff > 0) && (time_diff < time_interval);
  };

  // Check whether the title matches at least one configured keyword
  // (each keyword is treated as a regular expression)
  const validate_keyword = (keywords, title) =>
    !!keywords.find(keyword => (new RegExp(keyword)).test(title));

  // 1. Unpack the settings passed through page.evaluate
  let { path_name, start_time, time_interval, titles, dates, keywords } = await Promise.resolve(para);

  // 2. Traverse the page data and keep only the entries we want
  let result = [];
  $(titles).map((i, title) => {
    // 1) Skip entries outside the valid time window
    let check_time = validate_time(get_release_time($(dates)[i]), start_time);
    if (!check_time) return;
    // 2) Skip entries whose title matches none of the keywords
    let check_keyword = validate_keyword(keywords, title.text);
    if (!check_keyword) return;
    result.push({
      title: title.text,
      link: title.href,
      date: get_release_time($(dates)[i]).toString()
    });
  });
  return result;
}
Copy the code

2. csdn.env

LIST_URL=https://bbs.csdn.net/forums/{tag}?page={pageIndex}
TAGS=["CSharp","DotNET"]
KEYWORDS=[".net","C#","c#"]
SELECTOR_TITLES=.forums_topic .forums_title
SELECTOR_DATES=.forums_author em
Copy the code

3. segmentfault.env

LIST_URL=https://segmentfault.com/questions/unanswered?page={pageIndex}
TAGS=[""]
KEYWORDS=["js","mysql","vue","html","javascript"]
SELECTOR_TITLES=.title a
SELECTOR_DATES=.askDate
Copy the code

4. package.json

{
  "name": "fetch-question",
  "version": "1.0.0",
  "description": "fetch questions from internet",
  "main": "index.js",
  "dependencies": {
    "cross-env": "^5.2.0",
    "dotenv": "^7.0.0",
    "puppeteer": "^1.13.0"
  },
  "devDependencies": {},
  "scripts": {
    "csdn:list": "cross-env PATH_NAME=csdn.env START_TIME=2019/3/18 TIME_INTERVAL=172800000 node index.js",
    "segmentfault:list": "cross-env PATH_NAME=segmentfault.env START_TIME=2019/3/18 TIME_INTERVAL=172800000 node index.js"
  },
  "author": "linli",
  "license": "ISC"
}
Copy the code