Today, let’s take JD.com (Jingdong) laptop listings as an example for analysis.

First of all, let’s look at the laptop listing pages on JD Mall. This time we only need to parse the page source to get each laptop’s price, title, number of reviews, shop name, and shop type (self-operated or not).

Crawler code

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium import webdriver
from bs4 import BeautifulSoup
import pymongo
import time

# Connect to database
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.JD_products
collection = db.products

# Start the browser
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 50)


def to_mongodb(data):
    # Store one item of product information
    try:
        collection.insert_one(data)
        print("Insert The Data Successfully")
    except Exception:
        print('Insert The Data Failed')


def search():
    browser.get('https://www.jd.com/')
    try:
        # Find the search box and search button, enter the keyword and click the button
        input_box = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key")))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#search > div > div.form > button")))
        input_box[0].send_keys('notebook')
        submit.click()
        # Find the laptop category button and the sales-ranking button, click them in turn
        button_1 = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_selector > div:nth-child(2) > div > div.sl-value > div.sl-v-list > ul > li:nth-child(1) > a")))
        button_1.click()
        button_2 = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_filter > div.f-line.top > div.f-sort > a:nth-child(2)")))
        button_2.click()
        # Get the total number of pages
        page = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b')))
        return page[0].text
    except TimeoutException:
        return search()


def next_page(page_number):
    try:
        # Scroll to the bottom of the page so all product information gets loaded
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(10)
        html = browser.page_source
        parse_html(html)
        # After page 100 the next-page button is disabled, so end the program there
        if page_number == 101:
            exit()
        # Find the next-page button and click it
        button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.pn-next > em')))
        button.click()
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)")))
        # Confirm the page turn succeeded
        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#J_bottomPage > span.p-num > a.curr"), str(page_number)))
    except TimeoutException:
        return next_page(page_number)


def parse_html(html):
    """Parse the product list page"""
    data = {}
    soup = BeautifulSoup(html, 'html.parser')
    goods_info = soup.select('.gl-item')
    # Check the number of items on the current page to see whether anything failed to load
    quantity = 'item: ' + str(len(goods_info))
    print(quantity)
    for info in goods_info:
        # Get the product title
        title = info.select('.p-name.p-name-type-2 a em')[0].text.strip()
        title = title.replace('Love thing', '')
        print("title: ", title)
        data['_id'] = title
        # Get the product price
        price = info.select('.p-price i')[0].text.strip()
        price = int(float(price))
        print("price: ", price)
        data['price'] = price
        # Get the number of reviews
        commit = info.select('.p-commit strong')[0].text.strip()
        commit = commit.replace('Comment', '')
        if '万' in commit:  # '万' means ten thousand
            commit = commit.split("万")
            commit = int(float(commit[0]) * 10000)
        else:
            commit = int(float(commit.replace('+', '')))
        print("commit: ", commit)
        data['commit'] = commit
        # Get the shop name of the item
        shop_name = info.select('.p-shop a')
        if len(shop_name) == 1:
            print("shop_name: ", shop_name[0].text.strip())
            data['shop_name'] = shop_name[0].text.strip()
        else:
            print("shop_name: ", 'jingdong')
            data['shop_name'] = 'jingdong'
        # Get the shop type of the item (self-operated or not)
        shop_property = info.select('.p-icons i')
        if len(shop_property) >= 1:
            message = shop_property[0].text.strip()
            if message == 'proprietary':
                print("shop_property: ", message)
                data['shop_property'] = message
            else:
                print("shop_property: ", 'Not proprietary')
                data['shop_property'] = 'Not proprietary'
        else:
            print("shop_property: ", 'Not proprietary')
            data['shop_property'] = 'Not proprietary'
        to_mongodb(data)
        print(data)
        print("")


def main():
    total = int(search())
    print(total)
    for i in range(2, total + 2):
        time.sleep(20)
        print("Page", i - 1, ":")
        next_page(i)


if __name__ == "__main__":
    main()

Although I searched with the keyword “notebook” at the beginning, I still need to click the “Laptop” category button afterwards. A plain search for “notebook” also returns the paper notebooks people use for taking notes in class, so the results would be polluted with useless information. By using JD’s own, more detailed classification we get exactly the listings we want.

Each page holds 60 items of product information, so 100 pages should yield 6,000 laptop records in total, yet only 5,992 were actually stored.

There are two likely reasons:

1. The product title is used as the MongoDB primary key (_id), so listings with duplicate titles are rejected on insert.

2. Some pages did not finish loading all of their products before the page source was parsed.

A quick sanity check is sketched below.
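As a minimal sketch of that check, assuming the same local MongoDB database and collection used by the crawler above, one can compare the number of documents actually stored with the expected total:

import pymongo

# Connect to the same local MongoDB the crawler writes to
client = pymongo.MongoClient(host='localhost', port=27017)
collection = client.JD_products.products

# Documents actually stored (inserts with a duplicate _id were rejected)
stored = collection.count_documents({})

# Expected total: 100 pages * 60 items per page
expected = 100 * 60
print("stored:", stored)
print("missing:", expected - stored)

Combined with the per-page "item: N" printout from parse_html, this shows whether the gap comes from pages that loaded fewer than 60 items or from duplicate titles.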

In the end, the product information was collected successfully.

Code for reading the data back from MongoDB and visualizing it

from pyecharts import Bar
import pandas as pd
import pymongo
client = pymongo.MongoClient('localhost', 27017)
db = client.JD_products
table = db.products
df = pd.DataFrame(list(table.find()))
price_info = df['price']
bins = [0, 2000, 2500, 3000, 3500, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000, 16000, 19000, 200000]
level = ['0-2000', '2000-2500', '2500-3000', '3000-3500', '3500-4000', '4000-5000', '5000-6000', '6000-7000', '7000-8000', '8000-9000', '9000-10000', '10000-12000', '12000-14000', '14000-16000', '16000-19000', '19000+']
price_stage = pd.cut(price_info, bins=bins, labels=level).value_counts().sort_index()
attr = price_stage.index
v1 = price_stage.values
bar = Bar('Bar chart of Notebook Price Distribution', title_pos='center', title_top='10', width=800, height=400)
bar.add(' ', attr, v1, is_stack=True, xaxis_rotate=30, yaxis_min=0, xaxis_interval=0, is_splitline_show=False, is_label_show=True)
bar.render('Bar chart of Notebook Price Distribution.html')

I also compared these prices with the laptop prices I had crawled before JD’s Double 11 sale. Even counting the extra allowance JD gives on top during the sale, the discounts turned out to be only slight, and some sellers had actually raised their prices before offering the “discount” (a sketch of such a before/after comparison follows).
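Assuming the pre-sale prices were stored in a separate collection (the name products_before is made up here for illustration), the two snapshots can be joined on the product title and compared:

import pandas as pd
import pymongo

client = pymongo.MongoClient('localhost', 27017)
db = client.JD_products

# Two snapshots: prices crawled before the sale (hypothetical collection) and during it
before = pd.DataFrame(list(db.products_before.find()))
during = pd.DataFrame(list(db.products.find()))

# Join the snapshots on the product title (stored as _id) and compare prices
merged = before.merge(during, on='_id', suffixes=('_before', '_during'))
merged['discount'] = merged['price_before'] - merged['price_during']

# Listings whose price actually went up during the sale
print(merged[merged['discount'] < 0][['_id', 'price_before', 'price_during']])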

For anyone about to buy a laptop and pick one that suits them, the general notebook parameters are as follows (a small shortlisting sketch based on the crawled data comes after the list):

CPU: Intel Core i3 / i5 / i7, in standard-voltage (M) and low-voltage (U) versions

Hard disk: 500 GB, 1 TB, and 2 TB

Graphics card: AMD, NVIDIA

Memory: 4 GB, 8 GB
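As a rough illustration of using the crawled data for that choice, the sketch below shortlists self-operated listings within a budget whose titles mention a given CPU model (assuming, as is usually the case on JD, that the listing title contains the CPU model), sorted by review count:

import pandas as pd
import pymongo

client = pymongo.MongoClient('localhost', 27017)
df = pd.DataFrame(list(client.JD_products.products.find()))

budget = 6000         # maximum price, adjust as needed
cpu_keyword = 'i5'    # CPU model to look for in the listing title

shortlist = df[
    (df['price'] <= budget)
    & (df['shop_property'] == 'proprietary')
    & (df['_id'].str.contains(cpu_keyword, case=False))
]

# Most-reviewed machines within the budget first
print(shortlist.sort_values('commit', ascending=False)[['_id', 'price', 'commit']].head(10))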