
Preparation

Static page scraping and dynamic API fetching were covered in earlier posts. At its core, a crawler simply pulls information from commonly used interfaces and pages. This time we again extract data from page elements, and use it to stay cool through the hot summer!

The local runtime environment is again based on Docker; the setup was described in detail in a previous article, so refer to that first if needed.

The code

Requirements analysis

To obtain city temperatures, we first need a source that publishes temperatures for cities across the whole country. The weather forecast site (weather.com.cn) provides exactly that, so we crawl it and aggregate the lowest temperature in each region.

  1. Inspect the city temperature table for each region and locate the corresponding elements on the page.
  2. Crawl the temperature of every city on each page, then store and sort the records to get the ten lowest.
  3. Output the resulting city temperature data as a bar chart (the record format we are building toward is sketched below).
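Before writing any code, it helps to pin down the shape of the data we are after: each crawled record is a small dict of city name plus lowest temperature, and the final result is simply the ten smallest of those records. A minimal sketch with made-up values (the real entries are produced by the crawler below):

# Illustrative only -- these values are invented, the real ones come from the crawl
ALL_DATA = [
    {"city": "Mohe", "temp_low": -5},
    {"city": "Lhasa", "temp_low": 2},
    {"city": "Haikou", "temp_low": 24},
]

# Ten lowest temperatures: sort ascending by temp_low, then slice the head
top_10 = sorted(ALL_DATA, key=lambda d: d["temp_low"])[:10]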

Write the code

  1. First, define the function entry and crawl the corresponding data items. There are 8 regions in total: North China, Northeast China, East China, Central China, South China, Northwest China, Southwest China, and Hong Kong, Macao and Taiwan.
# North China
url_hb = 'http://www.weather.com.cn/textFC/hb.shtml'

# Northeast China
url_db = 'http://www.weather.com.cn/textFC/db.shtml'

# East China
url_hd = 'http://www.weather.com.cn/textFC/hd.shtml'

# Central China
url_hz = 'http://www.weather.com.cn/textFC/hz.shtml'

# South China
url_hn = 'http://www.weather.com.cn/textFC/hn.shtml'

# Northwest China
url_xb = 'http://www.weather.com.cn/textFC/xb.shtml'

# Southwest China
url_xn = 'http://www.weather.com.cn/textFC/xn.shtml'

# Hong Kong, Macao and Taiwan
url_gat = 'http://www.weather.com.cn/textFC/gat.shtml'

url_areas = [url_hb, url_db, url_hd, url_hz, url_hn, url_xb, url_xn, url_gat]

for index, url in enumerate(url_areas):
    print('Start crawling region {}'.format(index + 1))
    parse_page(url)
    time.sleep(1)
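Note that parse_page() (defined in the complete code below) issues a bare requests.get with no timeout or error handling, so one slow or failed region would abort the whole run. A hedged sketch of a more defensive approach using a hypothetical fetch() helper -- the retry count and timeout are arbitrary choices, not part of the original code:

def fetch(url, retries=3, timeout=10):
    # Hypothetical helper: fetch a URL with a timeout and a few retries,
    # returning the decoded HTML or None on repeated failure
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()  # raise on HTTP 4xx/5xx
            return response.content.decode('utf-8')
        except requests.RequestException as error:
            print('Attempt {} for {} failed: {}'.format(attempt + 1, url, error))
            time.sleep(2)  # brief back-off before retrying
    return None  # the caller should skip this region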
  2. Analyze the data: sort the records and take the target entries.
def analysis_data():
    # 1. Sort ascending by lowest temperature (list.sort is ascending by default)
    ALL_DATA.sort(key=lambda data: data['temp_low'])
    # 2. Take the first 10 records
    top_10 = ALL_DATA[:10]
    return top_10
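A side note: since only the ten lowest values are needed, a full sort is not strictly necessary. The standard library's heapq.nsmallest performs the same selection without sorting, and leaves ALL_DATA untouched. A minimal alternative sketch:

import heapq

def analysis_data_heap():
    # Pick the 10 records with the lowest 'temp_low' without a full sort
    return heapq.nsmallest(10, ALL_DATA, key=lambda data: data['temp_low'])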
  3. Output the data as a bar chart.
def show_with_chart(top_10):
    # 1. Extract the list of city names
    cities = list(map(lambda item: item['city'], top_10))

    # 2. Extract the list of minimum temperatures
    temp_lows = list(map(lambda item: item['temp_low'], top_10))

    # 3. Generate the bar chart and write it to an HTML file
    chart = Bar()
    chart.add_xaxis(cities)
    chart.add_yaxis("City", temp_lows)
    chart.set_global_opts(title_opts=opts.TitleOpts(title="Lowest Temperature in China"))
    chart.render('temperature.html')
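If you also want the unit to appear on the chart, pyecharts (v1 API) accepts a y-axis option through the same set_global_opts call. A small sketch of that one extra option:

chart.set_global_opts(
    title_opts=opts.TitleOpts(title="Lowest Temperature in China"),
    yaxis_opts=opts.AxisOpts(name="Temperature (°C)"),  # optional axis label
)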

Result screenshot

The complete code


import requests
from bs4 import BeautifulSoup
import time
from pyecharts.charts import Bar
from pyecharts import options as opts



# There are 8 regions in total: North China, Northeast China, East China, Central China,
# South China, Northwest China, Southwest China, and Hong Kong, Macao and Taiwan
# North China
url_hb = 'http://www.weather.com.cn/textFC/hb.shtml'

# Northeast China
url_db = 'http://www.weather.com.cn/textFC/db.shtml'

# East China
url_hd = 'http://www.weather.com.cn/textFC/hd.shtml'

# Central China
url_hz = 'http://www.weather.com.cn/textFC/hz.shtml'

# South China
url_hn = 'http://www.weather.com.cn/textFC/hn.shtml'

# Northwest China
url_xb = 'http://www.weather.com.cn/textFC/xb.shtml'

# Southwest China
url_xn = 'http://www.weather.com.cn/textFC/xn.shtml'

# Hong Kong, Macao and Taiwan
url_gat = 'http://www.weather.com.cn/textFC/gat.shtml'

url_areas = [url_hb, url_db, url_hd, url_hz, url_hn, url_xb, url_xn, url_gat]

# Pretend to be a regular browser request; some sites reject
# requests that lack a User-Agent/Referer
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Referer': 'http://www.weather.com.cn/textFC/hb.shtml'
}

# Global list of crawled records [city + minimum temperature]
ALL_DATA = []


def parse_page(url):
    response = requests.get(url, headers=HEADERS)
    # 1. Get the raw HTML of the page
    text = response.content.decode('utf-8')
    # 2. Note: the <table> tags on the Hong Kong, Macao and Taiwan page are not
    # properly closed, so the lxml parser cannot parse them correctly; html5lib
    # auto-completes the markup before parsing
    soup = BeautifulSoup(text, 'html5lib')
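    # (html5lib is a separate package -- install it with `pip install html5lib`,
    # otherwise BeautifulSoup raises FeatureNotFound when asked for this parser)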
    div_conMidtab = soup.find('div', class_='conMidtab')
    # 3. Retrieve all the <table> tags inside it
    tables = div_conMidtab.find_all('table')
    
    # 4. Traverse the provinces within the region
    for table in tables:
        # Skip the first two <tr> rows (the table header)
        trs = table.find_all('tr')[2:]
        # 5. Traverse the cities within the province
        for index, tr in enumerate(trs):
            tds = tr.find_all('td')
            # 5.1 City name
            # Note: in a province's first row, the first <td> holds the province
            # name, so the city is in the second <td>; in other rows it is in the first
            city_td = tds[1] if index == 0 else tds[0]
            city = list(city_td.stripped_strings)[0]
            # 5.2 Minimum temperature [second-to-last <td> tag]
            temp_low_td = tds[-2]
            temp_low = list(temp_low_td.stripped_strings)[0]
            ALL_DATA.append({"city": city, "temp_low": int(temp_low)})


def spider():
    for index, url in enumerate(url_areas):
        print('Start crawling region {}'.format(index + 1))
        parse_page(url)
        # Be polite to the site: pause for a second between regions
        time.sleep(1)


def analysis_data():
    # 1. Sort ascending by lowest temperature (list.sort is ascending by default)
    ALL_DATA.sort(key=lambda data: data['temp_low'])
    # 2. Take the first 10 records
    top_10 = ALL_DATA[:10]
    return top_10


def show_with_chart(top_10):
    # 1. Extract the list of city names
    cities = list(map(lambda item: item['city'], top_10))

    # 2. Extract the list of minimum temperatures
    temp_lows = list(map(lambda item: item['temp_low'], top_10))

    # 3. Generate the bar chart and write it to an HTML file
    chart = Bar()
    chart.add_xaxis(cities)
    chart.add_yaxis("City", temp_lows)
    chart.set_global_opts(title_opts=opts.TitleOpts(title="Lowest Temperature in China"))
    chart.render('temperature.html')


if __name__ == '__main__':
    # 1. Crawl data
    spider()
    
    # 2. Analyze the data
    top_10 = analysis_data()
    
    # 3. Generate the bar chart from the top-10 data
    show_with_chart(top_10)
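To run the script, first install the dependencies (requests, beautifulsoup4, html5lib and pyecharts, e.g. via pip). The script writes temperature.html to the working directory; open it in a browser to see the rendered bar chart.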