The introduction

This article is a crawler and data-analysis project for Tencent's hit drama "Sword Snow Stride" (雪中悍刀行): in about one hour it collects roughly 10,000 comments, which makes it well suited for beginners to practice on. Particularly worth noting is the sentiment analysis of the comment text, which may be the reader's first contact with that topic.

Crawler: As the comment data of Tencent is encapsulated in JSON, we only need to find the JSON file and extract and save the data needed.

  • Video website: v.qq.com/x/cover/mzc…
  • Review the json data url: video.coral.qq.com/varticle/75…
  • Note: Comments of other videos can be crawled by replacing the value of the video's numeric ID

How to find a video ID?

Project Structure:


I. Crawler part:

1. Crawl comment content code: spiders.py

import re
import random


def get_html(url, params):
    """Fetch *url* with a random desktop User-Agent and return the body text.

    Raises requests.HTTPError on a non-2xx response.
    """
    # Imported lazily so the pure-parsing helpers below can be used (and
    # tested) without requests installed.
    import requests
    uapools = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko)',
    ]
    headers = {"user-agent": random.choice(uapools)}
    r = requests.get(url, headers=headers, params=params)
    r.raise_for_status()
    # Force UTF-8 instead of relying on the (often wrong) guessed encoding.
    r.encoding = 'utf-8'
    return r.text


def parse_page(infolist, data):
    """Extract the comment bodies from one JSON page of the comment API.

    Appends the list of comment strings to *infolist* and returns the
    "last" cursor value needed to request the next page.
    """
    commentpat = '"content":"(.*?)"'
    lastpat = '"last":"(.*?)"'
    commentall = re.compile(commentpat, re.S).findall(data)
    next_cid = re.compile(lastpat).findall(data)[0]
    infolist.append(commentall)
    return next_cid


def print_comment_list(infolist):
    """Print every collected comment, page by page."""
    for j, commentall in enumerate(infolist):
        print('第' + str(j + 1) + '页\n')
        for comment in commentall:
            print(comment + '\n')


def save_to_txt(infolist, path):
    """Write every collected comment to *path*, one comment per line."""
    with open(path, 'w+', encoding='utf-8') as fw:
        for commentall in infolist:
            for comment in commentall:
                fw.write(comment + '\n')


def main():
    infolist = []
    vid = '7579013546'  # video digital ID; replace to crawl another video
    cid = "0"           # paging cursor; "0" requests the first page
    page_num = 3000
    url = 'https://video.coral.qq.com/varticle/' + vid + '/comment/v2'
    for _ in range(page_num):
        params = {'orinum': '10', 'cursor': cid}
        html = get_html(url, params)
        cid = parse_page(infolist, html)
    print_comment_list(infolist)
    save_to_txt(infolist, 'content.txt')


if __name__ == '__main__':
    main()

2. Crawl comment time code: sp.py

import re
import random


def get_html(url, params):
    """Fetch *url* with a random desktop User-Agent and return the body text.

    Raises requests.HTTPError on a non-2xx response.
    """
    # Imported lazily so the pure-parsing helpers below can be used (and
    # tested) without requests installed.
    import requests
    uapools = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko)',
    ]
    headers = {"user-agent": random.choice(uapools)}
    r = requests.get(url, headers=headers, params=params)
    r.raise_for_status()
    # Force UTF-8 instead of relying on the (often wrong) guessed encoding.
    r.encoding = 'utf-8'
    return r.text


def parse_page(infolist, data):
    """Extract the comment timestamps from one JSON page of the comment API.

    Appends the list of timestamp strings to *infolist* and returns the
    "last" cursor value needed to request the next page.
    """
    commentpat = '"time":"(.*?)"'
    lastpat = '"last":"(.*?)"'
    commentall = re.compile(commentpat, re.S).findall(data)
    next_cid = re.compile(lastpat).findall(data)[0]
    infolist.append(commentall)
    return next_cid


def print_comment_list(infolist):
    """Print every collected timestamp, page by page."""
    for j, commentall in enumerate(infolist):
        print('第' + str(j + 1) + '页\n')
        for comment in commentall:
            print(comment + '\n')


def save_to_txt(infolist, path):
    """Write every collected timestamp to *path*, one per line."""
    with open(path, 'w+', encoding='utf-8') as fw:
        for commentall in infolist:
            for comment in commentall:
                fw.write(comment + '\n')


def main():
    infolist = []
    vid = '7579013546'  # video digital ID; replace to crawl another video
    cid = "0"           # paging cursor; "0" requests the first page
    page_num = 3000
    url = 'https://video.coral.qq.com/varticle/' + vid + '/comment/v2'
    for _ in range(page_num):
        params = {'orinum': '10', 'cursor': cid}
        html = get_html(url, params)
        cid = parse_page(infolist, html)
    print_comment_list(infolist)
    save_to_txt(infolist, 'time.txt')


if __name__ == '__main__':
    main()

II. Data processing

1. The timestamp of the comment is converted to the normal time time.py

# coding=gbk
import csv
import time

# Convert the Unix timestamps crawled into time.txt (one per line) into
# human-readable "YYYY-MM-DD HH:MM:SS" rows in data.csv.  Each CSV row
# holds two cells: the date and the clock time.
with open("data.csv", 'w', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    with open("time.txt", 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines so int() does not raise ValueError.
                continue
            timeArray = time.localtime(int(line))
            csvRow = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
            print(csvRow)
            # split() yields ["YYYY-MM-DD", "HH:MM:SS"] -> two CSV cells.
            writer.writerow(csvRow.split())
Copy the code

2. Read the comments into CSV cd.py

# coding=gbk
import csv

# Copy the crawled comments from content.txt into content.csv, one comment
# per row; each line is split on whitespace so every token becomes a cell.
# Context managers guarantee both files are closed even on error.
with open("content.csv", 'w', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    with open("content.txt", 'r', encoding='utf-8') as f:
        for line in f:
            writer.writerow(line.split())
Copy the code

3. Count the number of comments in each time period of the day: py.py

# coding=gbk
import csv
from pyecharts import options as opts       # unused here; kept from the original post
from sympy.combinatorics import Subset      # unused here; kept from the original post
from wordcloud import WordCloud             # unused here; kept from the original post

# NOTE(review): reconstructed from a scrape-mangled one-liner - verify
# against the original post before relying on exact behavior.
#
# Count how many comments fall in each hour of the day.  data.csv rows look
# like ["YYYY-MM-DD", "HH:MM:SS"]; the first two characters of column 1 are
# the hour.
with open('../Spiders/data.csv') as csvfile:
    reader = csv.reader(csvfile)
    data1 = [str(row[1])[0:2] for row in reader]
print(data1)
print(type(data1))

# Build sorted (hour, count) pairs.
set_seq = set(data1)
rst = []
for item in set_seq:
    rst.append((item, data1.count(item)))
rst.sort()
print(type(rst))
print(rst)

# Persist the (hour, count) pairs for the chart scripts.
with open("time2.csv", "w+", newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    for i in rst:
        writer.writerow(i)

# Read the pairs back: x = hours, y1 = counts.
with open('time2.csv') as csvfile:
    reader = csv.reader(csvfile)
    x = [str(row[0]) for row in reader]
print(x)
with open('time2.csv') as csvfile:
    reader = csv.reader(csvfile)
    y1 = [float(row[1]) for row in reader]
print(y1)

4. Count the number of recent comments py1.py

# coding=gbk
import csv
from pyecharts import options as opts       # unused here; kept from the original post
from sympy.combinatorics import Subset      # unused here; kept from the original post
from wordcloud import WordCloud             # unused here; kept from the original post

# NOTE(review): reconstructed from a scrape-mangled one-liner - verify
# against the original post before relying on exact behavior.
#
# Count how many comments were posted on each date.  data.csv rows look
# like ["YYYY-MM-DD", "HH:MM:SS"]; column 0 is the date.
with open('../Spiders/data.csv') as csvfile:
    reader = csv.reader(csvfile)
    data1 = [str(row[0]) for row in reader]
#print(data1)
print(type(data1))

# Build sorted (date, count) pairs.
set_seq = set(data1)
rst = []
for item in set_seq:
    rst.append((item, data1.count(item)))
rst.sort()
print(type(rst))
print(rst)

# Persist the (date, count) pairs for the chart scripts.
with open("time1.csv", "w+", newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    for i in rst:
        writer.writerow(i)

# Read the pairs back: x = dates, y1 = counts.
with open('time1.csv') as csvfile:
    reader = csv.reader(csvfile)
    x = [str(row[0]) for row in reader]
print(x)
with open('time1.csv') as csvfile:
    reader = csv.reader(csvfile)
    y1 = [float(row[1]) for row in reader]
print(y1)

III. Data analysis

Data analysis: a word cloud, bar charts, line charts and pie charts are involved. The last three analyze comment times and the proportion of mentions of the leading actors. However, Tencent stores comment times as Unix timestamps, so they must be converted before counting their frequency of occurrence.

1. Make word clouds

wc.py

import numpy as np
import re
import jieba
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from PIL import Image

# NOTE(review): reconstructed from a scrape-mangled one-liner - verify
# against the original post before relying on exact behavior.
#
# Build a word cloud from the crawled comments, masked by the wc.jpg shape.
with open('content.txt', 'r', encoding='utf-8') as f:
    txt = f.read()

# Strip ASCII letters, digits and punctuation so only Chinese text remains.
# NOTE(review): the original character class was garbled by the scrape -
# confirm exactly which characters should be filtered out.
newtxt = re.sub(r"[A-Za-z0-9!%\[\],]", "", txt)
print(newtxt)
words = jieba.lcut(newtxt)  # NOTE(review): `words` is unused below, as in the original

img = Image.open(r'wc.jpg')   # mask image: the cloud fills its silhouette
img_array = np.array(img)

wordcloud = WordCloud(
    background_color="white",
    width=1080,
    height=960,
    # NOTE(review): the original font path was garbled; any CJK-capable
    # font file works here.
    font_path="../font.otf",
    max_words=150,
    scale=10,          # sharpness/resolution multiplier
    max_font_size=100,
    mask=img_array,
    collocations=False,
).generate(newtxt)

plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.to_file('wc.png')

Outline: WC.jpg

Insert a picture description here

Word cloud: result.png (note: the English letters should be filtered out here)

2. Make a bar chart of recent comments drawbar.py

# encoding: utf-8
import csv
import pyecharts.options as opts
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType


# NOTE(review): reconstructed from a scrape-mangled one-liner - verify
# against the original post before relying on exact behavior.
class DrawBar(object):
    """Bar chart of comment counts per recent date, read from time1.csv."""

    def __init__(self):
        """Create the bar-chart instance: 1500x700 canvas, LIGHT theme."""
        self.bar = Bar(init_opts=opts.InitOpts(
            width='1500px', height='700px', theme=ThemeType.LIGHT))

    def add_x(self):
        """Load the dates (column 0 of time1.csv) onto the x axis."""
        with open('time1.csv') as csvfile:
            reader = csv.reader(csvfile)
            x = [str(row[0]) for row in reader]
        print(x)
        self.bar.add_xaxis(xaxis_data=x)

    def add_y(self):
        """Load the counts (column 1 of time1.csv) onto the y axis."""
        with open('time1.csv') as csvfile:
            reader = csv.reader(csvfile)
            y1 = [float(row[1]) for row in reader]
        print(y1)
        self.bar.add_yaxis(
            # NOTE(review): the original series name was lost in the scrape.
            series_name='',
            y_axis=y1,
            label_opts=opts.LabelOpts(is_show=True, color="black"),
            bar_max_width='100px',
        )

    def set_global(self):
        """Configure title, tooltip and toolbox."""
        self.bar.set_global_opts(
            title_opts=opts.TitleOpts(
                # NOTE(review): the original (Chinese) title text was lost
                # in the scrape.
                title='',
                title_textstyle_opts=opts.TextStyleOpts(font_size=35),
            ),
            tooltip_opts=opts.TooltipOpts(
                is_show=True,          # show a tooltip on hover
                trigger="axis",        # triggered by the axis, not one item
                axis_pointer_type="cross",  # crosshair guide lines
            ),
            toolbox_opts=opts.ToolboxOpts(),  # default: all tools enabled
        )

    def draw(self):
        """Assemble the chart and render it to an HTML file."""
        self.add_x()
        self.add_y()
        self.set_global()
        self.bar.render('../Html/DrawBar.html')

    def run(self):
        """Entry point."""
        self.draw()


if __name__ == '__main__':
    app = DrawBar()
    app.run()

Drawbar.html


3. Make an hourly comment bar chart drawbar2.py


# encoding: utf-8
import csv
import pyecharts.options as opts
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType


# NOTE(review): reconstructed from a scrape-mangled one-liner - verify
# against the original post before relying on exact behavior.
class DrawBar(object):
    """Bar chart of comment counts per hour of the day, read from time2.csv."""

    def __init__(self):
        """Create the bar-chart instance: 1500x700 canvas, MACARONS theme."""
        self.bar = Bar(init_opts=opts.InitOpts(
            width='1500px', height='700px', theme=ThemeType.MACARONS))

    def add_x(self):
        """Load the hours (column 0 of time2.csv) onto the x axis."""
        # NOTE(review): suffix appended to each hour label; the original
        # literal (likely the Chinese hour marker) was garbled - confirm.
        str_name1 = '点'
        with open('time2.csv') as csvfile:
            reader = csv.reader(csvfile)
            x = [str(row[0] + str_name1) for row in reader]
        print(x)
        self.bar.add_xaxis(xaxis_data=x)

    def add_y(self):
        """Load the counts (column 1 of time2.csv) onto the y axis."""
        with open('time2.csv') as csvfile:
            reader = csv.reader(csvfile)
            y1 = [int(row[1]) for row in reader]
        print(y1)
        self.bar.add_yaxis(
            # NOTE(review): the original series name was lost in the scrape.
            series_name='',
            y_axis=y1,
            label_opts=opts.LabelOpts(is_show=False),
        )

    def set_global(self):
        """Configure title, tooltip and toolbox."""
        self.bar.set_global_opts(
            title_opts=opts.TitleOpts(
                # NOTE(review): the original (Chinese) title text was lost
                # in the scrape.
                title='',
                title_textstyle_opts=opts.TextStyleOpts(font_size=35),
            ),
            tooltip_opts=opts.TooltipOpts(
                is_show=True,          # show a tooltip on hover
                trigger="axis",        # triggered by the axis, not one item
                axis_pointer_type="cross",  # crosshair guide lines
            ),
            toolbox_opts=opts.ToolboxOpts(),  # default: all tools enabled
        )

    def draw(self):
        """Assemble the chart and render it to an HTML file."""
        self.add_x()
        self.add_y()
        self.set_global()
        self.bar.render('../Html/DrawBar2.html')

    def run(self):
        """Entry point."""
        self.draw()


if __name__ == '__main__':
    app = DrawBar()
    app.run()

Drawbar2.html

4. Create a review pie chart pie_Pyecharts.py

import csv
from pyecharts import options as opts
from pyecharts.charts import Pie
from random import randint                  # unused here; kept from the original post
from pyecharts.globals import ThemeType

# NOTE(review): reconstructed from a scrape-mangled one-liner - verify
# against the original post before relying on exact behavior.
#
# Draw three views of the per-date comment counts (time1.csv) side by side:
# a plain pie, a donut, and a rose chart.
with open('time1.csv') as csvfile:
    reader = csv.reader(csvfile)
    x = [str(row[0]) for row in reader]
print(x)
with open('time1.csv') as csvfile:
    reader = csv.reader(csvfile)
    y1 = [float(row[1]) for row in reader]
print(y1)

num = y1   # slice sizes
lab = x    # slice labels
(
    Pie(init_opts=opts.InitOpts(width='1700px', height='450px', theme=ThemeType.LIGHT))
    .set_global_opts(
        title_opts=opts.TitleOpts(
            # NOTE(review): the original (Chinese) title text was lost in the scrape.
            title='',
            title_textstyle_opts=opts.TextStyleOpts(font_size=27),
        ),
        legend_opts=opts.LegendOpts(pos_top="10%"),
    )
    # pie chart
    .add(series_name='', center=[280, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)])
    # donut (ring) chart
    .add(series_name='', center=[845, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)], radius=['40%', '75%'])
    # rose chart
    .add(series_name='', center=[1380, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)], rosetype='radius')
    .render('pie_pyecharts.html')
)

rendering

5. Create an hourly review pie pie_Pyecharts2.py

import csv
from pyecharts import options as opts
from pyecharts.charts import Pie
from random import randint                  # unused here; kept from the original post
from pyecharts.globals import ThemeType

# NOTE(review): reconstructed from a scrape-mangled one-liner - verify
# against the original post before relying on exact behavior.
#
# Draw three views of the per-hour comment counts (time2.csv) side by side:
# a plain pie, a donut, and a rose chart.

# NOTE(review): suffix appended to each hour label; the original literal
# (likely the Chinese hour marker) was garbled - confirm.
str_name1 = '点'
with open('time2.csv') as csvfile:
    reader = csv.reader(csvfile)
    x = [str(row[0] + str_name1) for row in reader]
print(x)
with open('time2.csv') as csvfile:
    reader = csv.reader(csvfile)
    y1 = [int(row[1]) for row in reader]
print(y1)

num = y1   # slice sizes
lab = x    # slice labels
(
    Pie(init_opts=opts.InitOpts(width='1650px', height='500px', theme=ThemeType.LIGHT))
    .set_global_opts(
        title_opts=opts.TitleOpts(
            # NOTE(review): the original (Chinese) title text was lost in the scrape.
            title='',
            title_textstyle_opts=opts.TextStyleOpts(font_size=27),
        ),
        legend_opts=opts.LegendOpts(pos_top="8%"),
    )
    # pie chart
    .add(series_name='', center=[250, 300],
         data_pair=[(j, i) for i, j in zip(num, lab)])
    # donut (ring) chart
    .add(series_name='', center=[810, 300],
         data_pair=[(j, i) for i, j in zip(num, lab)], radius=['40%', '75%'])
    # rose chart
    .add(series_name='', center=[1350, 300],
         data_pair=[(j, i) for i, j in zip(num, lab)], rosetype='radius')
    .render('pie_pyecharts2.html')
)

rendering

6. Create pie_Pyecharts3.py viewing time interval review statistics

# coding=gbk
import csv
from pyecharts import options as opts
from pyecharts.globals import ThemeType
from sympy.combinatorics import Subset      # unused here; kept from the original post
from wordcloud import WordCloud             # unused here; kept from the original post
from pyecharts.charts import Pie
from random import randint                  # unused here; kept from the original post

# NOTE(review): this script was the most heavily garbled by the scrape
# (several tokens and Chinese literals are missing).  The reconstruction
# below follows the stated intent - "viewing time interval comment
# statistics" - but must be checked against the original post.

# Hour of day for every comment in data.csv.
with open('../Spiders/data.csv') as csvfile:
    reader = csv.reader(csvfile)
    data2 = [int(row[1].strip('')[0:2]) for row in reader]
#print(data2)
print(type(data2))

# Sorted (hour, count) pairs.  (The original shadowed the builtin `list`;
# renamed here.)
set_seq = set(data2)
rst = []
for item in set_seq:
    rst.append((item, data2.count(item)))
rst.sort()
print(type(rst))
#print(rst)

# Persist the (hour, count) pairs.
with open("time2.csv", "w+", newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    for i in rst:
        writer.writerow(i)

# Split the 24 hours into 4 equal ranges for a quick printout.
n = 4
m = int(len(rst) / n)
list2 = []
for i in range(0, len(rst), m):
    list2.append(rst[i:i + m])
# NOTE(review): the Chinese period labels below were stripped by the
# scrape; restored as early-morning/morning/afternoon/evening - confirm.
print("凌晨:", list2[0])
print("上午:", list2[1])
print("下午:", list2[2])
print("晚上:", list2[3])

# Re-read the counts and sum them in groups of 6 hours -> one total per
# quarter of the day.
with open('time2.csv') as csvfile:
    reader = csv.reader(csvfile)
    y1 = [int(row[1]) for row in reader]
print(y1)

n = 6
groups = [y1[i:i + n] for i in range(0, len(y1), n)]
print(groups)

sums = []
for group in groups:
    num_sum = 0
    for v in group:
        num_sum += v
    sums.append(num_sum)

num = sums                              # slice sizes: comments per period
# NOTE(review): the label list was lost in the scrape; reconstructed to
# match the four printed periods above - confirm.
lab = ["凌晨", "上午", "下午", "晚上"]
(
    Pie(init_opts=opts.InitOpts(width='1500px', height='450px', theme=ThemeType.LIGHT))
    .set_global_opts(
        title_opts=opts.TitleOpts(
            # NOTE(review): the original (Chinese) title text was lost in the scrape.
            title='',
            title_textstyle_opts=opts.TextStyleOpts(font_size=30),
        ),
        legend_opts=opts.LegendOpts(pos_top="8%"),
    )
    # pie chart
    .add(series_name='', center=[260, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)])
    # donut (ring) chart
    .add(series_name='', center=[1230, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)], radius=['40%', '75%'])
    # rose chart
    .add(series_name='', center=[750, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)], rosetype='radius')
    .render('pie_pyecharts3.html')
)

rendering

7. Making a knife in the snow star mentioned proportion pie chart pie_Pyecharts4.py

import csv
from pyecharts import options as opts
from pyecharts.charts import Pie
from random import randint                  # unused here; kept from the original post
from pyecharts.globals import ThemeType

# NOTE(review): reconstructed from a scrape-mangled one-liner - verify
# against the original post before relying on exact behavior.
#
# Count how often each leading actor/character is mentioned in the crawled
# comments and chart the proportions.
with open('content.txt', 'r', encoding='utf-8') as f:
    words = f.read()

# NOTE(review): the actual names were stripped out of the scraped original;
# replace these with the real leading actors/characters of the drama.
name = ["张若昀", "李庚希", "胡军"]
print(name)
count = [float(words.count(n)) for n in name]
print(count)

num = count   # slice sizes: mention counts
lab = name    # slice labels: names
(
    Pie(init_opts=opts.InitOpts(width='1650px', height='450px', theme=ThemeType.LIGHT))
    .set_global_opts(
        title_opts=opts.TitleOpts(
            # NOTE(review): the original (Chinese) title text was lost in the scrape.
            title='',
            title_textstyle_opts=opts.TextStyleOpts(font_size=27),
        ),
        legend_opts=opts.LegendOpts(pos_top="3%"),
    )
    # pie chart
    .add(series_name='', center=[280, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)])
    # donut (ring) chart
    .add(series_name='', center=[800, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)], radius=['40%', '75%'])
    # rose chart
    .add(series_name='', center=[1300, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)], rosetype='radius')
    .render('pie_pyecharts4.html')
)

rendering

8. Comment content Sentiment analysis Snownlp.py

Copy the code
import numpy as np
from snownlp import SnowNLP
import matplotlib.pyplot as plt

# Score each comment's sentiment with SnowNLP and plot the distribution of
# scores as a histogram (0.01-wide bins over [0, 1)).
# `with` guarantees the file is closed; `lines` avoids shadowing the
# builtin `list` as the original did.
with open('content.txt', 'r', encoding='UTF-8') as f:
    lines = f.readlines()

sentimentslist = []
for comment in lines:
    s = SnowNLP(comment)
    # s.sentiments is the positive-sentiment probability in [0, 1].
    print(s.sentiments)
    sentimentslist.append(s.sentiments)

plt.hist(sentimentslist, bins=np.arange(0, 1, 0.01), facecolor='g')
plt.xlabel('Sentiments Probability')
plt.ylabel('Quantity')
plt.title('Analysis of Sentiments')
plt.show()
Copy the code
Copy the code
Effect diagram (frequency of each sentiment score segment)

SnowNLP sentiment analysis is implemented with a sentiment dictionary. It simply classifies text into two categories, positive and negative, and the return value is the sentiment probability — a score in the interval [0, 1]. The closer the score is to 1, the more positive the sentiment; the closer to 0, the more negative.