Pandas+Seaborn+Plotly: Explore the Apple AppStore

Public account: You and the cabin by: Peter Editor: Peter

Hello, I’m Peter

Today I am sharing a new article based on a Kaggle notebook: a visual exploration of the Apple App Store data using Seaborn and Plotly. This is a data analysis case built entirely on statistics and visualization.

The original notebook uses the seaborn library; the editor has added Plotly implementations for many of the charts. Original article address: www.kaggle.com/adityapatil…

Import libraries

import pandas as pd
import numpy as np

# visualization
from matplotlib import pyplot as plt
import seaborn as sns

import plotly_express as px
import plotly.graph_objects as go
Copy the code

Basic data Information

Read and view basic information:

 # 1. Overall size
# `shape` reports the frame as (rows, columns)
data.shape
(7197, 16)

# 2. Missing values
data.isnull().sum()

id                  0
track_name          0
size_bytes          0
currency            0
price               0
rating_count_tot    0
rating_count_ver    0
user_rating         0
user_rating_ver     0
ver                 0
cont_rating         0
prime_genre         0
sup_devices.num     0
ipadSc_urls.num     0
lang.num            0
vpp_lic             0
dtype: int64
  
# 3, Field type
# `dtypes` lists the storage type of every column
data.dtypes

id                    int64
track_name           object
size_bytes            int64
currency             object
price               float64
rating_count_tot      int64
rating_count_ver      int64
user_rating         float64
user_rating_ver     float64
ver                  object
cont_rating          object
prime_genre          object
sup_devices.num       int64
ipadSc_urls.num       int64
lang.num              int64
vpp_lic               int64
dtype: object
Copy the code

Typically, descriptive statistics of the data are also viewed (for numeric fields) :

APP information statistics

Number of free apps

# Count the rows whose price is exactly zero
(data.price == 0).sum()

4056
Copy the code

Number of apps that cost more than 50

Prices over 50 are super expensive apps.

# Count the rows whose price is 50 or more
(data.price >= 50).sum()

7
Copy the code

The proportion of prices over 50

sum((data.price > 50) / len(data.price) * 100)

0.09726274836737528
Copy the code

sum(data.price >= 50) / len(data) * 100

0.09726274836737529
Copy the code

Data from the group

APP information that costs more than 50

# Apps priced above 50, reduced to the descriptive columns
expensive_mask = data.price > 50
outlier = data.loc[expensive_mask, ['track_name', 'price', 'prime_genre', 'user_rating']]
outlier
Copy the code

Free APP

Select data from free apps

APP in the normal range

Take a few

# Paid apps in the normal range: price strictly between 0 and 50
paidapps = data[(data["price"] > 0) & (data["price"] < 50)]

# Maximum and minimum of normal price range
print("max_price:", paidapps["price"].max())
print("min_price:", paidapps["price"].min())

max_price: 49.99
min_price: 0.99
Copy the code

Price distribution

plt.style.use("fivethirtyeight")
plt.figure(figsize=(12, 10))

# 1. Draw the histogram
# First panel of a grid with two rows and one column
plt.subplot(2, 1, 1)  # position
plt.hist(paidapps.price, log=True)  # Draw the histogram with a log-scaled count axis
# title and label value
plt.title("Price distribution of apps (Log scale)")
plt.ylabel("Frequency Log scale")
plt.xlabel("Price Distributions in ($) ")

# 2. Plot stripplot
# Second panel of the grid with two rows and one column
plt.subplot(2, 1, 2)
plt.title("Visual Price distribution")
# NOTE(review): newer seaborn releases may expect x="price" for a horizontal
# strip - confirm against the installed version.
sns.stripplot(data=paidapps,  # Overall data
              y="price",  # Field to be drawn
              jitter=True,  # Spread the points out when many of them overlap
              orient="h",  # Horizontal display: h - horizontal, v - vertical
              size=6
             )
plt.show()
Copy the code

Conclusion 1

As prices rise, the number of paid apps drops exponentially
Few apps cost more than $30, so try to keep prices below $30

The effect of category on the price distribution

# List the column labels of the frame
data.columns  # data field
Copy the code

Index(['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'],
      dtype='object')
Copy the code

Type and Number

# Number of apps per primary genre
data.prime_genre.value_counts()
Copy the code

Games                3862
Entertainment         535
Education             453
Photo & Video         349
Utilities             248
Health & Fitness      180
Productivity          178
Social Networking     167
Lifestyle             144
Music                 138
Shopping              122
Sports                114
Book                  112
Finance               104
Travel                 81
News                   75
Weather                72
Reference              64
Food & Drink           63
Business               57
Navigation             46
Medical                23
Catalogs               10
Name: prime_genre, dtype: int64
Copy the code

Display the first five categories

# Axis limits later passed to plt.xlim for every genre panel
yrange = [0, 25]
fsize = 15  # font size used for the panel titles
plt.figure(figsize=(12, 10))

Draw 5 subgraphs respectively

# Figure 1
plt.subplot(5, 1, 1)
plt.xlim(yrange)
# Pick out the first type of data
games = paidapps[paidapps["prime_genre"] == "Games"]
sns.stripplot(data=games,
              y="price",
              jitter=True,
              orient="h",
              size=6,
              color="#eb5e66"
             )
plt.title("Games", fontsize=fsize)
plt.xlabel("")

# Figure 2
plt.subplot(5, 1, 2)
plt.xlim(yrange)
# Pick out the second type of data
ent = paidapps[paidapps["prime_genre"] == "Entertainment"]
sns.stripplot(data=ent,
              y="price",
              jitter=True,
              orient="h",
              size=6,
              color="#ff8300"
             )
plt.title("Entertainment", fontsize=fsize)
plt.xlabel("")

# Figure 3
plt.subplot(5, 1, 3)
plt.xlim(yrange)
edu = paidapps[paidapps.prime_genre == 'Education']
sns.stripplot(data=edu,
              y='price',
              jitter=True,
              orient='h',
              size=6,
              color='#20B2AA')
plt.title('Education', fontsize=fsize)
plt.xlabel(' ')

# Figure 4
plt.subplot(5, 1, 4)
plt.xlim(yrange)
pv = paidapps[paidapps.prime_genre == 'Photo & Video']
sns.stripplot(data=pv,
              y='price',
              jitter=True,
              orient='h',
              size=6,
              color='#b84efd')
plt.title('Photo & Video', fontsize=fsize)
plt.xlabel(' ')

# Figure 5 (personal addition)
plt.subplot(5, 1, 5)
plt.xlim(yrange)
ut = paidapps[paidapps.prime_genre == 'Utilities']
sns.stripplot(data=ut,
              y='price',
              jitter=True,
              orient='h',
              size=6,
              color='#084cfd')
plt.title('Utilities', fontsize=fsize)
plt.xlabel(' ')
Copy the code

Conclusion 2

Games apps are relatively expensive and widely distributed, with prices up to $25
Entertainment apps are relatively cheap

Paid apps Vs Free apps

A comparison between paid apps and free apps

Types of app

# Types of apps: number of rows per primary genre
categories = data.prime_genre.value_counts()
categories
Copy the code

Games                3862
Entertainment         535
Education             453
Photo & Video         349
Utilities             248
Health & Fitness      180
Productivity          178
Social Networking     167
Lifestyle             144
Music                 138
Shopping              122
Sports                114
Book                  112
Finance               104
Travel                 81
News                   75
Weather                72
Reference              64
Food & Drink           63
Business               57
Navigation             46
Medical                23
Catalogs               10
Name: prime_genre, dtype: int64
Copy the code

# Number of entries in `categories`
len(categories)

23
Copy the code

Select the first four

Select the first four and mark all Other apps as Other

# Index labels of the first four entries of `categories`
s = categories.index[:4]
s

Index(['Games', 'Entertainment', 'Education', 'Photo & Video'], dtype='object')
Copy the code

def categ(x):
    """Return *x* unchanged when it is a member of the module-level ``s``.

    Any value that is not in ``s`` is mapped to the string ``"Others"``.
    """
    return x if x in s else "Others"
    
# Map every primary genre through categ to build the coarser genre column
data["broad_genre"] = data["prime_genre"].map(categ)
data.head()
Copy the code

Count the number of categories under free and paid apps

# free: broad-genre counts restricted to rows with price == 0
data.loc[data.price == 0, "broad_genre"].value_counts()
Copy the code

Games            2257
Others           1166
Entertainment     334
Photo & Video     167
Education         132
Name: broad_genre, dtype: int64
Copy the code

Combine the two data:

Statistical comparison

Highlight maximum value (personal increment)

Conclusion 3

From the highlighted results above, we find that:

Games-related apps are the most popular, no matter paid or free
In terms of the proportion of paid apps, Education has the largest share
In terms of free, Entertainment is the largest category

Paid versus free

Generate the data

Compare the paid and free percentages in groups

# The `free_per` column of `dist` as a plain Python list
list_free = dist.free_per.tolist()
list_free
Copy the code

[29.13907284768212, 62.42990654205608, 58.44122216468152, 58.35835835835835, 47.85100286532951]
Copy the code

# Freeze the list of free percentages into a tuple
tuple_free = (*list_free,)
Copy the code

# Paid percentages, converted to a list and then to a tuple in the same way
paid_percentages = dist["paid_per"].tolist()
tuple_paidapps = tuple(paid_percentages)
Copy the code

A histogram

plt.figure(figsize=(12, 8))
N = 5
ind = np.arange(N)  # one x position per bar
width = 0.56  # Width of each bar

# Stacked bars: the paid share is drawn on top of the free share
p1 = plt.bar(ind, tuple_free, width, color="#45cea2")
p2 = plt.bar(ind, tuple_paidapps, width, bottom=tuple_free, color="#fdd400")

plt.xticks(ind, tuple(dist.index.tolist()))
plt.legend((p1[0], p2[0]), ("free", "paid"))
plt.show()
Copy the code

The pie chart

# Chart the pie: keep the two percentage columns and give them display names
pies = dist[['free_per', 'paid_per']]
pies.columns = ['free %', 'paid %']
pies
Copy the code

plt.figure(figsize=(15, 8))
# Transpose so that each column of the transposed frame gets its own pie
pies.T.plot.pie(subplots=True,  # display subgraph
                 figsize=(20, 4),  # size
                 colors=['#45cea2', '#fad470']  # color
                )
plt.show()
Copy the code

Conclusion 4

In educational apps, the proportion of paid apps is very high
In contrast, the proportion of free apps in the Entertainment category is very high

Are paid apps really good enough?

Price classification

0-free >0 paid

# Label each app "Paid" when its price is above zero, otherwise "Free"
data["category"] = data["price"].map(lambda price: "Free" if not price > 0 else "Paid")
data.head()
Copy the code

The violin figure

plt.figure(figsize=(15, 8))
plt.style.use("fast")
plt.ylim([0, 5])

plt.title("Distribution of User ratings")

sns.violinplot(data=data,  # Data + the two axes
               y="user_rating",
               x="broad_genre",
               hue="category",  # grouping
               vertical=True,  # Vertical display
               kde=False,
               split=True,  # Show both groups within the same violin
               linewidth=2,
               scale="count",
               palette=['#fdd470', '#45cea2']
              )

plt.xlabel("")
plt.ylabel("Rating(0-5)")

plt.show()
Copy the code

Conclusion 5 (Individual increase)

In the Education APP, the proportion of paid is significantly higher than that of free; The second is Photo & Video
For Entertainment apps, free accounts for a higher proportion than paid. And the overall proportion of the distribution is wider

Notice that the split argument has been changed in the following code:

plt.figure(figsize=(15, 8))
plt.style.use("fast")
plt.ylim([0, 5])

plt.title("Distribution of User ratings")

sns.violinplot(data=data,
               y="user_rating",
               x="broad_genre",
               hue="category",
               vertical=True,
               kde=False,
               split=False,  # pay attention to this parameter
               linewidth=2,
               scale="count",
               palette=['#fdd470', '#45cea2']
              )

plt.xlabel("")
plt.ylabel("Rating(0-5)")

plt.show()
Copy the code

Relationship between size and price

Exploration: does a higher price go with a larger app size?

sns.color_palette("husl", 8)
sns.set_style("whitegrid")

flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]

# Convert the size from bytes to megabytes (1 MB = 1048576 bytes)
data["MB"] = data.size_bytes / 1048576
# Select interval data: paid apps priced below 30
paidapps_regression = data[(data.price < 30) & (data.price > 0)]

sns.lmplot(data=paidapps_regression,
           x="MB",
           y="price",
           height=4,  # `height` is the current name of the former `size` argument
           aspect=2,
           col_wrap=2,
           hue="broad_genre",
           col="broad_genre",
           fit_reg=False,
           palette=sns.color_palette("husl", 5)
          )

plt.show()
Copy the code

Implementation using Plotly (Personal increment)

Add the plotly implementation method

# Scatter of size (MB) against price, coloured and faceted by broad genre,
# with the facet columns wrapped after two panels
px.scatter(paidapps_regression,
           x="MB",
           y="price",
           color="broad_genre",
           facet_col="broad_genre",
           facet_col_wrap=2
          )
Copy the code

APP classification: whether it can be divided according to paid and free

5 types of proportion

# 1. Set the color and size
BlueOrangeWapang = ['#fc910d', '#fcb13e', '#239cd3', '#1674b1', '#ed6d50']
plt.figure(figsize=(10, 10))

# 2. Data: broad-genre labels and their counts, both sorted by label
label_names = data.broad_genre.value_counts().sort_index().index
size = data.broad_genre.value_counts().sort_index().tolist()

# 3. White circle drawn over the centre of the pie
my_circle = plt.Circle((0, 0), 0.5, color='white')
# 4. Pie chart
plt.pie(size, labels=label_names, colors=BlueOrangeWapang)
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.show()
Copy the code

How to use Plotly:

# How Plotly implements the same chart

fig = px.pie(values=size,
             names=label_names,
             labels=label_names,
             hole=0.5)

# Put the text inside each slice and show both the percentage and the label
fig.update_traces(textposition='inside', textinfo='percent+label')

fig.show()
Copy the code

5 types + pay or not

# Free counts go on the even index positions 0, 2, 4, 6, 8
f = pd.DataFrame(index=np.arange(0, 10, 2),
                 data=dist.free.values,  # free
                 columns=['num'])
# Paid counts go on the odd index positions 1, 3, 5, 7, 9
p = pd.DataFrame(index=np.arange(1, 11, 2),
                 data=dist.paid.values,  # paid
                 columns=['num'])

# Sorting by index interleaves the two frames: free, paid, free, paid, ...
final = pd.concat([f, p], names=['labels']).sort_index()
final
Copy the code

plt.figure(figsize=(20, 20))

group_names = data.broad_genre.value_counts().sort_index().index
group_size = data.broad_genre.value_counts().sort_index().tolist()
h = ['Free', 'Paid']

# Repeat the Free/Paid labels and colours once per group (five times)
subgroup_names = 5 * h
sub = ['#45cea2', '#fdd470']
subcolors = 5 * sub
subgroup_size = final.num.tolist()

# outer ring: one wedge per broad genre
fig, ax = plt.subplots()
ax.axis('equal')
mypie, _ = ax.pie(group_size, radius=2.5, labels=group_names, colors=BlueOrangeWapang)
plt.setp(mypie, width=1.2, edgecolor='white')

# inner ring: the free/paid values from `final`
mypie2, _ = ax.pie(subgroup_size, radius=1.6, labels=subgroup_names, labeldistance=0.7, colors=subcolors)
plt.setp(mypie2, width=0.8, edgecolor='white')
plt.margins(0, 0)

plt.show()
Copy the code

Plotly based implementation:

# How Plotly implements the same chart
# Two-level hierarchy (broad genre, then category), with sectors sized by the MB column
fig = px.sunburst(
  data,
  path=["broad_genre", "category"],
  values="MB"
)

fig.show()
Copy the code