In [7]:
# %load hello.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'].insert(0, 'SimHei')
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'
In [ ]:
项目1:电商订单分析
In [ ]:
#1.加载数据
In [9]:
df = pd.read_excel('wenjian/某电商平台2021年订单数据.xlsx',index_col='id')
df.head(3)
Out[9]:
orderID userID goodsID orderAmount payment chanelID platfromType orderTime payTime chargeback
id
1 sys-2020-254118088 customer-157213 JDG-00006491 495.67 495.67 渠道-39 APP 2020-02-14 12:20:36 2021-02-28 13:38:41 False
2 sys-2020-263312190 customer-191121 JDG-00058390 634.04 634.04 渠道-76 微 信 2020-08-14 09:40:34 2021-01-01 14:47:14 True
3 sys-2020-188208169 customer-211918 JDG-00008241 953.73 939.28 渠道-53 薇·信 2020-11-02 20:17:25 2021-01-19 20:06:35 False
In [11]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 104557 entries, 1 to 104557
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   orderID       104557 non-null  object        
 1   userID        104557 non-null  object        
 2   goodsID       104557 non-null  object        
 3   orderAmount   104557 non-null  float64       
 4   payment       104557 non-null  float64       
 5   chanelID      104549 non-null  object        
 6   platfromType  104557 non-null  object        
 7   orderTime     104557 non-null  datetime64[ns]
 8   payTime       104557 non-null  datetime64[ns]
 9   chargeback    104557 non-null  bool          
dtypes: bool(1), datetime64[ns](2), float64(2), object(5)
memory usage: 8.1+ MB
In [ ]:
#2.数据清洗
In [10]:
df.rename(columns={'chanelID':'channelID','platfromType':'platformType','chargeback':'isback'},inplace=True)
In [ ]:
#清洗非2021年的订单
In [12]:
index = df[df.orderTime.dt.year != 2021].index
df.drop(index=index,inplace=True)
df.shape
Out[12]:
(104296, 10)
In [ ]:
#清洗跟业务规则不吻合的订单
In [13]:
index = df[(df.orderAmount < 0) | (df.payment < 0) | (df.payTime < df.orderTime)].index
df.drop(index=index,inplace=True)
df.shape
Out[13]:
(104281, 10)
In [15]:
index = df[(df.payTime - df.orderTime).dt.total_seconds() > 1800].index
df.drop(index=index,inplace=True)
df.shape
Out[15]:
(103344, 10)
In [16]:
# 计算平均折扣
a,b = df.query('payment <= orderAmount')[['orderAmount','payment']].sum()
mean_dis_rate = b / a
mean_dis_rate
Out[16]:
0.9540948923974325
In [ ]:
# 将支付金额大于订单金额的订单金额修改为订单金额*平均折扣
In [17]:
df['payment'] = np.where(df.payment <= df.orderAmount,df.payment,(df.orderAmount * mean_dis_rate).round(2))
In [ ]:
df
In [10]:
# 处理空值
channel_mode = df.channelID.mode()[0]   #众数
df['channelID'] = df.channelID.fillna(channel_mode)   #用众数填充空值
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 103344 entries, 6 to 104301
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   orderID       103344 non-null  object        
 1   userID        103344 non-null  object        
 2   goodsID       103344 non-null  object        
 3   orderAmount   103344 non-null  float64       
 4   payment       103344 non-null  float64       
 5   channelID     103344 non-null  object        
 6   platformType  103344 non-null  object        
 7   orderTime     103344 non-null  datetime64[ns]
 8   payTime       103344 non-null  datetime64[ns]
 9   isback        103344 non-null  bool          
dtypes: bool(1), datetime64[ns](2), float64(2), object(5)
memory usage: 8.0+ MB
In [ ]:
#处理异常值
In [18]:
df['platformType'] = df.platformType.replace(
    r'[.\s]','',regex=True
).str.title().replace(
    '·','',regex=True
).replace(
    '薇信|Vx','微信',regex=True
).replace(
    '网站|网页','Web',regex=True
)
In [19]:
df.platformType.unique()
Out[19]:
array(['微信', 'App', '支付宝', 'Web'], dtype=object)
In [20]:
#数据预处理
def num_to_weekday(num):
    items = ['一','二','三','四','五','六','日']
    return f'星期{items[num]}'
df['month'] = df.orderTime.dt.strftime('%m月')
df['weekday'] = df.orderTime.dt.weekday.map(num_to_weekday)
df['seg'] = df.orderTime.dt.floor('30min').dt.strftime('%H:%M')
df.head(3)
Out[20]:
orderID userID goodsID orderAmount payment channelID platformType orderTime payTime isback month weekday seg
id
6 sys-2021-279103297 customer-146548 JDG-00056475 667.33 667.33 渠道-76 微信 2021-01-01 00:12:23 2021-01-01 00:13:37 False 01月 星期五 00:00
7 sys-2021-316686066 customer-104210 JDG-00070902 1834.13 1776.80 渠道-39 微信 2021-01-01 00:23:06 2021-01-01 00:23:32 False 01月 星期五 00:00
8 sys-2021-306447069 customer-104863 JDG-00049963 837.32 818.33 渠道-00 微信 2021-01-01 01:05:50 2021-01-01 01:06:17 False 01月 星期五 01:00

任务一:统计GMV/净销售额/客单价/AOV/拒退率/复购率指标¶

In [21]:
GMV= df.orderAmount.sum()
sales = df.query('not isback').payment.sum()
ARPPU = sales / df.userID.nunique()
AOV = sales / df.orderID.nunique()
back_rate = df.query('isback').orderID.nunique() / df.orderID.nunique()
temp = df.query('not isback').groupby('userID').orderID.nunique()
fu = temp[temp > 1].count()  / temp.count()
In [22]:
print(f"""
GMV:¥{GMV:,.2f}元
净销售额:¥{sales:,.2f}元
客单价:¥{ARPPU:,.2f}元
AOV:¥{AOV:,.2f}元
拒退率:{back_rate:.2%}
复购率:{fu:.2%}
""")
GMV:¥127,098,751.96元
净销售额:¥105,127,546.94元
客单价:¥1,336.65元
AOV:¥1,017.52元
拒退率:13.18%
复购率:22.60%

In [ ]:
# 任务二:统计出月度销售额 月环比 并绘制出折线图
In [23]:
temp1 = df.query('not isback').groupby('month').payment.sum().to_frame()
temp1['月环比'] = temp1.payment.pct_change()
temp1.style.format(
    formatter={
        'payment':'{:,.2f}',
        '月环比':'{:.2%}'
    },
    na_rep='-----'
)
Out[23]:
  payment 月环比
month    
01月 6,769,318.22 -----
02月 5,364,705.92 -20.75%
03月 6,590,363.88 22.85%
04月 7,067,771.80 7.24%
05月 9,701,036.23 37.26%
06月 10,377,133.88 6.97%
07月 9,799,773.23 -5.56%
08月 10,039,151.28 2.44%
09月 9,393,501.29 -6.43%
10月 9,021,093.71 -3.96%
11月 10,611,667.01 17.63%
12月 10,392,030.49 -2.07%
In [120]:
temp1.plot(
    figsize=(8,4),
    kind='line',
    y='payment'
)
plt.ylim(0,14000000)
plt.xticks(np.arange(12),temp1.index)
plt.show()
No description has been provided for this image
In [123]:
#%pip install pyecharts
In [124]:
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB
In [128]:
from pyecharts.charts import Bar,Line
from pyecharts import options as opts
from pyecharts.globals import ThemeType
In [156]:
x1 = temp1.index.values.tolist()
y1 =  (temp1.payment / 1e6).round(2).values.tolist()
In [157]:
line = Line(init_opts=opts.InitOpts(width='720px', height='360px'))
line.set_global_opts(
    title_opts=opts.TitleOpts(
        title='2021年月度净销售额',
        title_link='http://qfedu.com',
        pos_left='center',
        pos_top='5%',
        title_textstyle_opts=opts.TextStyleOpts(
            font_family='苹方',
            font_size=18,
            color='red',
        )
    ),
    toolbox_opts=opts.ToolboxOpts(
        is_show=True,
        orient='vertical',
        pos_top='center',
    )
)
line.add_xaxis(x1)
line.add_yaxis(
    '',
    y1,
    markline_opts=opts.MarkLineOpts(
        data=[opts.MarkLineItem(type_='average')]
    ),
)
line.load_javascript()
line.render_notebook()
Out[157]:
In [ ]:
#任务三,统计出周一到周日每天的下单量
In [164]:
temp2 = df.query('not isback').groupby('weekday').orderID.count()
temp2
Out[164]:
weekday
星期一    11425
星期三    13233
星期二    12466
星期五    14308
星期六    12515
星期四    14448
星期日    11332
Name: orderID, dtype: int64
In [165]:
bar = Bar()
bar.add_xaxis(temp2.index.values.tolist())
bar.add_yaxis("销量", temp2.values.tolist())
bar.render_notebook()
Out[165]:
In [166]:
#任务四:统计每天各个时段的下单量
In [169]:
temp3 = df.groupby('seg').orderID.nunique()
temp3
Out[169]:
seg
00:00    1064
00:30     632
01:00     382
01:30     267
02:00     139
02:30     109
03:00      72
03:30      50
04:00      33
04:30      18
05:00      20
05:30      43
06:00      68
06:30     101
07:00     182
07:30     268
08:00     418
08:30     594
09:00     929
09:30    1133
10:00    1663
10:30    1860
11:00    2352
11:30    3044
12:00    4185
12:30    5402
13:00    6654
13:30    6492
14:00    5750
14:30    4779
15:00    3669
15:30    2847
16:00    2374
16:30    2135
17:00    2085
17:30    2140
18:00    2395
18:30    2876
19:00    3773
19:30    4517
20:00    4822
20:30    4632
21:00    4019
21:30    3475
22:00    2883
22:30    2488
23:00    1945
23:30    1566
Name: orderID, dtype: int64
In [170]:
bar = Bar()
bar.add_xaxis(temp3.index.values.tolist())
bar.add_yaxis('', temp3.values.tolist(), label_opts=opts.LabelOpts(is_show=False))
bar.load_javascript()
Out[170]:
In [171]:
bar.render_notebook()
Out[171]:
In [ ]:
# 统计月度复购率(以自然月为时间窗口统计复购用户比例)
In [21]:
temp4 = pd.pivot_table(
    df.query('not isback'),
    index='userID',
    columns='month',
    values='orderID',
    aggfunc='nunique'
)
result = (temp4[temp4 > 1 ].count() / temp4.count()).to_frame()
result.rename(columns={0:'复购率'}).style.format(formatter='{:.2%}')
Out[21]:
  复购率
month  
01月 1.47%
02月 0.92%
03月 1.58%
04月 1.74%
05月 2.64%
06月 2.64%
07月 2.08%
08月 2.71%
09月 2.19%
10月 2.49%
11月 2.69%
12月 2.62%
In [ ]:
# 构建RFM模型做用户价值分群
In [47]:
temp5 = pd.pivot_table(
    df.query('not isback'),
    index='userID',
    values=['orderTime','orderID','payment'],
    aggfunc={
        'orderTime':'max',
        'orderID':'nunique',
        'payment':'sum'
    }
)
In [46]:
from datetime import datetime
In [48]:
ref_date = datetime(2021,12,31)
temp5['R'] = (ref_date - temp5.orderTime).dt.days
temp5.rename(columns={'orderID':'F','payment':'M'},inplace=True)
temp5.drop(columns='orderTime',inplace=True)
In [49]:
def r2level(value):
    if value <= 14: return 5
    elif value <= 30: return 4
    elif value <= 90: return 3
    elif value <= 180: return 2
    return 1


temp5['R'] = temp5.R.map(r2level)
temp5
Out[49]:
F M R
userID
customer-100000 1 2109.52 3
customer-100003 1 870.17 1
customer-100006 1 523.02 3
customer-100007 1 2244.56 1
customer-100008 1 5018.60 3
... ... ... ...
customer-299980 1 644.00 3
customer-299983 1 949.91 5
customer-299989 2 2026.52 3
customer-299992 1 615.45 1
customer-299995 1 467.64 1

70604 rows × 3 columns

In [50]:
temp5['F'] = temp5.F.map(lambda x: 5 if x >= 5 else x)
temp5
Out[50]:
F M R
userID
customer-100000 1 2109.52 3
customer-100003 1 870.17 1
customer-100006 1 523.02 3
customer-100007 1 2244.56 1
customer-100008 1 5018.60 3
... ... ... ...
customer-299980 1 644.00 3
customer-299983 1 949.91 5
customer-299989 2 2026.52 3
customer-299992 1 615.45 1
customer-299995 1 467.64 1

70604 rows × 3 columns

In [51]:
def m2level(value):
    if value < 300: return 1
    elif value < 800: return 2
    elif value < 1200: return 3
    elif value < 2500: return 4
    return 5


temp5['M'] = temp5.M.map(m2level)
temp5
Out[51]:
F M R
userID
customer-100000 1 4 3
customer-100003 1 3 1
customer-100006 1 2 3
customer-100007 1 4 1
customer-100008 1 5 3
... ... ... ...
customer-299980 1 2 3
customer-299983 1 3 5
customer-299989 2 4 3
customer-299992 1 2 1
customer-299995 1 2 1

70604 rows × 3 columns

In [52]:
temp6 = temp5 > temp5.mean()
In [ ]:
temp5[temp6] = '高'
In [54]:
temp5[~temp6] = '低'
In [55]:
temp5['tag'] =  temp5.F + temp5.M + temp5.R
In [56]:
def make_tag(tag):
    if tag == "高高高": return "重要价值客户"
    elif tag == "高低高" : return "重要发展客户"
    elif tag == "低高高" : return "重要保持客户"
    elif tag == "低低高" : return "重要挽留客户"
    elif tag == "高高低" : return "一般价值客户"
    elif tag == "高低低" : return "一般发展客户"
    elif tag == "低高低" : return "一般保持客户"
    return "一般挽留客户"
In [57]:
temp5['usertype'] = temp5.tag.map(make_tag)
In [58]:
temp5
Out[58]:
F M R tag usertype
userID
customer-100000 低 高 高 低高高 重要保持客户
customer-100003 低 低 低 低低低 一般挽留客户
customer-100006 低 低 高 低低高 重要挽留客户
customer-100007 低 高 低 低高低 一般保持客户
customer-100008 低 高 高 低高高 重要保持客户
... ... ... ... ... ...
customer-299980 低 低 高 低低高 重要挽留客户
customer-299983 低 低 高 低低高 重要挽留客户
customer-299989 高 高 高 高高高 重要价值客户
customer-299992 低 低 低 低低低 一般挽留客户
customer-299995 低 低 低 低低低 一般挽留客户

70604 rows × 5 columns

In [60]:
temp7 = temp5.reset_index().groupby('usertype').userID.nunique()
In [61]:
temp7.plot(
    figsize=(5, 5),
    kind='pie',
    ylabel='',
    autopct='%.2f%%',
    pctdistance=0.8,
    wedgeprops={'width':0.35},
)

plt.show()
No description has been provided for this image