In [7]:
# %load hello.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'].insert(0, 'SimHei')
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'
In [ ]:
项目1:电商订单分析
In [ ]:
#1.加载数据
In [9]:
df = pd.read_excel('wenjian/某电商平台2021年订单数据.xlsx',index_col='id')
df.head(3)
Out[9]:
orderID | userID | goodsID | orderAmount | payment | chanelID | platfromType | orderTime | payTime | chargeback | |
---|---|---|---|---|---|---|---|---|---|---|
id | ||||||||||
1 | sys-2020-254118088 | customer-157213 | JDG-00006491 | 495.67 | 495.67 | 渠道-39 | APP | 2020-02-14 12:20:36 | 2021-02-28 13:38:41 | False |
2 | sys-2020-263312190 | customer-191121 | JDG-00058390 | 634.04 | 634.04 | 渠道-76 | 微 信 | 2020-08-14 09:40:34 | 2021-01-01 14:47:14 | True |
3 | sys-2020-188208169 | customer-211918 | JDG-00008241 | 953.73 | 939.28 | 渠道-53 | 薇·信 | 2020-11-02 20:17:25 | 2021-01-19 20:06:35 | False |
In [11]:
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 104557 entries, 1 to 104557 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 orderID 104557 non-null object 1 userID 104557 non-null object 2 goodsID 104557 non-null object 3 orderAmount 104557 non-null float64 4 payment 104557 non-null float64 5 chanelID 104549 non-null object 6 platfromType 104557 non-null object 7 orderTime 104557 non-null datetime64[ns] 8 payTime 104557 non-null datetime64[ns] 9 chargeback 104557 non-null bool dtypes: bool(1), datetime64[ns](2), float64(2), object(5) memory usage: 8.1+ MB
In [ ]:
#2.数据清洗
In [10]:
df.rename(columns={'chanelID':'channelID','platfromType':'platformType','chargeback':'isback'},inplace=True)
In [ ]:
#清洗非2021年的订单
In [12]:
index = df[df.orderTime.dt.year != 2021].index
df.drop(index=index,inplace=True)
df.shape
Out[12]:
(104296, 10)
In [ ]:
#清洗跟业务规则不吻合的订单
In [13]:
index = df[(df.orderAmount < 0) | (df.payment < 0) | (df.payTime < df.orderTime)].index
df.drop(index=index,inplace=True)
df.shape
Out[13]:
(104281, 10)
In [15]:
index = df[(df.payTime - df.orderTime).dt.total_seconds() > 1800].index
df.drop(index=index,inplace=True)
df.shape
Out[15]:
(103344, 10)
In [16]:
# 计算平均折扣
a,b = df.query('payment <= orderAmount')[['orderAmount','payment']].sum()
mean_dis_rate = b / a
mean_dis_rate
Out[16]:
0.9540948923974325
In [ ]:
# 将支付金额大于订单金额的订单金额修改为订单金额*平均折扣
In [17]:
df['payment'] = np.where(df.payment <= df.orderAmount,df.payment,(df.orderAmount * mean_dis_rate).round(2))
In [ ]:
df
In [10]:
# 处理空值
channel_mode = df.channelID.mode()[0] #众数
df['channelID'] = df.channelID.fillna(channel_mode) #用众数填充空值
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 103344 entries, 6 to 104301 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 orderID 103344 non-null object 1 userID 103344 non-null object 2 goodsID 103344 non-null object 3 orderAmount 103344 non-null float64 4 payment 103344 non-null float64 5 channelID 103344 non-null object 6 platformType 103344 non-null object 7 orderTime 103344 non-null datetime64[ns] 8 payTime 103344 non-null datetime64[ns] 9 isback 103344 non-null bool dtypes: bool(1), datetime64[ns](2), float64(2), object(5) memory usage: 8.0+ MB
In [ ]:
#处理异常值
In [18]:
df['platformType'] = df.platformType.replace(
r'[.\s]','',regex=True
).str.title().replace(
'·','',regex=True
).replace(
'薇信|Vx','微信',regex=True
).replace(
'网站|网页','Web',regex=True
)
In [19]:
df.platformType.unique()
Out[19]:
array(['微信', 'App', '支付宝', 'Web'], dtype=object)
In [20]:
#数据预处理
def num_to_weekday(num):
items = ['一','二','三','四','五','六','日']
return f'星期{items[num]}'
df['month'] = df.orderTime.dt.strftime('%m月')
df['weekday'] = df.orderTime.dt.weekday.map(num_to_weekday)
df['seg'] = df.orderTime.dt.floor('30min').dt.strftime('%H:%M')
df.head(3)
Out[20]:
orderID | userID | goodsID | orderAmount | payment | channelID | platformType | orderTime | payTime | isback | month | weekday | seg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | |||||||||||||
6 | sys-2021-279103297 | customer-146548 | JDG-00056475 | 667.33 | 667.33 | 渠道-76 | 微信 | 2021-01-01 00:12:23 | 2021-01-01 00:13:37 | False | 01月 | 星期五 | 00:00 |
7 | sys-2021-316686066 | customer-104210 | JDG-00070902 | 1834.13 | 1776.80 | 渠道-39 | 微信 | 2021-01-01 00:23:06 | 2021-01-01 00:23:32 | False | 01月 | 星期五 | 00:00 |
8 | sys-2021-306447069 | customer-104863 | JDG-00049963 | 837.32 | 818.33 | 渠道-00 | 微信 | 2021-01-01 01:05:50 | 2021-01-01 01:06:17 | False | 01月 | 星期五 | 01:00 |
任务一:统计GMV/净销售额/客单价/AOV/拒退率/复购率指标¶
In [21]:
GMV= df.orderAmount.sum()
sales = df.query('not isback').payment.sum()
ARPPU = sales / df.userID.nunique()
AOV = sales / df.orderID.nunique()
back_rate = df.query('isback').orderID.nunique() / df.orderID.nunique()
temp = df.query('not isback').groupby('userID').orderID.nunique()
fu = temp[temp > 1].count() / temp.count()
In [22]:
print(f"""
GMV:¥{GMV:,.2f}元
净销售额:¥{sales:,.2f}元
客单价:¥{ARPPU:,.2f}元
AOV:¥{AOV:,.2f}元
拒退率:{back_rate:.2%}
复购率:{fu:.2%}
""")
GMV:¥127,098,751.96元 净销售额:¥105,127,546.94元 客单价:¥1,336.65元 AOV:¥1,017.52元 拒退率:13.18% 复购率:22.60%
In [ ]:
# 任务二:统计出月度销售额 月环比 并绘制出折线图
In [23]:
temp1 = df.query('not isback').groupby('month').payment.sum().to_frame()
temp1['月环比'] = temp1.payment.pct_change()
temp1.style.format(
formatter={
'payment':'{:,.2f}',
'月环比':'{:.2%}'
},
na_rep='-----'
)
Out[23]:
payment | 月环比 | |
---|---|---|
month | ||
01月 | 6,769,318.22 | ----- |
02月 | 5,364,705.92 | -20.75% |
03月 | 6,590,363.88 | 22.85% |
04月 | 7,067,771.80 | 7.24% |
05月 | 9,701,036.23 | 37.26% |
06月 | 10,377,133.88 | 6.97% |
07月 | 9,799,773.23 | -5.56% |
08月 | 10,039,151.28 | 2.44% |
09月 | 9,393,501.29 | -6.43% |
10月 | 9,021,093.71 | -3.96% |
11月 | 10,611,667.01 | 17.63% |
12月 | 10,392,030.49 | -2.07% |
In [120]:
temp1.plot(
figsize=(8,4),
kind='line',
y='payment'
)
plt.ylim(0,14000000)
plt.xticks(np.arange(12),temp1.index)
plt.show()
In [123]:
#%pip install pyecharts
In [124]:
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB
In [128]:
from pyecharts.charts import Bar,Line
from pyecharts import options as opts
from pyecharts.globals import ThemeType
In [156]:
x1 = temp1.index.values.tolist()
y1 = (temp1.payment / 1e6).round(2).values.tolist()
In [157]:
line = Line(init_opts=opts.InitOpts(width='720px', height='360px'))
line.set_global_opts(
title_opts=opts.TitleOpts(
title='2021年月度净销售额',
title_link='http://qfedu.com',
pos_left='center',
pos_top='5%',
title_textstyle_opts=opts.TextStyleOpts(
font_family='苹方',
font_size=18,
color='red',
)
),
toolbox_opts=opts.ToolboxOpts(
is_show=True,
orient='vertical',
pos_top='center',
)
)
line.add_xaxis(x1)
line.add_yaxis(
'',
y1,
markline_opts=opts.MarkLineOpts(
data=[opts.MarkLineItem(type_='average')]
),
)
line.load_javascript()
line.render_notebook()
Out[157]:
In [ ]:
#任务三,统计出周一到周日每天的下单量
In [164]:
temp2 = df.query('not isback').groupby('weekday').orderID.count()
temp2
Out[164]:
weekday 星期一 11425 星期三 13233 星期二 12466 星期五 14308 星期六 12515 星期四 14448 星期日 11332 Name: orderID, dtype: int64
In [165]:
bar = Bar()
bar.add_xaxis(temp2.index.values.tolist())
bar.add_yaxis("销量", temp2.values.tolist())
bar.render_notebook()
Out[165]:
In [166]:
#任务四:统计每天各个时段的下单量
In [169]:
temp3 = df.groupby('seg').orderID.nunique()
temp3
Out[169]:
seg 00:00 1064 00:30 632 01:00 382 01:30 267 02:00 139 02:30 109 03:00 72 03:30 50 04:00 33 04:30 18 05:00 20 05:30 43 06:00 68 06:30 101 07:00 182 07:30 268 08:00 418 08:30 594 09:00 929 09:30 1133 10:00 1663 10:30 1860 11:00 2352 11:30 3044 12:00 4185 12:30 5402 13:00 6654 13:30 6492 14:00 5750 14:30 4779 15:00 3669 15:30 2847 16:00 2374 16:30 2135 17:00 2085 17:30 2140 18:00 2395 18:30 2876 19:00 3773 19:30 4517 20:00 4822 20:30 4632 21:00 4019 21:30 3475 22:00 2883 22:30 2488 23:00 1945 23:30 1566 Name: orderID, dtype: int64
In [170]:
bar = Bar()
bar.add_xaxis(temp3.index.values.tolist())
bar.add_yaxis('', temp3.values.tolist(), label_opts=opts.LabelOpts(is_show=False))
bar.load_javascript()
Out[170]:
In [171]:
bar.render_notebook()
Out[171]:
In [ ]:
# 统计月度复购率(以自然月为时间窗口统计复购用户比例)
In [21]:
temp4 = pd.pivot_table(
df.query('not isback'),
index='userID',
columns='month',
values='orderID',
aggfunc='nunique'
)
result = (temp4[temp4 > 1 ].count() / temp4.count()).to_frame()
result.rename(columns={0:'复购率'}).style.format(formatter='{:.2%}')
Out[21]:
复购率 | |
---|---|
month | |
01月 | 1.47% |
02月 | 0.92% |
03月 | 1.58% |
04月 | 1.74% |
05月 | 2.64% |
06月 | 2.64% |
07月 | 2.08% |
08月 | 2.71% |
09月 | 2.19% |
10月 | 2.49% |
11月 | 2.69% |
12月 | 2.62% |
In [ ]:
# 构建RFM模型做用户价值分群
In [47]:
temp5 = pd.pivot_table(
df.query('not isback'),
index='userID',
values=['orderTime','orderID','payment'],
aggfunc={
'orderTime':'max',
'orderID':'nunique',
'payment':'sum'
}
)
In [46]:
from datetime import datetime
In [48]:
ref_date = datetime(2021,12,31)
temp5['R'] = (ref_date - temp5.orderTime).dt.days
temp5.rename(columns={'orderID':'F','payment':'M'},inplace=True)
temp5.drop(columns='orderTime',inplace=True)
In [49]:
def r2level(value):
if value <= 14: return 5
elif value <= 30: return 4
elif value <= 90: return 3
elif value <= 180: return 2
return 1
temp5['R'] = temp5.R.map(r2level)
temp5
Out[49]:
F | M | R | |
---|---|---|---|
userID | |||
customer-100000 | 1 | 2109.52 | 3 |
customer-100003 | 1 | 870.17 | 1 |
customer-100006 | 1 | 523.02 | 3 |
customer-100007 | 1 | 2244.56 | 1 |
customer-100008 | 1 | 5018.60 | 3 |
... | ... | ... | ... |
customer-299980 | 1 | 644.00 | 3 |
customer-299983 | 1 | 949.91 | 5 |
customer-299989 | 2 | 2026.52 | 3 |
customer-299992 | 1 | 615.45 | 1 |
customer-299995 | 1 | 467.64 | 1 |
70604 rows × 3 columns
In [50]:
temp5['F'] = temp5.F.map(lambda x: 5 if x >= 5 else x)
temp5
Out[50]:
F | M | R | |
---|---|---|---|
userID | |||
customer-100000 | 1 | 2109.52 | 3 |
customer-100003 | 1 | 870.17 | 1 |
customer-100006 | 1 | 523.02 | 3 |
customer-100007 | 1 | 2244.56 | 1 |
customer-100008 | 1 | 5018.60 | 3 |
... | ... | ... | ... |
customer-299980 | 1 | 644.00 | 3 |
customer-299983 | 1 | 949.91 | 5 |
customer-299989 | 2 | 2026.52 | 3 |
customer-299992 | 1 | 615.45 | 1 |
customer-299995 | 1 | 467.64 | 1 |
70604 rows × 3 columns
In [51]:
def m2level(value):
if value < 300: return 1
elif value < 800: return 2
elif value < 1200: return 3
elif value < 2500: return 4
return 5
temp5['M'] = temp5.M.map(m2level)
temp5
Out[51]:
F | M | R | |
---|---|---|---|
userID | |||
customer-100000 | 1 | 4 | 3 |
customer-100003 | 1 | 3 | 1 |
customer-100006 | 1 | 2 | 3 |
customer-100007 | 1 | 4 | 1 |
customer-100008 | 1 | 5 | 3 |
... | ... | ... | ... |
customer-299980 | 1 | 2 | 3 |
customer-299983 | 1 | 3 | 5 |
customer-299989 | 2 | 4 | 3 |
customer-299992 | 1 | 2 | 1 |
customer-299995 | 1 | 2 | 1 |
70604 rows × 3 columns
In [52]:
temp6 = temp5 > temp5.mean()
In [ ]:
temp5[temp6] = '高'
In [54]:
temp5[~temp6] = '低'
In [55]:
temp5['tag'] = temp5.F + temp5.M + temp5.R
In [56]:
def make_tag(tag):
if tag == "高高高": return "重要价值客户"
elif tag == "高低高" : return "重要发展客户"
elif tag == "低高高" : return "重要保持客户"
elif tag == "低低高" : return "重要挽留客户"
elif tag == "高高低" : return "一般价值客户"
elif tag == "高低低" : return "一般发展客户"
elif tag == "低高低" : return "一般保持客户"
return "一般挽留客户"
In [57]:
temp5['usertype'] = temp5.tag.map(make_tag)
In [58]:
temp5
Out[58]:
F | M | R | tag | usertype | |
---|---|---|---|---|---|
userID | |||||
customer-100000 | 低 | 高 | 高 | 低高高 | 重要保持客户 |
customer-100003 | 低 | 低 | 低 | 低低低 | 一般挽留客户 |
customer-100006 | 低 | 低 | 高 | 低低高 | 重要挽留客户 |
customer-100007 | 低 | 高 | 低 | 低高低 | 一般保持客户 |
customer-100008 | 低 | 高 | 高 | 低高高 | 重要保持客户 |
... | ... | ... | ... | ... | ... |
customer-299980 | 低 | 低 | 高 | 低低高 | 重要挽留客户 |
customer-299983 | 低 | 低 | 高 | 低低高 | 重要挽留客户 |
customer-299989 | 高 | 高 | 高 | 高高高 | 重要价值客户 |
customer-299992 | 低 | 低 | 低 | 低低低 | 一般挽留客户 |
customer-299995 | 低 | 低 | 低 | 低低低 | 一般挽留客户 |
70604 rows × 5 columns
In [60]:
temp7 = temp5.reset_index().groupby('usertype').userID.nunique()
In [61]:
temp7.plot(
figsize=(5, 5),
kind='pie',
ylabel='',
autopct='%.2f%%',
pctdistance=0.8,
wedgeprops={'width':0.35},
)
plt.show()