# %load hello.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'].insert(0, 'SimHei')
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'

项目1：电商订单分析

#1.加载数据

df = pd.read_excel('wenjian/某电商平台2021年订单数据.xlsx',index_col='id')
df.head(3)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 104557 entries, 1 to 104557
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   orderID       104557 non-null  object        
 1   userID        104557 non-null  object        
 2   goodsID       104557 non-null  object        
 3   orderAmount   104557 non-null  float64       
 4   payment       104557 non-null  float64       
 5   chanelID      104549 non-null  object        
 6   platfromType  104557 non-null  object        
 7   orderTime     104557 non-null  datetime64[ns]
 8   payTime       104557 non-null  datetime64[ns]
 9   chargeback    104557 non-null  bool          
dtypes: bool(1), datetime64[ns](2), float64(2), object(5)
memory usage: 8.1+ MB

#2.数据清洗

df.rename(columns={'chanelID':'channelID','platfromType':'platformType','chargeback':'isback'},inplace=True)

#清洗非2021年的订单

index = df[df.orderTime.dt.year != 2021].index
df.drop(index=index,inplace=True)
df.shape

(104296, 10)

#清洗跟业务规则不吻合的订单

index = df[(df.orderAmount < 0) | (df.payment < 0) | (df.payTime < df.orderTime)].index
df.drop(index=index,inplace=True)
df.shape

(104281, 10)

index = df[(df.payTime - df.orderTime).dt.total_seconds() > 1800].index
df.drop(index=index,inplace=True)
df.shape

(103344, 10)

# 计算平均折扣
a,b = df.query('payment <= orderAmount')[['orderAmount','payment']].sum()
mean_dis_rate = b / a
mean_dis_rate

0.9540948923974325

# 将支付金额大于订单金额的订单金额修改为订单金额*平均折扣

df['payment'] = np.where(df.payment <= df.orderAmount,df.payment,(df.orderAmount * mean_dis_rate).round(2))

df

# 处理空值
channel_mode = df.channelID.mode()[0]   #众数
df['channelID'] = df.channelID.fillna(channel_mode)   #用众数填充空值
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 103344 entries, 6 to 104301
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   orderID       103344 non-null  object        
 1   userID        103344 non-null  object        
 2   goodsID       103344 non-null  object        
 3   orderAmount   103344 non-null  float64       
 4   payment       103344 non-null  float64       
 5   channelID     103344 non-null  object        
 6   platformType  103344 non-null  object        
 7   orderTime     103344 non-null  datetime64[ns]
 8   payTime       103344 non-null  datetime64[ns]
 9   isback        103344 non-null  bool          
dtypes: bool(1), datetime64[ns](2), float64(2), object(5)
memory usage: 8.0+ MB

#处理异常值

df['platformType'] = df.platformType.replace(
    r'[.\s]','',regex=True
).str.title().replace(
    '·','',regex=True
).replace(
    '薇信|Vx','微信',regex=True
).replace(
    '网站|网页','Web',regex=True
)

df.platformType.unique()

array(['微信', 'App', '支付宝', 'Web'], dtype=object)

#数据预处理
def num_to_weekday(num):
    items = ['一','二','三','四','五','六','日']
    return f'星期{items[num]}'
df['month'] = df.orderTime.dt.strftime('%m月')
df['weekday'] = df.orderTime.dt.weekday.map(num_to_weekday)
df['seg'] = df.orderTime.dt.floor('30min').dt.strftime('%H:%M')
df.head(3)

GMV= df.orderAmount.sum()
sales = df.query('not isback').payment.sum()
ARPPU = sales / df.userID.nunique()
AOV = sales / df.orderID.nunique()
back_rate = df.query('isback').orderID.nunique() / df.orderID.nunique()
temp = df.query('not isback').groupby('userID').orderID.nunique()
fu = temp[temp > 1].count()  / temp.count()

print(f"""
GMV：￥{GMV:,.2f}元
净销售额：￥{sales:,.2f}元
客单价：￥{ARPPU:,.2f}元
AOV：￥{AOV:,.2f}元
拒退率：{back_rate:.2%}
复购率：{fu:.2%}
""")

GMV：￥127,098,751.96元
净销售额：￥105,127,546.94元
客单价：￥1,336.65元
AOV：￥1,017.52元
拒退率：13.18%
复购率：22.60%

# 任务二：统计出月度销售额 月环比 并绘制出折线图

temp1 = df.query('not isback').groupby('month').payment.sum().to_frame()
temp1['月环比'] = temp1.payment.pct_change()
temp1.style.format(
    formatter={
        'payment':'{:,.2f}',
        '月环比':'{:.2%}'
    },
    na_rep='-----'
)

temp1.plot(
    figsize=(8,4),
    kind='line',
    y='payment'
)
plt.ylim(0,14000000)
plt.xticks(np.arange(12),temp1.index)
plt.show()

#%pip install pyecharts

from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB

from pyecharts.charts import Bar,Line
from pyecharts import options as opts
from pyecharts.globals import ThemeType

x1 = temp1.index.values.tolist()
y1 =  (temp1.payment / 1e6).round(2).values.tolist()

line = Line(init_opts=opts.InitOpts(width='720px', height='360px'))
line.set_global_opts(
    title_opts=opts.TitleOpts(
        title='2021年月度净销售额',
        title_link='http://qfedu.com',
        pos_left='center',
        pos_top='5%',
        title_textstyle_opts=opts.TextStyleOpts(
            font_family='苹方',
            font_size=18,
            color='red',
        )
    ),
    toolbox_opts=opts.ToolboxOpts(
        is_show=True,
        orient='vertical',
        pos_top='center',
    )
)
line.add_xaxis(x1)
line.add_yaxis(
    '',
    y1,
    markline_opts=opts.MarkLineOpts(
        data=[opts.MarkLineItem(type_='average')]
    ),
)
line.load_javascript()
line.render_notebook()

#任务三，统计出周一到周日每天的下单量

temp2 = df.query('not isback').groupby('weekday').orderID.count()
temp2

weekday
星期一    11425
星期三    13233
星期二    12466
星期五    14308
星期六    12515
星期四    14448
星期日    11332
Name: orderID, dtype: int64

bar = Bar()
bar.add_xaxis(temp2.index.values.tolist())
bar.add_yaxis("销量", temp2.values.tolist())
bar.render_notebook()

#任务四：统计每天各个时段的下单量

temp3 = df.groupby('seg').orderID.nunique()
temp3

seg
00:00    1064
00:30     632
01:00     382
01:30     267
02:00     139
02:30     109
03:00      72
03:30      50
04:00      33
04:30      18
05:00      20
05:30      43
06:00      68
06:30     101
07:00     182
07:30     268
08:00     418
08:30     594
09:00     929
09:30    1133
10:00    1663
10:30    1860
11:00    2352
11:30    3044
12:00    4185
12:30    5402
13:00    6654
13:30    6492
14:00    5750
14:30    4779
15:00    3669
15:30    2847
16:00    2374
16:30    2135
17:00    2085
17:30    2140
18:00    2395
18:30    2876
19:00    3773
19:30    4517
20:00    4822
20:30    4632
21:00    4019
21:30    3475
22:00    2883
22:30    2488
23:00    1945
23:30    1566
Name: orderID, dtype: int64

bar = Bar()
bar.add_xaxis(temp3.index.values.tolist())
bar.add_yaxis('', temp3.values.tolist(), label_opts=opts.LabelOpts(is_show=False))
bar.load_javascript()

bar.render_notebook()

# 统计月度复购率（以自然月为时间窗口统计复购用户比例）

temp4 = pd.pivot_table(
    df.query('not isback'),
    index='userID',
    columns='month',
    values='orderID',
    aggfunc='nunique'
)
result = (temp4[temp4 > 1 ].count() / temp4.count()).to_frame()
result.rename(columns={0:'复购率'}).style.format(formatter='{:.2%}')

# 构建RFM模型做用户价值分群

temp5 = pd.pivot_table(
    df.query('not isback'),
    index='userID',
    values=['orderTime','orderID','payment'],
    aggfunc={
        'orderTime':'max',
        'orderID':'nunique',
        'payment':'sum'
    }
)

from datetime import datetime

ref_date = datetime(2021,12,31)
temp5['R'] = (ref_date - temp5.orderTime).dt.days
temp5.rename(columns={'orderID':'F','payment':'M'},inplace=True)
temp5.drop(columns='orderTime',inplace=True)

def r2level(value):
    if value <= 14: return 5
    elif value <= 30: return 4
    elif value <= 90: return 3
    elif value <= 180: return 2
    return 1


temp5['R'] = temp5.R.map(r2level)
temp5

temp5['F'] = temp5.F.map(lambda x: 5 if x >= 5 else x)
temp5

def m2level(value):
    if value < 300: return 1
    elif value < 800: return 2
    elif value < 1200: return 3
    elif value < 2500: return 4
    return 5


temp5['M'] = temp5.M.map(m2level)
temp5

temp6 = temp5 > temp5.mean()

temp5[temp6] = '高'

temp5[~temp6] = '低'

temp5['tag'] =  temp5.F + temp5.M + temp5.R

def make_tag(tag):
    if tag == "高高高": return "重要价值客户"
    elif tag == "高低高" : return "重要发展客户"
    elif tag == "低高高" : return "重要保持客户"
    elif tag == "低低高" : return "重要挽留客户"
    elif tag == "高高低" : return "一般价值客户"
    elif tag == "高低低" : return "一般发展客户"
    elif tag == "低高低" : return "一般保持客户"
    return "一般挽留客户"

temp5['usertype'] = temp5.tag.map(make_tag)

temp5

temp7 = temp5.reset_index().groupby('usertype').userID.nunique()

temp7.plot(
    figsize=(5, 5),
    kind='pie',
    ylabel='',
    autopct='%.2f%%',
    pctdistance=0.8,
    wedgeprops={'width':0.35},
)

plt.show()

	orderID	userID	goodsID	orderAmount	payment	chanelID	platfromType	orderTime	payTime	chargeback
id
1	sys-2020-254118088	customer-157213	JDG-00006491	495.67	495.67	渠道-39	APP	2020-02-14 12:20:36	2021-02-28 13:38:41	False
2	sys-2020-263312190	customer-191121	JDG-00058390	634.04	634.04	渠道-76	微信	2020-08-14 09:40:34	2021-01-01 14:47:14	True
3	sys-2020-188208169	customer-211918	JDG-00008241	953.73	939.28	渠道-53	薇·信	2020-11-02 20:17:25	2021-01-19 20:06:35	False

	orderID	userID	goodsID	orderAmount	payment	channelID	platformType	orderTime	payTime	isback	month	weekday	seg
id
6	sys-2021-279103297	customer-146548	JDG-00056475	667.33	667.33	渠道-76	微信	2021-01-01 00:12:23	2021-01-01 00:13:37	False	01月	星期五	00:00
7	sys-2021-316686066	customer-104210	JDG-00070902	1834.13	1776.80	渠道-39	微信	2021-01-01 00:23:06	2021-01-01 00:23:32	False	01月	星期五	00:00
8	sys-2021-306447069	customer-104863	JDG-00049963	837.32	818.33	渠道-00	微信	2021-01-01 01:05:50	2021-01-01 01:06:17	False	01月	星期五	01:00

	payment	月环比
month
01月	6,769,318.22	-----
02月	5,364,705.92	-20.75%
03月	6,590,363.88	22.85%
04月	7,067,771.80	7.24%
05月	9,701,036.23	37.26%
06月	10,377,133.88	6.97%
07月	9,799,773.23	-5.56%
08月	10,039,151.28	2.44%
09月	9,393,501.29	-6.43%
10月	9,021,093.71	-3.96%
11月	10,611,667.01	17.63%
12月	10,392,030.49	-2.07%

	复购率
month
01月	1.47%
02月	0.92%
03月	1.58%
04月	1.74%
05月	2.64%
06月	2.64%
07月	2.08%
08月	2.71%
09月	2.19%
10月	2.49%
11月	2.69%
12月	2.62%

	F	M	R
userID
customer-100000	1	2109.52	3
customer-100003	1	870.17	1
customer-100006	1	523.02	3
customer-100007	1	2244.56	1
customer-100008	1	5018.60	3
...	...	...	...
customer-299980	1	644.00	3
customer-299983	1	949.91	5
customer-299989	2	2026.52	3
customer-299992	1	615.45	1
customer-299995	1	467.64	1

	F	M	R	tag	usertype
userID
customer-100000	低	高	高	低高高	重要保持客户
customer-100003	低	低	低	低低低	一般挽留客户
customer-100006	低	低	高	低低高	重要挽留客户
customer-100007	低	高	低	低高低	一般保持客户
customer-100008	低	高	高	低高高	重要保持客户
...	...	...	...	...	...
customer-299980	低	低	高	低低高	重要挽留客户
customer-299983	低	低	高	低低高	重要挽留客户
customer-299989	高	高	高	高高高	重要价值客户
customer-299992	低	低	低	低低低	一般挽留客户
customer-299995	低	低	低	低低低	一般挽留客户

任务一：统计GMV/净销售额/客单价/AOV/拒退率/复购率指标¶