# %load hello.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Put SimHei at the front of the sans-serif fallback list. Presumably this is
# so the Chinese chart titles and labels used below render with a CJK-capable
# font -- TODO confirm SimHei is installed on the target machine.
plt.rcParams['font.sans-serif'].insert(0, 'SimHei')
# Draw negative numbers with an ASCII hyphen rather than the Unicode minus sign.
plt.rcParams['axes.unicode_minus'] = False
# IPython magic (only valid inside IPython/Jupyter): render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'

# Job-postings table used by the first half of this analysis.
jobs = pd.read_csv('wenjian/new_jobs.csv')

# Headline figures for the job-postings table, computed up front so the
# report template below contains no logic.
posting_total = jobs['job_name'].count()      # non-null job names
opening_total = jobs['pos_count'].sum()       # sum of openings over all postings
mean_salary = jobs['salary'].mean().round(2)  # mean of the salary column, 2 decimals
city_total = jobs['city'].nunique()           # number of distinct cities

print(f"""
招聘信息：{posting_total}
职位数量：{opening_total}
平均薪资：{mean_salary}
城市数量：{city_total}
""")

招聘信息：3822
职位数量：13483
平均薪资：16.98
城市数量：9

# Per-city statistics of the pos_count column (total, largest single value,
# number of postings), ordered from the largest total to the smallest
# (ascending=False means descending order).
temp0 = (
    jobs.groupby('city')['pos_count']
    .agg(['sum', 'max', 'count'])
    .sort_values(by='sum', ascending=False)
)

temp0

# Total openings per city, largest first.
temp1 = (
    jobs.groupby('city')['pos_count']
    .sum()
    .sort_values(ascending=False)
)
temp1

city
上海    2151
深圳    2092
北京    2067
杭州    1774
广州    1611
南京    1291
武汉     947
成都     911
西安     639
Name: pos_count, dtype: int64

# Bare expression: presumably left over from a notebook cell that previews the
# colormap; it has no effect when run as a plain script.
plt.cm.summer

# One colour per bar, sampled evenly from the 0.25-0.95 band of the colormap.
# The sample count follows the data (it used to be a hard-coded 9), so the
# chart keeps working when the number of cities changes.
index = np.linspace(0.25, 0.95, temp1.size)
colors = plt.cm.summer(index)

temp1.plot(
    figsize=(8, 5),
    kind='bar',
    xlabel='',
    color=colors,
    title="每个城市的岗位数量"
)
plt.ylim(0, 2500)
plt.xticks(rotation=0)

# Write each total slightly above its bar; bar number i is drawn at x == i.
for position, total in enumerate(temp1.to_numpy()):
    plt.text(position, total + 20, total, ha='center')

plt.show()

# Mean / highest / lowest salary per city, rounded to one decimal, with
# display column names, ordered by mean salary (highest first).
temp2 = (
    jobs.groupby('city')['salary']
    .agg(['mean', 'max', 'min'])
    .round(1)
    .rename(columns={'mean': '平均薪资', 'max': '最高薪资', 'min': '最低薪资'})
    .sort_values(by='平均薪资', ascending=False)
)

# Only the mean-salary column is drawn; `index` is the colour sample array
# created for the previous chart.
temp2.plot(
    y='平均薪资',
    kind='bar',
    figsize=(8, 4),
    color=plt.cm.Oranges(index),
    title='每个城市的平均薪资',
    xlabel='',
    legend=False,
)
plt.ylim(0, 25)
plt.xticks(rotation=0)

# Label every bar with its value, a little above the bar top.
avg_salary = temp2['平均薪资']
for position, salary in enumerate(avg_salary.to_numpy()):
    plt.text(position, salary + 0.3, salary, ha='center')

plt.show()

# Share of openings by education requirement.

temp3 = jobs.groupby('edu')['pos_count'].sum()

temp3.plot.pie(
    figsize=(5, 5),
    ylabel='',
    autopct='%.2f%%',            # compute and print each wedge's percentage
    pctdistance=0.85,            # distance of the percentage text from the centre
    wedgeprops={'width': 0.38},  # a wedge width below 1 turns the pie into a ring
)
plt.show()

# Share of openings by required work experience.

temp4 = jobs.groupby('year')['pos_count'].sum()
temp4

year
1-3年    3851
1年以内     635
3-5年    5122
5年以上    1549
应届生      277
经验不限    2049
Name: pos_count, dtype: int64

# Ring-shaped pie of openings per experience bracket.
ring_style = {
    'figsize': (5, 5),
    'ylabel': '',
    'autopct': '%.2f%%',             # compute and print each wedge's percentage
    'pctdistance': 0.85,             # distance of the percentage text from the centre
    'wedgeprops': {'width': 0.38},   # a wedge width below 1 turns the pie into a ring
}
temp4.plot(kind='pie', **ring_style)
plt.show()

# Interactive lookup: list the postings whose company name contains the text
# typed by the user, sorted by city and then by salary (highest first).
keywords = input('请输入公司名称:')
# regex=False: the prompt asks for a company name, not a pattern, so characters
#   such as "(" or "+" must match literally instead of raising re.error.
# na=False: rows without a company name count as "no match"; without it the
#   mask contains NaN and boolean indexing raises ValueError.
matches = jobs.company_name.str.contains(keywords, regex=False, na=False)
jobs.loc[matches, ['city', 'company_name', 'salary', 'year', 'edu']].rename(
    columns={'company_name': '公司名称', 'salary': '工资', 'year': '工作年限', 'edu': '学历', 'city': '城市'}
).sort_values(by=['城市', '工资'], ascending=[True, False])

# Turn the experience and education columns into categoricals with an explicit
# category order, so that grouped and pivoted results follow this order rather
# than plain string order. reorder_categories raises ValueError if the column
# values do not match the listed categories exactly.
year_order = ['应届生', '1年以内', '经验不限', '1-3年', '3-5年', '5年以上']
edu_order = ['学历不限', '大专', '本科', '研究生']

jobs['year'] = jobs['year'].astype('category').cat.reorder_categories(year_order)
jobs['edu'] = jobs['edu'].astype('category').cat.reorder_categories(edu_order)

# Mean salary (one decimal) for every education x experience combination.
# observed=False is spelled out because the group keys are categorical at this
# point in the script: it is the behaviour this code has relied on so far, and
# leaving it implicit emits a FutureWarning on pandas >= 2.1, where the default
# is scheduled to change.
temp5 = (
    jobs.groupby(['edu', 'year'], as_index=False, observed=False)['salary']
    .mean()
    .round(1)
)
# Long -> wide: education levels as rows, experience brackets as columns.
# Equivalent method form: temp5.pivot(index='edu', columns='year', values='salary')
pd.pivot(temp5, index='edu', columns='year', values='salary')

# Same education x experience grid of mean salaries, built in one step.
# Combinations without any posting are filled with 0, and only category
# combinations that occur in the data are kept (observed=True).
temp6 = pd.pivot_table(
    jobs,
    values='salary',
    index='edu',
    columns='year',
    aggfunc='mean',
    fill_value=0,
    observed=True,
).round(1)
temp6

# Hand-built heat map of temp6: rows are education levels, columns are
# experience brackets.
plt.imshow(temp6, cmap='Greens')

# Tick counts follow the table's shape (they used to be hard-coded to 6 and 4),
# so a table with a different number of categories no longer fails on a
# tick/label length mismatch.
n_rows, n_cols = temp6.shape
plt.xticks(np.arange(n_cols), labels=temp6.columns)
plt.yticks(np.arange(n_rows), labels=temp6.index)

# Cells above the 75th percentile get white text, the others black text.
ref_vales = np.quantile(temp6.values, 0.75)

for row in range(n_rows):
    for col in range(n_cols):
        value = temp6.iat[row, col]
        color = 'white' if value > ref_vales else 'black'
        # imshow puts columns on the x axis and rows on the y axis.
        plt.text(col, row, value, ha='center', va='center', color=color)
plt.colorbar()
plt.show()

import seaborn as sns

# The same salary grid drawn with seaborn, which writes the cell annotations
# (one decimal) itself.
ax = sns.heatmap(temp6, annot=True, fmt='.1f', cmap='Greens')
# Blank out the axis titles.
ax.set_xlabel('')
ax.set_ylabel('')
plt.yticks(rotation=0)
plt.show()

def make_tag(price):
    """Classify a unit price into a price tier.

    Args:
        price: Numeric price; it is compared with ``<`` against the tier bounds.

    Returns:
        '低端' for prices below 300, '中端' for prices from 300 up to (but not
        including) 800, and '高端' for everything else.
    """
    # Tiers in ascending order, each paired with its exclusive upper bound.
    for upper_bound, tier in ((300, '低端'), (800, '中端')):
        if price < upper_bound:
            return tier
    return '高端'

# Order lines from the 'data' sheet of the 2020 sales workbook.
data = pd.read_excel('wenjian/2020年销售数据.xlsx', sheet_name='data')
data

# Sales-rep lookup from the 'reps' sheet of the same workbook, joined on the
# rep id. The inner join drops order lines whose rep id has no match in reps.
reps = pd.read_excel('wenjian/2020年销售数据.xlsx', sheet_name='reps')
sx = pd.merge(data, reps, how='inner', left_on='销售代表', right_on='工号')
sx['销售额'] = sx['售价'] * sx['销售数量']    # revenue = unit price * quantity
sx['毛利润'] = sx['销售额'] - sx['直接成本']  # gross profit = revenue - direct cost
# Declaring the categories up front (instead of astype('category') followed by
# reorder_categories) yields the same category order, but does not raise
# ValueError when one of the three tiers happens to be absent from the data.
sx['价位'] = sx['售价'].map(make_tag).astype(
    pd.CategoricalDtype(categories=['低端', '中端', '高端'])
)
sx['月份'] = sx['销售日期'].dt.strftime('%m月')  # zero-padded month label, e.g. '01月'
# The merge keeps the right-hand key column 工号 next to 销售代表; it could be
# removed with: sx.drop(columns='工号', inplace=True)
sx.head(3)

# Headline KPIs: revenue, gross profit, gross margin, number of orders,
# units sold, AOV (revenue per order) and revenue per sales rep.

总销售额 = sx['销售额'].sum()
毛利润 = sx['毛利润'].sum()
总订单数量 = sx['销售订单'].nunique()   # distinct order numbers
总销售数量 = sx['销售数量'].sum()

毛利率 = 毛利润 / 总销售额                  # gross margin
AOV = 总销售额 / 总订单数量                # revenue per distinct order
人效 = 总销售额 / sx['销售代表'].nunique()  # revenue per distinct sales rep

report = f'''
总销售额 = ￥{总销售额:,}元
毛利润 = ￥{毛利润:,}元
毛利率 = {毛利率:.2%}
总订单数量 = {总订单数量:,}单
总销售数量 = {总销售数量:,}件
AOV = ￥{AOV:,.0f}元
人效 = ￥{人效:,.0f}元
'''
print(report)

总销售额 = ￥39,772,087元
毛利润 = ￥27,926,715元
毛利率 = 70.22%
总订单数量 = 1,940单
总销售数量 = 107,403件
AOV = ￥20,501元
人效 = ￥2,840,863元

# Revenue per month and its month-over-month change.

temp1 = sx.groupby('月份')[['销售额']].sum()
# Relative change versus the previous row; the first month has no predecessor
# and therefore stays NaN.
temp1['月环比'] = temp1['销售额'].pct_change()

# Display only: thousands separators, percentages, a placeholder for NaN and a
# red background gradient.
column_formats = {'销售额': '{:,}', '月环比': '{:.2%}'}
(
    temp1.style
    .format(formatter=column_formats, na_rep='-----')
    .background_gradient(cmap='Reds')
)

# Line chart of monthly revenue, annotated with the month-over-month change.
temp1.plot(
    figsize=(10, 4.5),      # figure size
    kind='line',            # chart type
    y='销售额',             # column drawn on the y axis
    xlabel='',              # no x-axis title
    legend=False,           # hide the legend
    title='每个月份销售额',
    marker='>',
    linestyle='--'          # dashed line
)
plt.ylim(0, 6000000)
plt.ticklabel_format(axis='y', style='plain')  # plain numbers, no scientific notation
# One tick per row of temp1 (this used to be a hard-coded 12): matplotlib
# raises when the number of tick positions and labels differ, which happened
# as soon as the data did not cover exactly twelve months.
plt.xticks(np.arange(temp1.index.size), labels=temp1.index)
plt.grid(axis='y', linestyle='--', alpha=0.25)

y1 = temp1.销售额
y2 = temp1.月环比
# Start at 1: the first month has no previous month, so its change is NaN.
for x in range(1, y2.size):
    plt.text(x, y1.iloc[x] + 200000, f'{y2.iloc[x]:.2%}', ha='center')
plt.show()

# Revenue contributed by each brand.

temp2 = sx.groupby('品牌')['销售额'].sum()
temp2

品牌
八匹马      5334646
啊哟喂       693159
壁虎       4678979
皮皮虾      7892271
花花姑娘    21173032
Name: 销售额, dtype: int64

# Ring-shaped pie of revenue share per brand.
temp2.plot.pie(
    figsize=(5, 5),
    ylabel='',
    autopct='%.2f%%',
    pctdistance=0.82,
    # Pulls the fourth wedge outwards; the list has one entry per wedge, so it
    # assumes exactly five brands.
    explode=[0, 0, 0, 0.1, 0],
    wedgeprops={'width': 0.38, 'edgecolor': 'w'},
)
plt.show()

# Revenue per sales channel.
temp3 = sx.groupby('销售渠道')['销售额'].sum()
temp3

销售渠道
京东      7694468
天猫     13711818
实体      5194293
抖音      4782905
拼多多     8388603
Name: 销售额, dtype: int64

# Ring-shaped pie of revenue share per sales channel.
channel_pie_style = {
    'figsize': (5, 5),
    'ylabel': '',
    'autopct': '%.2f%%',
    'pctdistance': 0.82,
    'wedgeprops': {'width': 0.38},
}
temp3.plot(kind='pie', **channel_pie_style)
plt.show()

# Monthly revenue per sales region (regions as rows, months as columns).
# Passing margins=True with margins_name='总计' would additionally append a
# totals row and column under that name.
temp4 = sx.pivot_table(
    values='销售额',
    index='销售区域',
    columns='月份',
    aggfunc='sum',
    fill_value=0,
)

# Transposed so that months run along the x axis and each region becomes one
# segment of the stacked bar.
temp4.T.plot.bar(
    figsize=(8, 4),
    stacked=True,
    xlabel="",
)
plt.xticks(rotation=0)
plt.show()

# Units sold per channel and brand, shown as each brand's share of its channel.

temp5 = sx.pivot_table(
    values='销售数量',
    index='销售渠道',
    columns='品牌',
    aggfunc='sum',
    observed=True,
)
# Divide every row by its own total so that each channel adds up to 1.
channel_totals = temp5.sum(axis=1)
temp6 = temp5.div(channel_totals, axis=0)
temp6.plot.bar(stacked=True)
plt.xticks(rotation=0)
plt.show()

# Units sold per month for each price tier.

temp6 = sx.pivot_table(
    values='销售数量',
    index='价位',
    columns='月份',
    aggfunc='sum',
    observed=True,
)
# NOTE(review): stacked=True on a line chart makes each line the running total
# of the tiers beneath it rather than that tier's own quantity -- confirm this
# is intended and not carried over from the stacked bar chart.
temp6.T.plot.line(
    figsize=(8, 4),
    stacked=True,
    xlabel="",
)
plt.xticks(rotation=0)
plt.show()

# Top 3 sales reps by total revenue (top-N).

temp7 = sx.groupby(['销售代表', '姓名'])[['销售额']].sum()
# nlargest keeps the n rows with the largest values in the given column.
top_three = temp7.nlargest(3, '销售额')
# Move the name out of the index into a column, then hide the remaining index
# in the rendered table.
top_three.reset_index(level=1).style.hide()

# Top 3 sales reps for every month.

# Alternative without a rank column: sum as a Series and then call
# .groupby('月份').nlargest(3) on it.

temp8 = sx.groupby(['月份', '销售代表', '姓名'])[['销售额']].sum()
# Dense rank inside each month, 1 = highest revenue; ties share a rank.
temp8['排名'] = (
    temp8.groupby('月份')
    .rank(method='dense', ascending=False)
    .astype('i8')
)
(
    temp8[temp8['排名'] <= 3]
    .reset_index()
    .sort_values(by=['月份', '排名'])
    .set_index(['月份'])
)

year	应届生	1年以内	经验不限	1-3年	3-5年	5年以上
edu
学历不限	11.5	6.0	11.0	11.7	17.7	20.8
大专	6.7	7.0	7.4	9.5	15.6	26.2
本科	7.0	9.0	12.4	13.4	20.9	28.1
研究生	8.3	13.5	15.4	17.9	25.9	31.3

	销售日期	销售区域	销售渠道	销售订单	品牌	售价	销售数量	直接成本	销售代表
0	2020-01-01	上海	拼多多	200101007627	八匹马	99	83	3351	S00982
1	2020-01-01	上海	抖音	200101005623	八匹马	219	29	1016	S00871
2	2020-01-01	上海	天猫	200101004554	八匹马	169	85	6320	S00871
3	2020-01-01	上海	天猫	200101009600	八匹马	169	14	485	S00272
4	2020-01-01	上海	天猫	200101007986	皮皮虾	249	61	2452	S00272
...	...	...	...	...	...	...	...	...	...
1940	2020-12-30	北京	京东	201230004052	花花姑娘	269	26	1560	S00344
1941	2020-12-30	福建	实体	201230007101	八匹马	79	97	3028	S00677
1942	2020-12-31	福建	实体	201231009600	花花姑娘	269	55	2277	S00604
1943	2020-12-31	福建	抖音	201231003362	八匹马	59	59	852	S00272
1944	2020-12-31	福建	天猫	201231003844	八匹马	99	27	435	S00272

	销售额	月环比
月份
01月	5,409,855	-----
02月	4,608,455	-14.81%
03月	4,164,972	-9.62%
04月	3,996,770	-4.04%
05月	3,239,005	-18.96%
06月	2,817,936	-13.00%
07月	3,501,304	24.25%
08月	2,948,189	-15.80%
09月	2,632,960	-10.69%
10月	2,375,385	-9.78%
11月	2,385,283	0.42%
12月	1,691,973	-29.07%

姓名	销售额
李华志	3860877
王梅梅	3592423
刘旭	3515728

	销售代表	姓名	销售额	排名
月份
01月	S00710	李华志	675065	1
01月	S00604	赵治国	652302	2
01月	S00871	刘旭	558124	3
02月	S00710	李华志	740887	1
02月	S00169	冯兰	517864	2
02月	S00133	王梅梅	435794	3
03月	S00757	王默	449747	1
03月	S00169	冯兰	402668	2
03月	S00982	李欢	390944	3
04月	S00677	李平	537014	1
04月	S00344	张彩	445697	2
04月	S00133	王梅梅	411993	3
05月	S00677	李平	592075	1
05月	S00133	王梅梅	417171	2
05月	S00710	李华志	294061	3
06月	S00757	王默	429097	1
06月	S00169	冯兰	290356	2
06月	S00378	蔡中平	273828	3
07月	S00272	周小敏	556611	1
07月	S00133	王梅梅	432678	2
07月	S00710	李华志	412358	3
08月	S00363	吴国华	322931	1
08月	S00604	赵治国	317701	2
08月	S00169	冯兰	310072	3
09月	S00272	周小敏	350258	1
09月	S00871	刘旭	327426	2
09月	S00133	王梅梅	289976	3
10月	S00272	周小敏	309373	1
10月	S00169	冯兰	226022	2
10月	S00871	刘旭	224994	3
11月	S00188	杨明轩	345742	1
11月	S00344	张彩	265178	2
11月	S00871	刘旭	238296	3
12月	S00871	刘旭	238091	1
12月	S00363	吴国华	227479	2
12月	S00378	蔡中平	208448	3

Python数据分析第5天 - 深入浅出pandas

案例 2020销售数据

	sum	max	count
city
上海	2151	6	589
北京	2067	6	584
南京	1291	6	371
广州	1611	6	460
成都	911	6	248
杭州	1774	6	517
武汉	947	6	273
深圳	2092	6	607
西安	639	6	173

	城市	公司名称	工资	工作年限	学历
2248	上海	通联数据	27.5	3-5年	本科
2271	上海	达观数据	25.0	3-5年	本科
2251	上海	达观数据	22.5	经验不限	本科
2385	上海	上海源石数据	22.5	3-5年	大专
2750	上海	哈步数据	22.5	3-5年	本科
...	...	...	...	...	...
2874	深圳	惠群商业数据	7.0	1年以内	本科
3281	深圳	前海数据	4.0	应届生	本科
3789	西安	廷山大数据	9.5	经验不限	本科
3746	西安	熊赳赳数据科技	4.5	经验不限	学历不限
3777	西安	麦仓数据	4.5	经验不限	大专