# %load hello.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Put SimHei at the front of the sans-serif font fallback list.
# NOTE(review): presumably so Chinese labels render; confirm the font is installed.
plt.rcParams['font.sans-serif'].insert(0, 'SimHei')
# Turn off the Unicode minus glyph for axis tick labels.
plt.rcParams['axes.unicode_minus'] = False
# IPython magic: set the inline backend's figure format to SVG.
%config InlineBackend.figure_format = 'svg'

# Method 1: constructor + 2-D array

# 5 x 3 matrix of random integers drawn from [60, 101)
scores = np.random.randint(low=60, high=101, size=(5, 3))
scores

array([[ 61,  60,  68],
       [ 81,  87,  93],
       [ 89,  81,  83],
       [ 81, 100,  75],
       [ 91,  85,  83]], dtype=int32)

# Wrap the score matrix in a DataFrame: one named column per subject,
# rows labelled 1001..1005.
row_labels = np.arange(1001, 1006)
subject_names = ['Verbal', 'Math', 'English']
df1 = pd.DataFrame(scores, index=row_labels, columns=subject_names)
df1

# Method 2: constructor + dictionary

# Map each subject name to the matching column of the score matrix.
scores_dict = {
    subject: scores[:, col]
    for col, subject in enumerate(['Verbal', 'Math', 'English'])
}
scores_dict

{'Verbal': array([61, 81, 89, 81, 91], dtype=int32),
 'Math': array([ 60,  87,  81, 100,  85], dtype=int32),
 'English': array([68, 93, 83, 75, 83], dtype=int32)}

# The dict keys become the column names; the index supplies the row labels.
df2 = pd.DataFrame(scores_dict, index=np.arange(1001, 1006))
df2

# Method 3: load data from a CSV file

points_csv_path = 'wenjian/2023年北京积分落户数据.csv'
df3 = pd.read_csv(
    points_csv_path,
    # Optional keyword arguments, kept for reference:
    # sep=',',                    # field delimiter; comma is the default, so it can be omitted
    # header=0,                   # row that holds the column names; default is 0
    # encoding='utf-8',           # file encoding
    # index_col='公示编号',        # column to use as the row index
    # usecols=['公示编号','姓名','积分分值'], # columns to load
    # nrows=100,                  # number of rows to load
    # skiprows=np.arange(1,21),   # rows to skip
    # quotechar='"',              # character wrapping quoted strings (double quote by default)
    # true_values=['Yes','Y','真','是'], # values treated as boolean True
    # false_values=['No','N','伪','否'], # values treated as boolean False
    # na_values=['N/A','---'],    # values treated as missing
)
df3

bilibili_csv_path = 'wenjian/bilibili.csv'
pd.read_csv(bilibili_csv_path, encoding='gbk')  # decode the file as GBK
#df4

%pip install pyarrow

# engine='pyarrow' swaps the parser engine for a faster one.
# (low_memory=False is the other switch noted here: low-memory mode.)
df4 = pd.read_csv('wenjian/big_data_file.csv.gz', engine='pyarrow')
df4

# Iterator mode: read_csv returns a reader that hands out the file in
# pieces of `chunksize` rows instead of loading everything at once.
# NOTE(review): the reader is never closed in this file; call
# iter_obj.close() once all needed chunks have been pulled.
chunk_options = {
    'iterator': True,    # turn on iterator mode
    'chunksize': 50000,  # rows loaded per chunk
}
iter_obj = pd.read_csv('wenjian/big_data_file.csv.gz', **chunk_options)

# Pull the first chunk.
next(iter_obj)

# Method 4: build a DataFrame from an Excel workbook

# sheet_name picks which worksheet holds the data.
df5 = pd.read_excel('wenjian/2020年销售数据.xlsx', sheet_name='data')
df5

# Method 5: load data from a database table

#%pip install pymysql sqlalchemy

import pymysql

# NOTE(review): host and credentials are hard-coded in the source; consider
# moving them to environment variables or a config file.
mysql_settings = {
    'host': '47.109.26.237',
    'port': 3306,
    'user': 'guest',
    'password': 'Guest.618',
    'database': 'hrs',
    'charset': 'utf8mb4',
}
conn = pymysql.connect(**mysql_settings)
conn

<pymysql.connections.Connection at 0x1b486781340>

# Run a query through the raw pymysql connection.
emp_query = 'select * from tb_emp'
pd.read_sql(emp_query, conn)

from sqlalchemy import create_engine

# Same database, reached through an SQLAlchemy engine this time.
db_url = 'mysql+pymysql://guest:Guest.618@47.109.26.237:3306/hrs'
engine = create_engine(db_url)
engine

Engine(mysql+pymysql://guest:***@47.109.26.237:3306/hrs)

# Load the whole tb_emp table through the SQLAlchemy engine, using the
# eno column as the row index.
df6 = pd.read_sql(
    'tb_emp',
    engine,
    index_col=['eno'],
)
df6

# Close the raw pymysql connection opened earlier. The previous
# `engine.connect().close()` only checked a pooled connection out of the
# engine and handed it straight back, so `conn` itself stayed open.
# The `open` guard keeps a re-run of this cell from raising "Already closed".
if conn.open:
    conn.close()

# Inspect dtypes, non-null counts and the deep memory footprint.
df6.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 14 entries, 1359 to 7800
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ename   14 non-null     object 
 1   job     14 non-null     object 
 2   mgr     13 non-null     float64
 3   sal     14 non-null     int64  
 4   comm    6 non-null      float64
 5   dno     14 non-null     int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 2.5 KB

# Preview the first / last N rows with head() / tail()

# Relabel the rows starting at 10001. The stop value is derived from the
# frame length; the previous hard-coded 11946 only fitted a frame of exactly
# 1945 rows and raised a length-mismatch error for anything else.
first_label = 10001
df5.set_index(
    np.arange(first_label, first_label + len(df5)), inplace=True
)
df5

df5.head(3)

df5.tail(5)

# Selecting data

# Select columns
df5[['销售区域','品牌']]   # a single column can also be read as df5.品牌

# Add a column: the first 1000 rows get '已完成', the remainder '未完成'.
# The counts come from len(df5) rather than the hard-coded 1000 + 945, which
# raised a length mismatch on any frame that was not exactly 1945 rows long.
completed_count = min(1000, len(df5))
pending_count = len(df5) - completed_count
df5['状态'] = ['已完成'] * completed_count + ['未完成'] * pending_count
df5

# Drop columns
# df5.drop(columns=['状态','销售代表'])   # returns the result; the source frame is not modified
# df5.drop(columns=['状态','销售代表'],inplace=True) # inplace=True modifies the source frame directly
df5

# Fetch a row by its label
df5.loc[10001]

销售日期           43831
销售区域              上海
销售渠道             拼多多
销售订单    200101007627
品牌               八匹马
售价                99
销售数量              83
直接成本            3351
Name: 10001, dtype: object

# Rows at positions 1, 3, 5 and 7
df5.iloc[list(range(1, 8, 2))]

# Label-based slice (with .loc both end labels are included)
df5.loc[10001:10005]

# Position-based slice (the stop position is excluded)
df5.iloc[:5]

# Read single cells
df5.at[10001, '售价']   # by row label and column label
df5.iat[0, 5]           # by row position and column position

np.int64(99)

# Filter rows with boolean masks
in_anhui = df5['销售区域'] == '安徽'
via_tmall = df5['销售渠道'] == '天猫'
df5[in_anhui & via_tmall]

# The same kind of filter written as a query string
df5.query("销售区域 in ['上海','福建','北京'] and 销售渠道 == '天猫'")
diqu = ['上海','福建','北京']
df5.query("销售区域 in @diqu and 销售渠道 == '天猫'")  # prefix a Python variable name with @

# Sampling
# replace=False: sample without replacement
# replace=True:  sample with replacement
df5.sample(120, replace=False)   # random sample

# Reshaping data

emp1, emp2 = (
    pd.read_sql(table_name, engine, index_col='eno')
    for table_name in ('tb_emp', 'tb_emp2')
)
dept = pd.read_sql('tb_dept', engine, index_col='dno')

# Concatenate (stack several tables that share the same structure) - like UNION
emp = pd.concat([emp1, emp2])
emp

# Merge - like JOIN
# how - inner / outer / left / right: the join type
# on / left_on / right_on: the field(s) used to match rows
temp = emp.merge(dept, how='inner', on='dno')

import os

# Combine every CSV under wenjian/jobs/jobs into a single all_jobs.csv.
# os.path.join replaces the Windows-only backslash path so the cell also
# runs on POSIX systems.
jobs_dir = os.path.join('wenjian', 'jobs', 'jobs')
# Sort for a reproducible row order (os.listdir order is arbitrary) and skip
# entries that are not CSV files, which read_csv would choke on.
list1 = sorted(
    name for name in os.listdir(jobs_dir) if name.lower().endswith('.csv')
)
lista = [pd.read_csv(os.path.join(jobs_dir, name)) for name in list1]
# ignore_index=True renumbers the rows 0..n-1 across all source files.
pd.concat(lista, ignore_index=True).to_csv('all_jobs.csv', index=False)

#数据清洗
# 1. 缺失值 (missing values)
# 2. 重复值 (duplicate values)
# 3. 异常值 (outliers)

# Print the column dtypes and non-null counts of temp.
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ename   19 non-null     object 
 1   job     19 non-null     object 
 2   mgr     16 non-null     float64
 3   sal     19 non-null     int64  
 4   comm    11 non-null     float64
 5   dno     19 non-null     int64  
 6   dname   19 non-null     object 
 7   dloc    19 non-null     object 
dtypes: float64(2), int64(2), object(4)
memory usage: 1.3+ KB

# Detect missing values
# isna() / notna() are the same as isnull() / notnull()
temp.notnull()

# Drop missing values
temp.dropna(axis='columns')   # axis=1 drops whole columns that contain nulls; 0 drops whole rows

# Fill missing values
# temp.fillna(0)   # replace every null with 0
temp['comm'].fillna(0).astype('int64')

temp['mgr'] = temp['mgr'].fillna(-1).astype('int64')
temp

temp['comm'].ffill()  # fill a null with the value above it
temp['comm'].bfill()  # fill a null with the value below it

# Fill the gaps in comm by linear interpolation between neighbouring values,
# then cast to int64 (the cast truncates any fractional part).
# limit_direction='both' also fills nulls that precede the first valid value;
# the default direction leaves those as NaN and the integer cast then raises.
temp['comm'] = temp.comm.interpolate(limit_direction='both').astype('i8')

# Check whether any nulls remain: per-column null counts (all zeros = clean).
temp.isna().sum()

temp

# Flag duplicated rows, judged on the (ename, job) pair
temp.duplicated(subset=['ename', 'job'], keep='last')

temp['ename'].duplicated()   # keep='first' is the default

# Remove duplicated rows in place
temp.drop_duplicates(subset=['ename', 'job'], keep='last', inplace=True)

temp

# Number of distinct values
temp['job'].nunique()

10

# List the distinct values
temp['job'].unique()

array(['销售员', '分析师', '设计师', '程序员', '销售主管', '会计', '出纳', '会计师', '总裁', '架构师'],
      dtype=object)

# Count how many rows each job title has
temp['job'].value_counts()

job
程序员     5
销售员     2
分析师     2
会计      2
设计师     1
销售主管    1
出纳      1
会计师     1
总裁      1
架构师     1
Name: count, dtype: int64

# Box plot of sal with the whisker length set to 3 x IQR
temp['sal'].plot.box(whis=3)
plt.show()

# data < Q1 - whis * IQR or data > Q3 + whis * IQR
def find_outliers(data, whis=1.5):
    """Return the elements of *data* that fall outside the IQR fences.

    Args:
        data: Numeric pandas Series or NumPy array supporting boolean-mask
            indexing.
        whis: Multiple of the inter-quartile range added beyond Q1 and Q3 to
            place the lower and upper fences.

    Returns:
        The subset of *data* below ``Q1 - whis * IQR`` or above
        ``Q3 + whis * IQR``, with the same container type as *data*.
    """
    # nanquantile skips missing values. np.quantile returns NaN fences as
    # soon as one NaN is present, which silently flags nothing.
    q1, q3 = np.nanquantile(data, [0.25, 0.75])
    iqr = q3 - q1
    lower_fence = q1 - whis * iqr
    upper_fence = q3 + whis * iqr
    return data[(data < lower_fence) | (data > upper_fence)]
# Salaries lying outside the 3 x IQR fences, and their raw values
liqun = find_outliers(data=temp['sal'], whis=3)
yichang = liqun.values

array([60000, 30000])

# z-score = (data - mu) / sigma ---> |z-score| > threshold
def find_outliers2(data, threshold=3):
    """Return the elements of *data* whose absolute z-score exceeds *threshold*.

    Args:
        data: Numeric pandas Series or NumPy array supporting boolean-mask
            indexing.
        threshold: Number of standard deviations from the mean beyond which
            a value counts as an outlier. Defaults to 3.

    Returns:
        The subset of *data* with ``|data - mean| > threshold * std``, with
        the same container type as *data*.
    """
    # np.std defaults to ddof=0, i.e. the population standard deviation.
    mu, sigma = np.mean(data), np.std(data)
    # Compare against threshold * sigma instead of dividing by sigma, so
    # constant data (sigma == 0) yields an empty result without a
    # divide-by-zero.
    return data[np.abs(data - mu) > threshold * sigma]

# Apply the z-score rule to the salary column
find_outliers2(temp['sal'])

Series([], Name: sal, dtype: int64)

# Median salary, used below as the replacement value for outliers
normal_sal = temp['sal'].median()
normal_sal

np.float64(3400.0)

# Replace outliers: every salary equal to one of the outlier values
# becomes the median
temp['sal'] = temp['sal'].replace(to_replace=liqun.values, value=normal_sal)
temp

# Drop outliers: remove the rows whose salary is one of the outlier values
outlier_rows = temp[temp['sal'].isin(yichang)].index
temp.drop(index=outlier_rows, inplace=True)

# Load the combined job postings
jobs_csv_path = 'all_jobs.csv'
jobs = pd.read_csv(jobs_csv_path)
jobs

# Rows sharing the same uri count as duplicates; keep the first of each
jobs.drop_duplicates(subset='uri', inplace=True)
jobs.shape

(9777, 9)

# Remove the uri and city columns in place
jobs.drop(['uri', 'city'], axis='columns', inplace=True)
jobs.shape

(9777, 7)

# Rebuild city from the first whitespace-separated token of site
site_tokens = jobs['site'].str.split(expand=True)
jobs['city'] = site_tokens[0]

# site is no longer needed once city has been extracted
jobs.drop(columns='site', inplace=True)

jobs.head(2)

# Turn a salary string such as '20-40K' into the midpoint of its two numbers.
salary_bounds = jobs.salary.str.extract(r'(\d+)-(\d+)')
# Cast to float rather than int64: rows whose salary text does not match the
# pattern come back as NaN, which an integer cast rejects. The row mean is a
# float either way, so matching rows get the same value as before.
jobs['salary'] = salary_bounds.astype('f8').mean(axis=1)

# Fold the two longest experience bands into a single '5年以上' label.
jobs['year'] = jobs.year.replace('5-10年|10年以上','5年以上',regex=True)

jobs.year.unique()

array(['经验不限', '3-5年', '5年以上', '1-3年', '应届生', '1年以内'], dtype=object)

# Collapse education levels in two regex passes: first 高中/中专 into
# 学历不限, then 硕士/博士 into 研究生
edu_level = jobs['edu'].replace('高中|中专', '学历不限', regex=True)
jobs['edu'] = edu_level.replace('硕士|博士', '研究生', regex=True)

jobs['edu'].unique()

array(['本科', '大专', '研究生', '学历不限'], dtype=object)

# Distinct job titles
jobs['job_name'].unique()

array(['Python', 'python工程师', 'python开发', ..., '产品经理(薪资：8-12k)',
       '出境旅游产品经理', '产品经理（农药）'], shape=(2739,), dtype=object)

# Lower-case the titles so the keyword match below is case-insensitive.
jobs['job_name'] = jobs.job_name.str.lower()

# Keep postings whose title mentions one of the keywords.
# na=False treats a missing job_name as "no match"; without it the mask
# contains NaN and boolean indexing raises ValueError.
# NOTE(review): 'ai' and 'bi' are matched as plain substrings, so titles that
# merely contain those letters inside a longer word are kept as well; confirm
# that is intended.
keyword_mask = jobs.job_name.str.contains('python|数据|ai|bi', regex=True, na=False)
results = jobs[keyword_mask]
results.to_csv('wenjian/new_jobs.csv',index=False)

	公示编号	姓名	出生年月	单位名称	积分分值
0	202300001	张浩	1977-02	北京首钢股份有限公司	140.05
1	202300002	冯云	1982-02	中国人民解放军空军二十三厂	134.29
2	202300003	王天东	1975-01	中建二局第三建筑工程有限公司	133.63
3	202300004	陈军	1976-07	中建二局第三建筑工程有限公司	133.29
4	202300005	樊海瑞	1981-06	中国民生银行股份有限公司	132.46
...	...	...	...	...	...
5998	202305999	曹恰	1983-09	首都师范大学科德学院	109.92
5999	202306000	罗佳	1981-05	厦门方胜众合企业服务有限公司海淀分公司	109.92
6000	202306001	席盛代	1983-06	中国华能集团清洁能源技术研究院有限公司	109.92
6001	202306002	彭芸芸	1981-09	北京汉杰凯德文化传播有限公司	109.92
6002	202306003	张越	1982-01	大爱城投资控股有限公司	109.92

	ename	job	mgr	sal	comm	dno
eno
1359	胡一刀	销售员	3344.0	1800	200.0	30
2056	乔峰	分析师	7800.0	5000	1500.0	20
3088	李莫愁	设计师	2056.0	3500	800.0	20
3211	张无忌	程序员	2056.0	3200	NaN	20
3233	丘处机	程序员	2056.0	3400	NaN	20
3244	欧阳锋	程序员	3088.0	3200	NaN	20
3251	张翠山	程序员	2056.0	4000	NaN	20
3344	黄蓉	销售主管	7800.0	3000	800.0	30
3577	杨过	会计	5566.0	2200	NaN	10
3588	朱九真	会计	5566.0	2500	NaN	10
4466	苗人凤	销售员	3344.0	2500	NaN	30
5234	郭靖	出纳	5566.0	2000	NaN	10
5566	宋远桥	会计师	7800.0	4000	1000.0	10
7800	张三丰	总裁	NaN	9000	1200.0	20

	销售日期	销售区域	销售渠道	销售订单	品牌	售价	销售数量	直接成本	销售代表
1940	44195	北京	京东	201230004052	花花姑娘	269	26	1560	S00344
1941	44195	福建	实体	201230007101	八匹马	79	97	3028	S00677
1942	44196	福建	实体	201231009600	花花姑娘	269	55	2277	S00604
1943	44196	福建	抖音	201231003362	八匹马	59	59	852	S00272
1944	44196	福建	天猫	201231003844	八匹马	99	27	435	S00272

	销售日期	销售区域	销售渠道	销售订单	品牌	售价	销售数量	直接成本
10002	43831	上海	抖音	200101005623	八匹马	219	29	1016
10004	43831	上海	天猫	200101009600	八匹马	169	14	485
10006	43832	上海	京东	200102007683	皮皮虾	799	68	15203
10008	43833	上海	天猫	200103008960	壁虎	239	82	4127

	销售日期	销售区域	销售渠道	销售订单	品牌	售价	销售数量	直接成本
11786	44155	北京	拼多多	201120004262	八匹马	169	72	3254
10063	43840	福建	天猫	200110001110	花花姑娘	399	72	10467
11491	44085	北京	天猫	200911005803	花花姑娘	399	54	8074
11898	44183	上海	天猫	201218009470	壁虎	399	52	7546
10394	43882	上海	天猫	200221007225	皮皮虾	499	19	3981
...	...	...	...	...	...	...	...	...
10059	43840	北京	天猫	200110008666	壁虎	299	14	1495
11065	43993	北京	拼多多	200611003039	花花姑娘	229	27	2314
10166	43852	上海	天猫	200122008599	皮皮虾	499	49	8375
10770	43940	北京	天猫	200419008535	花花姑娘	599	12	2463
11809	44159	广东	拼多多	201124006230	壁虎	269	42	2120

数据分析第4天-Pandas1

创建DataFrame对象

	销售日期	销售区域	销售渠道	销售订单	品牌	售价	销售数量	直接成本
10001	43831	上海	拼多多	200101007627	八匹马	99	83	3351
10002	43831	上海	抖音	200101005623	八匹马	219	29	1016
10003	43831	上海	天猫	200101004554	八匹马	169	85	6320
10004	43831	上海	天猫	200101009600	八匹马	169	14	485
10005	43831	上海	天猫	200101007986	皮皮虾	249	61	2452
...	...	...	...	...	...	...	...	...
11941	44195	北京	京东	201230004052	花花姑娘	269	26	1560
11942	44195	福建	实体	201230007101	八匹马	79	97	3028
11943	44196	福建	实体	201231009600	花花姑娘	269	55	2277
11944	44196	福建	抖音	201231003362	八匹马	59	59	852
11945	44196	福建	天猫	201231003844	八匹马	99	27	435

	company_name	uri	salary	site	year	edu	job_name	city	pos_count
0	中国电信云	https://www.zhipin.com/job_detail/11266fc18dc1...	20-40K·17薪	北京海淀区西山	经验不限	本科	Python	beijing	1
1	奇虎360	https://www.zhipin.com/job_detail/2a3103941dc2...	20-40K·15薪	北京朝阳区酒仙桥	3-5年	大专	Python	beijing	6
2	VIPKID	https://www.zhipin.com/job_detail/2dd7f2760947...	20-40K·14薪	北京朝阳区十里堡	5-10年	本科	Python	beijing	4
3	天阳科技	https://www.zhipin.com/job_detail/a0c8485a448b...	12-24K	北京石景山区八宝山	1-3年	本科	python工程师	beijing	2
4	武汉佰钧成	https://www.zhipin.com/job_detail/d6627bf7c1e2...	12-17K	北京朝阳区三元桥	3-5年	大专	python开发	beijing	3
...	...	...	...	...	...	...	...	...	...
9820	公众智能	https://www.zhipin.com/job_detail/7b9c08dbce81...	8-10K	西安	3-5年	本科	产品经理	xian	2
9821	微感	https://www.zhipin.com/job_detail/c7e99005528f...	8-10K	西安雁塔区紫薇田园都市	3-5年	大专	产品经理	xian	4
9822	巴斯光年	https://www.zhipin.com/job_detail/1045fe64f248...	10-20K	西安雁塔区大雁塔	3-5年	本科	产品经理	xian	6
9823	西大华特科技	https://www.zhipin.com/job_detail/e3c21cc748e7...	5-8K	西安雁塔区唐延路	1-3年	硕士	产品经理（农药）	xian	6
9824	西安纯粹科技	https://www.zhipin.com/job_detail/09965129db3e...	3-6K	西安雁塔区玫瑰大楼	1-3年	本科	产品经理	xian	5

	Verbal	Math	English
1001	61	60	68
1002	81	87	93
1003	89	81	83
1004	81	100	75
1005	91	85	83

	Verbal	Math	English
1001	61	60	68
1002	81	87	93
1003	89	81	83
1004	81	100	75
1005	91	85	83