import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline
plt.style.use("ggplot")
df=pd.read_csv("D:/2018_BigData/Python/Python_files_Notebook/theme_practice/student_consumption.csv")
df.head()
|
DealTime |
MonDeal |
bf_StudentID |
AccName |
PerSex |
0 |
2018/7/1 06:32:51 |
-4.5 |
14877 |
张某某 |
女 |
1 |
2018/7/1 11:43:05 |
-11.0 |
14917 |
高某某 |
男 |
2 |
2018/7/1 14:21:42 |
-9.2 |
14921 |
胡某某 |
男 |
3 |
2018/7/1 14:33:06 |
-6.9 |
14898 |
牛某某 |
男 |
4 |
2018/7/1 15:05:45 |
-7.0 |
14917 |
高某某 |
男 |
df["DealTime"]=pd.to_datetime(df.DealTime)
df.head(2)
|
DealTime |
MonDeal |
bf_StudentID |
AccName |
PerSex |
0 |
2018-07-01 06:32:51 |
-4.5 |
14877 |
张某某 |
女 |
1 |
2018-07-01 11:43:05 |
-11.0 |
14917 |
高某某 |
男 |
df["DealTime"]=df["DealTime"].dt.strftime('%Y-%m-%d')
df.head(2)
|
DealTime |
MonDeal |
bf_StudentID |
AccName |
PerSex |
0 |
2018-07-01 |
-4.5 |
14877 |
张某某 |
女 |
1 |
2018-07-01 |
-11.0 |
14917 |
高某某 |
男 |
print(df.head(2))
print(df.tail(2))
DealTime MonDeal bf_StudentID AccName PerSex
0 2018-07-01 -4.5 14877 张某某 女
1 2018-07-01 -11.0 14917 高某某 男
DealTime MonDeal bf_StudentID AccName PerSex
463902 2019-01-27 -3.5 16075 王某某 女
463903 2019-01-27 -7.8 16103 邱某某 女
df.describe()
|
MonDeal |
bf_StudentID |
count |
463904.000000 |
463904.000000 |
mean |
-8.371863 |
15099.442035 |
std |
5.878407 |
718.459919 |
min |
-404.200000 |
13012.000000 |
25% |
-10.900000 |
14459.000000 |
50% |
-8.000000 |
14942.000000 |
75% |
-4.450000 |
15784.000000 |
max |
-0.010000 |
16162.000000 |
DealTime_counts=df['DealTime'].value_counts()
print("\n",DealTime_counts.shape)
print("\n")
print(DealTime_counts.head(6))
print("\n")
print(DealTime_counts.tail(6))
(161,)
2018-11-26 5245
2018-09-04 5182
2018-11-20 5167
2018-11-19 5121
2018-11-29 5116
2018-09-05 5108
Name: DealTime, dtype: int64
2018-08-13 19
2018-08-19 14
2018-10-05 13
2018-11-17 5
2018-09-08 4
2019-01-30 1
Name: DealTime, dtype: int64
df1=df.groupby(['DealTime',"bf_StudentID","AccName","PerSex"])["MonDeal"].agg(["sum","mean","count"])
df1.head(12)
|
|
|
|
sum |
mean |
count |
DealTime |
bf_StudentID |
AccName |
PerSex |
|
|
|
2018-07-01 |
13983 |
裘某某 |
男 |
-3.7 |
-3.70 |
1 |
14018 |
虞某某 |
男 |
-9.5 |
-9.50 |
1 |
14073 |
刘某某 |
男 |
-8.0 |
-8.00 |
1 |
14074 |
周某某 |
男 |
-14.3 |
-7.15 |
2 |
14097 |
毛某某 |
男 |
-10.0 |
-10.00 |
1 |
14099 |
李某某 |
男 |
-10.5 |
-10.50 |
1 |
14139 |
敖某某 |
男 |
-3.5 |
-3.50 |
1 |
14140 |
王某某 |
男 |
-11.5 |
-11.50 |
1 |
14169 |
查某某 |
女 |
-10.0 |
-10.00 |
1 |
14187 |
叶某某 |
男 |
-4.5 |
-4.50 |
1 |
14200 |
王某某 |
男 |
-3.5 |
-3.50 |
1 |
14208 |
方某某 |
男 |
-15.5 |
-7.75 |
2 |
df2=pd.pivot_table(df,index=["DealTime","bf_StudentID","AccName","PerSex"],values=["MonDeal"],aggfunc=[sum,np.mean,len])
df2.head(12)
|
|
|
|
sum |
mean |
len |
|
|
|
|
MonDeal |
MonDeal |
MonDeal |
DealTime |
bf_StudentID |
AccName |
PerSex |
|
|
|
2018-07-01 |
13983 |
裘某某 |
男 |
-3.7 |
-3.70 |
1.0 |
14018 |
虞某某 |
男 |
-9.5 |
-9.50 |
1.0 |
14073 |
刘某某 |
男 |
-8.0 |
-8.00 |
1.0 |
14074 |
周某某 |
男 |
-14.3 |
-7.15 |
2.0 |
14097 |
毛某某 |
男 |
-10.0 |
-10.00 |
1.0 |
14099 |
李某某 |
男 |
-10.5 |
-10.50 |
1.0 |
14139 |
敖某某 |
男 |
-3.5 |
-3.50 |
1.0 |
14140 |
王某某 |
男 |
-11.5 |
-11.50 |
1.0 |
14169 |
查某某 |
女 |
-10.0 |
-10.00 |
1.0 |
14187 |
叶某某 |
男 |
-4.5 |
-4.50 |
1.0 |
14200 |
王某某 |
男 |
-3.5 |
-3.50 |
1.0 |
14208 |
方某某 |
男 |
-15.5 |
-7.75 |
2.0 |
df1.describe()
|
sum |
mean |
count |
count |
164422.000000 |
164422.000000 |
164422.000000 |
mean |
-23.620565 |
-8.702236 |
2.821423 |
std |
14.832558 |
3.876227 |
1.487303 |
min |
-439.250000 |
-170.000000 |
1.000000 |
25% |
-30.150000 |
-10.000000 |
2.000000 |
50% |
-21.000000 |
-8.000000 |
3.000000 |
75% |
-13.200000 |
-6.550000 |
4.000000 |
max |
-0.090000 |
-0.090000 |
14.000000 |
df1.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 164422 entries, (2018-07-01, 13983, 裘某某, 男) to (2019-01-30, 15472, 陈某某, 男)
Data columns (total 3 columns):
sum 164422 non-null float64
mean 164422 non-null float64
count 164422 non-null int64
dtypes: float64(2), int64(1)
memory usage: 4.9+ MB
df2.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 164422 entries, (2018-07-01, 13983, 裘某某, 男) to (2019-01-30, 15472, 陈某某, 男)
Data columns (total 3 columns):
(sum, MonDeal) 164422 non-null float64
(mean, MonDeal) 164422 non-null float64
(len, MonDeal) 164422 non-null float64
dtypes: float64(3)
memory usage: 4.9+ MB
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463904 entries, 0 to 463903
Data columns (total 5 columns):
DealTime 463904 non-null object
MonDeal 463904 non-null float64
bf_StudentID 463904 non-null int64
AccName 463904 non-null object
PerSex 463904 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 17.7+ MB
df["month"]=df.DealTime.values.astype("datetime64[M]")
print(df.head(2))
print(df.tail(2))
DealTime MonDeal bf_StudentID AccName PerSex month
0 2018-07-01 -4.5 14877 张某某 女 2018-07-01
1 2018-07-01 -11.0 14917 高某某 男 2018-07-01
DealTime MonDeal bf_StudentID AccName PerSex month
463902 2019-01-27 -3.5 16075 王某某 女 2019-01-01
463903 2019-01-27 -7.8 16103 邱某某 女 2019-01-01
student_group=df.groupby(["bf_StudentID","AccName","PerSex"]).agg(["sum","mean","count"])
print(student_group.head(3))
print(student_group.tail(3))
MonDeal
sum mean count
bf_StudentID AccName PerSex
13012 张某某 女 -381.1 -7.472549 51
13564 吴某某 男 -3272.8 -8.590026 381
13599 曹某某 男 -1686.8 -9.319337 181
MonDeal
sum mean count
bf_StudentID AccName PerSex
16160 周某某 女 -2135.25 -9.490000 225
16161 韩某某 女 -1519.60 -6.970642 218
16162 陈某某 男 -1754.65 -9.588251 183
type(student_group)
pandas.core.frame.DataFrame
student_group.columns
MultiIndex(levels=[['MonDeal'], ['sum', 'mean', 'count']],
labels=[[0, 0, 0], [0, 1, 2]])
df1.columns
Index(['sum', 'mean', 'count'], dtype='object')
df2.columns
MultiIndex(levels=[['sum', 'mean', 'len'], ['MonDeal']],
labels=[[0, 1, 2], [0, 0, 0]])
df.columns
Index(['DealTime', 'MonDeal', 'bf_StudentID', 'AccName', 'PerSex', 'month'], dtype='object')
student_group0=student_group.reset_index()
print(student_group0.head())
print ('\n'*2)
print(student_group0.columns)
bf_StudentID AccName PerSex MonDeal
sum mean count
0 13012 张某某 女 -381.10 -7.472549 51
1 13564 吴某某 男 -3272.80 -8.590026 381
2 13599 曹某某 男 -1686.80 -9.319337 181
3 13685 毛某某 男 -1854.93 -10.847544 171
4 13947 李某某 女 -3009.40 -7.982493 377
MultiIndex(levels=[['MonDeal', 'PerSex', 'AccName', 'bf_StudentID'], ['sum', 'mean', 'count', '']],
labels=[[3, 2, 1, 0, 0, 0], [3, 3, 3, 0, 1, 2]])
df3=df1.reset_index()
print(df3.head(3))
print(df3.tail(3))
DealTime bf_StudentID AccName PerSex sum mean count
0 2018-07-01 13983 裘某某 男 -3.7 -3.7 1
1 2018-07-01 14018 虞某某 男 -9.5 -9.5 1
2 2018-07-01 14073 刘某某 男 -8.0 -8.0 1
DealTime bf_StudentID AccName PerSex sum mean count
164419 2019-01-27 16149 李某某 女 -4.0 -4.0 1
164420 2019-01-27 16150 陈某某 男 -4.5 -4.5 1
164421 2019-01-30 15472 陈某某 男 -5.8 -5.8 1
df3.columns
Index(['DealTime', 'bf_StudentID', 'AccName', 'PerSex', 'sum', 'mean',
'count'],
dtype='object')
df3.rename(columns={'sum':'MonDeal', 'mean':'avgMonDeal', 'count':'transaction_times'}, inplace = True)
df3.head(5)
|
DealTime |
bf_StudentID |
AccName |
PerSex |
MonDeal |
avgMonDeal |
transaction_times |
0 |
2018-07-01 |
13983 |
裘某某 |
男 |
-3.7 |
-3.70 |
1 |
1 |
2018-07-01 |
14018 |
虞某某 |
男 |
-9.5 |
-9.50 |
1 |
2 |
2018-07-01 |
14073 |
刘某某 |
男 |
-8.0 |
-8.00 |
1 |
3 |
2018-07-01 |
14074 |
周某某 |
男 |
-14.3 |
-7.15 |
2 |
4 |
2018-07-01 |
14097 |
毛某某 |
男 |
-10.0 |
-10.00 |
1 |
df3["month"]=df3.DealTime.values.astype("datetime64[M]")
df3.tail(2)
|
DealTime |
bf_StudentID |
AccName |
PerSex |
MonDeal |
avgMonDeal |
transaction_times |
month |
164420 |
2019-01-27 |
16150 |
陈某某 |
男 |
-4.5 |
-4.5 |
1 |
2019-01-01 |
164421 |
2019-01-30 |
15472 |
陈某某 |
男 |
-5.8 |
-5.8 |
1 |
2019-01-01 |
student_group1=df3.groupby(["bf_StudentID","AccName","PerSex"])["MonDeal"].agg(["sum","mean","count"])
print(student_group1.head(3))
print ('\n'*1)
print(student_group1.columns)
sum mean count
bf_StudentID AccName PerSex
13012 张某某 女 -381.1 -11.909375 32
13564 吴某某 男 -3272.8 -26.393548 124
13599 曹某某 男 -1686.8 -17.755789 95
Index(['sum', 'mean', 'count'], dtype='object')
student_group1.columns
Index(['sum', 'mean', 'count'], dtype='object')
student_group2=student_group1.reset_index()
print(student_group2.head(3))
print ('\n'*1)
print(student_group2.columns)
print ('\n'*1)
print(student_group2.shape)
bf_StudentID AccName PerSex sum mean count
0 13012 张某某 女 -381.1 -11.909375 32
1 13564 吴某某 男 -3272.8 -26.393548 124
2 13599 曹某某 男 -1686.8 -17.755789 95
Index(['bf_StudentID', 'AccName', 'PerSex', 'sum', 'mean', 'count'], dtype='object')
(1730, 6)
student_group2.describe()
|
bf_StudentID |
sum |
mean |
count |
count |
1730.000000 |
1730.000000 |
1730.000000 |
1730.000000 |
mean |
15010.860116 |
-2244.936717 |
-23.293647 |
95.041618 |
std |
710.216323 |
860.371730 |
7.495143 |
18.722765 |
min |
13012.000000 |
-6239.950000 |
-100.000000 |
1.000000 |
25% |
14386.250000 |
-2790.475000 |
-27.545710 |
91.000000 |
50% |
14834.500000 |
-2216.500000 |
-22.793650 |
98.000000 |
75% |
15721.750000 |
-1698.650000 |
-18.324731 |
105.000000 |
max |
16162.000000 |
-9.000000 |
-4.950000 |
146.000000 |
student_group3=student_group2.sort_values(by="sum",ascending=True)
print(student_group3.head(5))
print ('\n'*1)
print(student_group3.tail(5))
bf_StudentID AccName PerSex sum mean count
1132 15556 陶某某 男 -6239.95 -66.382447 94
176 14123 汤某某 男 -5698.20 -53.756604 106
1593 16024 余某某 男 -5125.69 -50.251863 102
1607 16038 王某某 男 -5080.40 -49.807843 102
1113 15537 潘某某 男 -4798.80 -47.047059 102
bf_StudentID AccName PerSex sum mean count
922 14892 林某某 男 -32.7 -16.35 2
12 13956 陆某某 男 -11.0 -11.00 1
581 14546 俞某某 男 -10.0 -10.00 1
412 14363 钱某某 女 -9.5 -9.50 1
22 13967 权某某 女 -9.0 -9.00 1
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from matplotlib.font_manager import FontProperties
myfont=FontProperties(fname=r'C:\Windows\Fonts\simhei.ttf',size=14)
sns.set(font=myfont.get_name())
x=student_group3["count"]
sns.distplot(x)


plt.hist(x, bins=15, color=sns.desaturate("indianred", .9), alpha=.8)
plt.xlabel("在校天数(消费天数)",fontproperties="SimHei",fontsize=15)
plt.ylabel("人数",fontsize=15)
plt.title("2018.7-2019.1学生在校时长分布",fontsize=18)
plt.show()


x1=student_group3["sum"]
x2=student_group3["mean"]
kwargs = dict(histtype='stepfilled',alpha=0.8, bins=15)
plt.hist(x1, **kwargs)
plt.xlabel("各学生总消费金额",fontproperties="SimHei",fontsize=15)
plt.ylabel("人数",fontsize=15)
plt.title("2018.7-2019.1学生总消费情况",fontsize=18)
plt.show()


plt.hist(x2,bins=18)
plt.xlabel("各学生日均消费金额",fontproperties="SimHei",fontsize=15)
plt.ylabel("人数",fontsize=15)
plt.title("2018.7-2019.1学生日均消费情况",fontsize=18)
plt.show()


counts, bin_edges = np.histogram(x2, bins=18)
print(counts)
[ 1 0 0 0 0 0 2 0 2 5 12 51 131 339 508 453 192 34]
df3.groupby("month").MonDeal.sum().plot()
plt.show()


df3.groupby("month").transaction_times.sum().plot()
plt.show()


df3.groupby("bf_StudentID").sum().plot.scatter(x="MonDeal",y="transaction_times")
plt.show()


plt.figure(figsize=(12,4))
plt.subplot(121)
df3.groupby("bf_StudentID").MonDeal.sum().hist(bins=30)
plt.xlabel("学生消费总金额",fontsize=15)
plt.title("2018.7-2019.1学生消费总金额分布",fontsize=18)
plt.subplot(122)
df3.groupby("bf_StudentID").transaction_times.sum().hist(bins=30)
plt.ylabel("学生消费总次数",fontsize=15)
plt.title("2018.7-2019.1学生消费总次数分布",fontsize=18)
plt.show()


df3.groupby("bf_StudentID").month.min().value_counts()
2018-07-01 918
2018-08-01 570
2018-09-01 231
2018-12-01 4
2018-11-01 4
2018-10-01 3
Name: month, dtype: int64
df3.groupby("bf_StudentID").month.max().value_counts()
2019-01-01 1692
2018-12-01 13
2018-10-01 9
2018-11-01 8
2018-09-01 5
2018-07-01 2
2018-08-01 1
Name: month, dtype: int64
pivoted_counts=df3.pivot_table(index="bf_StudentID",columns="month",values="transaction_times",aggfunc="sum").fillna(0)
columns_month=df3.month.sort_values().astype("str").unique()
pivoted_counts.columns=columns_month
pivoted_counts.head()
|
2018-07-01 |
2018-08-01 |
2018-09-01 |
2018-10-01 |
2018-11-01 |
2018-12-01 |
2019-01-01 |
bf_StudentID |
|
|
|
|
|
|
|
13012 |
0.0 |
0.0 |
10.0 |
10.0 |
15.0 |
7.0 |
9.0 |
13564 |
17.0 |
25.0 |
82.0 |
63.0 |
69.0 |
75.0 |
50.0 |
13599 |
8.0 |
10.0 |
39.0 |
33.0 |
34.0 |
31.0 |
26.0 |
13685 |
8.0 |
12.0 |
28.0 |
33.0 |
34.0 |
39.0 |
17.0 |
13947 |
9.0 |
22.0 |
81.0 |
64.0 |
66.0 |
72.0 |
63.0 |
pivoted_counts_transf=pivoted_counts.applymap(lambda x: 1 if x>59 else np.NaN if x==0 else 0)
pivoted_counts_transf.head()
|
2018-07-01 |
2018-08-01 |
2018-09-01 |
2018-10-01 |
2018-11-01 |
2018-12-01 |
2019-01-01 |
bf_StudentID |
|
|
|
|
|
|
|
13012 |
NaN |
NaN |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
13564 |
0.0 |
0.0 |
1.0 |
1.0 |
1.0 |
1.0 |
0.0 |
13599 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
13685 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
13947 |
0.0 |
0.0 |
1.0 |
1.0 |
1.0 |
1.0 |
1.0 |
(pivoted_counts_transf.sum()/pivoted_counts_transf.count()).plot(figsize=(10,4))
plt.xlabel('时间(月)', fontsize=18)
plt.ylabel('百分比(%)', fontsize=18)
plt.title('各月学生爱饭堂率', fontsize=18)
plt.show()


pivoted_MonDeal=df3.pivot_table(
index="bf_StudentID",columns="month",values="MonDeal",aggfunc="mean").fillna(0)
columns_month=df3.month.sort_values().astype("str").unique()
pivoted_MonDeal.columns=columns_month
pivoted_MonDeal.head()
|
2018-07-01 |
2018-08-01 |
2018-09-01 |
2018-10-01 |
2018-11-01 |
2018-12-01 |
2019-01-01 |
bf_StudentID |
|
|
|
|
|
|
|
13012 |
0.000000 |
0.000000 |
-12.720000 |
-11.483333 |
-11.844444 |
-11.016667 |
-12.650000 |
13564 |
-30.080000 |
-20.180000 |
-27.912000 |
-28.309524 |
-27.842500 |
-27.668750 |
-21.442105 |
13599 |
-26.450000 |
-13.333333 |
-20.478947 |
-16.734375 |
-16.116667 |
-16.238235 |
-19.833333 |
13685 |
-26.375000 |
-15.122222 |
-16.628235 |
-20.656250 |
-20.747059 |
-23.277778 |
-28.556250 |
13947 |
-19.566667 |
-15.550000 |
-28.981818 |
-22.380000 |
-29.767500 |
-24.147826 |
-31.069444 |
pivoted_purchase=pivoted_MonDeal.applymap(lambda x: 0 if x==0 else 1)
pivoted_purchase.head()
|
2018-07-01 |
2018-08-01 |
2018-09-01 |
2018-10-01 |
2018-11-01 |
2018-12-01 |
2019-01-01 |
bf_StudentID |
|
|
|
|
|
|
|
13012 |
0 |
0 |
1 |
1 |
1 |
1 |
1 |
13564 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
13599 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
13685 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
13947 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
def purchase_return(data):
status = []
for i in range(6):
if data[i] == 1:
if data[i+1] == 1:
status.append(1)
if data[i+1] == 0:
status.append(0)
else:
status.append(np.NaN)
status.append(np.NaN)
return status
pivoted_purchase_return = pivoted_purchase.apply(purchase_return,axis=1)
pivoted_purchase_return.head(5)
bf_StudentID
13012 [nan, nan, 1, 1, 1, 1, nan]
13564 [1, 1, 1, 1, 1, 1, nan]
13599 [1, 1, 1, 1, 1, 1, nan]
13685 [1, 1, 1, 1, 1, 1, nan]
13947 [1, 1, 1, 1, 1, 1, nan]
dtype: object
pivoted_purchase_return_mean = (pivoted_purchase_return.sum()/pivoted_purchase_return.count())
pivoted_purchase_return_mean.plot(figsize=(12,4))
plt.xlabel('时间(月)', fontsize=18)
plt.ylabel('百分比(%)', fontsize=18)
plt.title('各月学生复购率', fontsize=18)
plt.show()
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-163-45678b38adb8> in <module>()
1 pivoted_purchase_return_mean = (pivoted_purchase_return.sum()/pivoted_purchase_return.count())
----> 2 pivoted_purchase_return_mean.plot()
3
4 plt.xlabel('时间(月)', fontsize=18)
5 plt.ylabel('百分比(%)', fontsize=18)
AttributeError: 'numpy.ndarray' object has no attribute 'plot'
下篇续:用Python分析用户消费行为 Student Comsumption Analysis ② https://blog.****.net/weixin_44216391/article/details/89329804