爬取智联招聘数据--数据清洗和可视化
我选择了互联网行业、实习生、全国这几个类别,从智联招聘爬取数据;上面列出的是全国实习生平均月薪最高的前30个城市。很奇怪,我本来以为肯定会是北上广深杭,结果大多是北京周边城市。
import re

import matplotlib.pyplot as plt
import numpy as npy
import pandas as pda
import pymysql
from pylab import *
# Make matplotlib render Chinese text (SimHei font) and keep the minus sign
# drawable while that font is active.
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

# NOTE: when only a few columns are needed, select just those columns;
# working on the full table is noticeably slower.
conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root", db="zhilian", charset="utf8")
sql = "select * from zhaopin;"
try:
    dataframe = pda.read_sql(sql, conn)
finally:
    # read_sql has already materialised the rows, so the connection can be
    # released whether or not the query succeeded.
    conn.close()

# Use the "Id" column as the index (dropping it from the columns) and order
# the rows by it.
dataframe = dataframe.set_index('Id').sort_index()
dataframe_sort = dataframe  # same object, kept under the original second name
dataframe.info()
# Preview work location ("gzdd") and monthly salary ("zwyx"); printed
# explicitly because a bare expression shows nothing when run as a script.
print(dataframe[['gzdd', 'zwyx']].head(10))
# Clean the monthly-salary column "zwyx": turn a textual range such as
# "4000-6000" into three string columns, "bottom", "top" and "average", so
# later steps can work with a single average figure per posting.
pat = '([0-9]+)'
salary_number = re.compile(pat)  # compiled once instead of once per row


def _split_salary(text):
    """Split one stripped salary string into ``(bottom, top, average, kind)``.

    ``kind`` is ``'range'`` when at least two numbers are present (the first
    two are used), ``'single'`` when exactly one number is present, and
    ``'text'`` when there is no number at all, in which case the text itself
    is returned for all three values.  All values are strings.
    """
    numbers = salary_number.findall(text)
    if len(numbers) >= 2:
        low, high = numbers[0], numbers[1]
        return low, high, str((int(low) + int(high)) / 2), 'range'
    if numbers:
        only = numbers[0]
        return only, only, str(int(only)), 'single'
    return text, text, text, 'text'


# c1: ranges ("4000-6000"), c2: single numbers ("4000以下"),
# c3: no number at all, c4: values that could not be processed.
c1 = c2 = c3 = c4 = 0
bottoms, tops, averages = [], [], []
for raw in dataframe['zwyx']:
    try:
        low, high, mean, kind = _split_salary(raw.strip())
    except AttributeError as e:
        # Not a string (no .strip()); keep the raw value in all three columns
        # and report it instead of aborting the whole clean-up.
        c4 += 1
        print(c4, raw, repr(e))
        low = high = mean = raw
    else:
        if kind == 'range':
            c1 += 1
        elif kind == 'single':
            c2 += 1
        else:
            c3 += 1
    bottoms.append(low)
    tops.append(high)
    averages.append(mean)

# Assign whole columns at once.  Writing through dataframe[col].iloc[i] is
# chained indexing: pandas may apply it to a temporary copy, and with
# copy-on-write it never reaches the frame.
dataframe['bottom'] = bottoms
dataframe['top'] = tops
dataframe['average'] = averages
print(c1, c2, c3, c4)
# Build the numeric column "re_average" from the textual "average" column.
# Every row gets exactly one value: the leading number as a float, or NaN
# when there is none (NaN is ignored by the later mean computations).  This
# keeps the result the same length as the frame even when a value is neither
# a number nor '面议'.
# The pattern accepts an optional fractional part so an average such as
# "5000.5" is not truncated to 5000.
pat = re.compile(r'([0-9]+(?:\.[0-9]+)?)')
average_text = dataframe['average'].astype(str).str.strip()
re_average = average_text.str.extract(pat.pattern, expand=False).astype(float)

# Report values that are neither numeric nor the expected '面议' marker so
# unexpected formats stay visible.
unrecognised = average_text[re_average.isna() & (average_text != '面议')]
for item in unrecognised:
    print(item)

dataframe['re_average'] = re_average
aver = re_average.tolist()
num_aver = len(aver)
print(num_aver)
# Mean of "re_average" per work location ("gzdd").
grouped = dataframe['re_average'].groupby(dataframe['gzdd'])
# One extra entry labelled 'average' holding the mean over all rows, so it
# can be ranked alongside the per-location means.
s = pda.Series(data={'average': dataframe['re_average'].mean()})
# Series.append was removed in pandas 2.0; concat is the supported way to
# join the two Series.
res = pda.concat([grouped.mean(), s])
# Show the ranking in descending order, rounded to one decimal place.
print(res.sort_values(ascending=False).round(1))
# Horizontal bar chart of the 30 highest entries of `res`.
matplotlib.style.use('ggplot')
# Figure (the outer background) and a single subplot inside it.
fig3 = plt.figure(3, facecolor='#458B74')
ax3 = fig3.add_subplot(1, 1, 1, facecolor='#66CDAA', alpha=0.3)
# Sort once and reuse the result for both the bars and their value labels.
label3 = res.sort_values(ascending=False).round(1).head(30)
label3.plot(kind='barh', rot=0, ax=ax3)
# Title and axis labels.
title = plt.title("城市-平均月薪分布图", fontsize=18, color='#F0FFFF')
xlabel = plt.xlabel('平均月薪', fontsize=14, color='#F0FFFF')
ylabel = plt.ylabel('城市', fontsize=14, color='#F0FFFF')
# Use the count computed during cleaning instead of a hard-coded number that
# goes stale whenever the table changes.
text = plt.text(27500, 6.05, '月薪样本数:{}'.format(num_aver), fontsize=16, color='#FFE4E1')
plt.tick_params(colors='#F0F8FF')  # tick colour
# Write each value next to the end of its bar.  Iterate by position: label3
# is indexed by name, so an integer key is not a reliable lookup.
for row, value in enumerate(label3.values):
    plt.text(value, row - 0.3, str(value), color='#F0F8FF')
plt.show()
这是我在知乎上看到 Lyon 的《智联python相关职位的数据分析及可视化-pandas&Matplotlib篇》之后学习的。
类似的还可以做出:
import numpy as npy
import pandas as pda
import matplotlib.pyplot as plt
import pymysql
from pylab import *
# The two settings below let matplotlib output Chinese text.
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root", db="zhilian", charset="utf8")
sql = "select gsgm,zprs,gzdd from zhaopin;"
try:
    dataframe = pda.read_sql(sql, conn)
finally:
    # The rows are already in memory; release the connection either way.
    conn.close()

df1 = pda.DataFrame(data={'zprs': dataframe['zprs'], 'gsgm': dataframe['gsgm'], 'gzdd': dataframe['gzdd']})
# Group the "zprs" values by ("gsgm", "gzdd"); the chart labels these as
# headcount, company size and city respectively.
grouped1 = df1['zprs'].groupby([df1['gsgm'], df1.gzdd])
# The ten most frequent "gzdd" values, most frequent first.
xlist = list(dataframe.gzdd.value_counts().index[0:10])
# Grouped bar chart: mean "zprs" per company size for each city in xlist.
matplotlib.style.use('dark_background')
fig1 = plt.figure(7, facecolor='#458B74')
ax1 = fig1.add_subplot(1, 1, 1, facecolor='#66CDAA', alpha=0.3)
title = plt.title('招聘人数-公司规模-城市分布图', fontsize=18, color='#F0FFFF')
xlabel = plt.xlabel('城市', fontsize=14, color='#F0FFFF')
ylabel = plt.ylabel('招聘人数', fontsize=14, color='#F0FFFF')
plt.tick_params(colors='#F0F8FF')

sizes = ['20人以下', '20-99人', '100-499人', '500-999人', '1000-9999人', '10000人以上', '保密']
# Compute the group means once, rounded to whole numbers, and pivot them to
# a table with one row per city and one column per company size.  reindex
# orders rows like xlist and fills any missing city or size with NaN, so an
# absent category leaves a gap instead of raising KeyError.
mean_by_size = (
    grouped1.mean()
    .round(0)
    .unstack(level=0)
    .reindex(index=xlist, columns=sizes)
)

# bar() needs x positions and heights of equal length, so derive the
# positions from xlist rather than assuming exactly ten cities.
xwidth = npy.arange(len(xlist))
width = 0.1
bars = [
    ax1.bar(xwidth + width * offset, mean_by_size[size].values, width)
    for offset, size in enumerate(sizes)
]

# Fix the tick positions first, then label them; put each tick under the
# middle of its group of bars.
ax1.set_xticks(xwidth + width * (len(sizes) - 1) / 2)
ax1.set_xticklabels(xlist)
# Legend: one entry per company size.
ax1.legend(bars, sizes, fontsize=13, facecolor='#F0F8FF')
# Grid lines.
plt.grid(True)
plt.show()
注意:ax1.bar(xwidth+width*4,ylist5,width),bar函数里前两个参数都是一个数组(序列),而且长度一定要相同,否则会出现这个错误: