爬取智联招聘数据--数据清洗和可视化

我在智联招聘上选择了“互联网行业”“实习生”“全国”这几个筛选条件，并爬取了对应的数据。上面列出的城市是全国实习生平均月薪最高的前30个。很奇怪，我本来以为排在前面的肯定会是北上广深杭，结果大多是北京周边的城市。

import numpy as npy

import pandas as pda
import matplotlib.pyplot as plt
import pymysql
import re
from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

# Load the scraped job postings from the local MySQL database.
# NOTE: when only a few columns are needed, select just those columns
# instead of `*`; operating on the full table is slower.
conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root", db="zhilian", charset="utf8")
sql = "select * from zhaopin;"
try:
    dataframe = pda.read_sql(sql, conn)
finally:
    # Release the connection even if the query fails; nothing below uses it.
    conn.close()

# Use the `Id` column as the row index and order the rows by it.
dataframe = dataframe.set_index('Id').sort_index()
dataframe_sort = dataframe  # kept so the original name still resolves
dataframe.info()
# A bare expression is discarded in a script, so print the preview of the
# `gzdd` and `zwyx` columns explicitly.
print(dataframe[['gzdd', 'zwyx']].head(10))
# Clean the monthly-salary column (`zwyx`): turn each salary text into
# `bottom` / `top` / `average` string columns so later steps can work on them.
#
# The new values are collected in plain lists and assigned as whole columns
# afterwards. Writing through `dataframe[col].iloc[i] = ...` is chained
# assignment, which pandas does not guarantee to write back to the frame.
pat = '([0-9]+)'
number_re = re.compile(pat)  # compiled once instead of once per row
# c1: ranges such as "4000-6000"; c2: single number such as "4000以下";
# c3: no digits at all; c4: values that are not strings.
c1 = c2 = c3 = c4 = 0
bottoms, tops, averages = [], [], []
for raw in dataframe['zwyx']:
    try:
        # strip() without arguments removes leading/trailing whitespace.
        item = raw.strip()
    except AttributeError as e:
        # Not a string: report it and keep the raw value in all three columns.
        c4 += 1
        print(c4, raw, repr(e))
        bottoms.append(raw)
        tops.append(raw)
        averages.append(raw)
        continue

    result = number_re.findall(item)
    if len(result) >= 2:
        # Two numbers found: treat the first two as a range, e.g. "4000-6000".
        low, high = result[0], result[1]
        average = str((int(low) + int(high)) / 2)
        c1 += 1
    elif result:
        # Only one number, e.g. "4000以下": use it for all three values.
        low = high = result[0]
        average = str(int(result[0]))
        c2 += 1
    else:
        # No digits: carry the stripped text through unchanged.
        low = high = average = item
        c3 += 1
    bottoms.append(low)
    tops.append(high)
    averages.append(average)

dataframe['bottom'] = bottoms
dataframe['top'] = tops
dataframe['average'] = averages
# One summary line instead of printing the running counter for every row.
print(c1, c2, c3, c4)
# Convert the textual `average` column into floats stored in `re_average`.
# Entries without a number become NaN, which pandas skips when aggregating.
#
# The pattern accepts an optional decimal part so "4500.5" is read as 4500.5
# rather than being cut off at the decimal point.
pat = re.compile(r'[0-9]+(?:\.[0-9]+)?')
aver = []
for item in dataframe['average']:
    if not isinstance(item, str):
        # Non-text values cannot be parsed; record them as missing.
        print(item, type(item))
        aver.append(npy.nan)
        continue
    item = item.strip()
    match = pat.search(item)
    if match:
        aver.append(float(match.group()))
    else:
        if item != '面议':
            # Unexpected non-numeric text: report it for inspection.
            print(item)
        # Always append, so `aver` stays the same length as the frame and the
        # column assignment below cannot fail with a length mismatch.
        aver.append(npy.nan)

dataframe['re_average'] = aver
num_aver = len(aver)
print(num_aver)

# Group the numeric average salary by work location (`gzdd`).
grouped = dataframe['re_average'].groupby(dataframe['gzdd'])

# One-element Series holding the mean over all rows, labelled 'average',
# so it can be listed alongside the per-location means.
s = pda.Series(data={'average': dataframe['re_average'].mean()})

# Series.append was removed in pandas 2.0; pandas.concat is the replacement.
res = pda.concat([grouped.mean(), s])

# Show the means in descending order, rounded to one decimal place.
# (As a bare expression the result was discarded when run as a script.)
print(res.sort_values(ascending=False).round(1))
# Horizontal bar chart of the 30 highest mean salaries in `res`.
plt.style.use('ggplot')

# Figure (the background canvas) and its single subplot.
fig3 = plt.figure(3, facecolor='#458B74')
ax3 = fig3.add_subplot(1, 1, 1, facecolor='#66CDAA', alpha=0.3)

# Sort, round and slice once; the same Series feeds both the bars and the
# value labels drawn next to them.
label3 = res.sort_values(ascending=False).round(1).iloc[0:30]
label3.plot(kind='barh', rot=0, ax=ax3)

# Title and axis labels.
title = plt.title("城市-平均月薪分布图", fontsize=18, color='#F0FFFF')
xlabel = plt.xlabel('平均月薪', fontsize=14, color='#F0FFFF')
ylabel = plt.ylabel('城市', fontsize=14, color='#F0FFFF')

# NOTE(review): the sample count and the text position are hard-coded;
# confirm they still match the data being plotted.
text = plt.text(27500, 6.05, '月薪样本数:15406', fontsize=16, color='#FFE4E1')
plt.tick_params(colors='#F0F8FF')  # tick colour

# Write each value at the end of its bar. Iterating over the values avoids
# integer lookups on a label-indexed Series, which pandas has deprecated.
for i, value in enumerate(label3.to_numpy()):
    plt.text(value, i - 0.3, str(value), color='#F0F8FF')

plt.show()

这是我在知乎上看到 Lyon 的《智联python相关职位的数据分析及可视化-pandas&Matplotlib篇》之后，跟着学习做出来的。

类似的还可以做出：

爬取智联招聘数据--数据清洗和可视化

import numpy as npy
import pandas as pda
import matplotlib.pyplot as plt
import pymysql
from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']

mpl.rcParams['axes.unicode_minus'] = False

#上面是为了能够输出中文

# Load only the three columns this chart needs.
conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root", db="zhilian", charset="utf8")
sql = "select gsgm,zprs,gzdd from zhaopin;"
try:
    dataframe = pda.read_sql(sql, conn)
finally:
    # Release the connection even if the query fails; nothing below uses it.
    conn.close()

# Independent copy with the same column order as before (zprs, gsgm, gzdd).
df1 = dataframe[['zprs', 'gsgm', 'gzdd']].copy()

# Group the number of openings (`zprs`) by company size (`gsgm`) and
# work city (`gzdd`).
grouped1 = df1['zprs'].groupby([df1['gsgm'], df1.gzdd])

# The (up to) ten cities that occur most often, most frequent first.
xlist = list(dataframe.gzdd.value_counts().index[0:10])

# Dark theme, then figure number 7 with one subplot for the grouped bars.
plt.style.use('dark_background')
fig1 = plt.figure(7, facecolor='#458B74')
ax1 = fig1.add_subplot(1, 1, 1, facecolor='#66CDAA', alpha=0.3)

# Title and axis labels; ax1 is the current axes at this point.
title = plt.title(
    '招聘人数-公司规模-城市分布图',
    fontsize=18,
    color='#F0FFFF',
)
xlabel = plt.xlabel('城市', fontsize=14, color='#F0FFFF')
ylabel = plt.ylabel('招聘人数', fontsize=14, color='#F0FFFF')

# Tick colour.
plt.tick_params(colors='#F0F8FF')

# Mean number of openings per (company size, city), rounded to whole numbers
# because it is a head count. The aggregation runs once and is pivoted into a
# city x company-size table; reindexing puts the rows in `xlist` order and
# fills any missing city or company-size category with NaN instead of
# raising KeyError.
size_labels = ['20人以下', '20-99人', '100-499人', '500-999人', '1000-9999人', '10000人以上', '保密']
mean_table = (
    grouped1.mean()
    .round(0)
    .unstack(level=0)
    .reindex(index=xlist, columns=size_labels)
)
ylist1, ylist2, ylist3, ylist4, ylist5, ylist6, ylist7 = (
    mean_table[label].values for label in size_labels
)

# Draw one bar per company-size category for every city in `xlist`.
# bar() needs x positions and heights of equal length, so the positions are
# derived from `xlist` rather than a fixed 10.
xwidth = npy.arange(len(xlist))
width = 0.1
heights_by_size = (ylist1, ylist2, ylist3, ylist4, ylist5, ylist6, ylist7)

# Each category is shifted right by one bar width from the previous one.
bars = [
    ax1.bar(xwidth + width * k, heights, width)
    for k, heights in enumerate(heights_by_size)
]
img1, img2, img3, img4, img5, img6, img7 = bars

# Set tick positions first, then their labels, so each label is bound to a
# fixed position. The tick sits under the middle of each group of bars.
ax1.set_xticks(xwidth + width * (len(heights_by_size) - 1) / 2)
ax1.set_xticklabels(xlist)

# Legend.
ax1.legend(bars, ('20人以下','20-99人','100-499人','500-999人','1000-9999人','10000人以上','保密'), fontsize=13, facecolor='#F0F8FF')

# Grid.
plt.grid(True)
plt.show()

注意：在 ax1.bar(xwidth+width*4,ylist5,width) 这样的调用中，bar 函数的前两个参数都是数组（序列），而且二者的长度一定要相同，否则会报错（类似 ValueError: shape mismatch: objects cannot be broadcast to a single shape）：

爬取智联招聘数据--数据清洗和可视化

爬取智联招聘数据--数据清洗和可视化

相关推荐