python大规模机器学习day4-关注实例顺序

关注实例顺序

实验要求:
1.使用shuffle获得更适合在线随机学习的最优随机顺序
实验内容:
1.使用shuffle函数
2.使用zlib包读和存压缩数据
代码注释:
import zlib  # zlib provides lossless data compression/decompression
from random import shuffle  # shuffle randomly reorders a list in place
import os


def ram_shuffle(filename_in, filename_out, header=True):
    """Shuffle the rows of a text file entirely in RAM.

    Every line is zlib-compressed (level 9, maximum compression) so that
    a large file still fits in memory; the compressed lines are shuffled
    and then decompressed back to disk in the new random order.

    Args:
        filename_in: path of the file whose rows are to be shuffled.
        filename_out: path where the shuffled copy is written.
        header: when True, the first row is treated as a header and is
            kept as the first row of the output (excluded from the shuffle).
    """
    with open(filename_in, 'rb') as f:  # 'rb': read the raw bytes of each line
        # zlib.compress shrinks each line; it does NOT remove characters.
        zlines = [zlib.compress(line, 9) for line in f]
    if header:
        first_row = zlines.pop(0)  # pop(0) removes and returns the header row
    shuffle(zlines)  # in-place random permutation of the remaining rows
    with open(filename_out, 'wb') as f:
        if header:
            f.write(zlib.decompress(first_row))
        for zline in zlines:
            f.write(zlib.decompress(zline))


if __name__ == '__main__':
    local_path = os.getcwd()  # current working directory
    # os.path.join is portable; string concatenation with '\' is not.
    source = os.path.join('bikesharing', 'hour.csv')
    ram_shuffle(filename_in=os.path.join(local_path, source),
                filename_out=os.path.join(local_path, 'bikesharing',
                                          'shuffled_hour.csv'),
                header=True)

代码2:
from random import shuffle
import pandas as pd
import numpy as np
import os


def disk_shuffle(filename_in, filename_out, header=True, iterations=3,
                 CHUNK_SIZE=2500, SEP=','):
    """Shuffle a CSV file on disk, chunk by chunk, without loading it in RAM.

    Each pass reads the file in chunks, permutes the rows inside every
    chunk, writes each chunk to a temporary ``<n>_chunk.csv`` file in the
    current directory, then concatenates the chunk files in random order.
    Later passes re-shuffle the output with a halved chunk size so rows
    migrate across chunk boundaries.

    Args:
        filename_in: path of the CSV to shuffle.
        filename_out: path of the shuffled output (also re-read by later passes).
        header: when True, the first row is a header and stays first.
        iterations: number of shuffle passes over the data.
        CHUNK_SIZE: rows per chunk on the first pass (halved each pass).
        SEP: field separator used when rewriting chunks.
    """
    for _ in range(iterations):
        with open(filename_in, 'rb') as R:
            # chunksize makes read_csv return an iterator of DataFrames.
            iterator = pd.read_csv(R, chunksize=CHUNK_SIZE)
            for n, df in enumerate(iterator):
                if n == 0 and header:
                    header_cols = SEP.join(df.columns) + '\n'
                # np.random.permutation yields a random row order; iloc
                # reindexes the chunk with it before dumping to a temp file
                # (header/index suppressed so chunks concatenate cleanly).
                df.iloc[np.random.permutation(len(df))].to_csv(
                    str(n) + '_chunk.csv', index=False, header=False, sep=SEP)
        ordering = list(range(0, n + 1))
        shuffle(ordering)  # random order in which chunk files are merged
        with open(filename_out, 'w') as W:
            if header:
                W.write(header_cols)
            for f in ordering:
                with open(str(f) + '_chunk.csv', 'r') as R:
                    for line in R:
                        W.write(line)
                os.remove(str(f) + '_chunk.csv')  # clean up the temp chunk
        # Next pass re-shuffles the freshly written output with smaller chunks.
        filename_in = filename_out
        CHUNK_SIZE = int(CHUNK_SIZE / 2)


if __name__ == '__main__':
    local_path = os.getcwd()
    # os.path.join is portable; string concatenation with '\' is not.
    source = os.path.join('bikesharing', 'hour.csv')
    disk_shuffle(filename_in=os.path.join(local_path, source),
                 filename_out=os.path.join(local_path, 'bikesharing',
                                           'shuffled_hour.csv'),
                 header=True)

运行截图:
python大规模机器学习day4-关注实例顺序
文件中新创建了一个shuffled_hour文件
python大规模机器学习day4-关注实例顺序
可见,实例的顺序已经被打乱
python大规模机器学习day4-关注实例顺序
在代码进行一半时,先创建6个分块的已经重新洗牌的文件,之后会以随机顺序合成一个大的文件,然后删除这些分块文件
python大规模机器学习day4-关注实例顺序python大规模机器学习day4-关注实例顺序python大规模机器学习day4-关注实例顺序
整合好的文件
源代码
import zlib
from random import shuffle
import os


def ram_shuffle(filename_in, filename_out, header=True):
    """Shuffle the rows of a text file entirely in RAM.

    Lines are zlib-compressed (level 9) to keep the in-memory footprint
    small, shuffled, and decompressed back to disk.

    Args:
        filename_in: path of the file whose rows are to be shuffled.
        filename_out: path where the shuffled copy is written.
        header: when True, the first row stays first (not shuffled).
    """
    with open(filename_in, 'rb') as f:
        zlines = [zlib.compress(line, 9) for line in f]
    if header:
        first_row = zlines.pop(0)  # keep the header out of the shuffle
    shuffle(zlines)
    with open(filename_out, 'wb') as f:
        if header:
            f.write(zlib.decompress(first_row))
        for zline in zlines:
            f.write(zlib.decompress(zline))


if __name__ == '__main__':
    local_path = os.getcwd()
    source = os.path.join('bikesharing', 'hour.csv')
    ram_shuffle(filename_in=os.path.join(local_path, source),
                filename_out=os.path.join(local_path, 'bikesharing',
                                          'shuffled_hour.csv'),
                header=True)

代码2:
from random import shuffle
import pandas as pd
import numpy as np
import os


def disk_shuffle(filename_in, filename_out, header=True, iterations=3,
                 CHUNK_SIZE=2500, SEP=','):
    """Shuffle a CSV file on disk, chunk by chunk, without loading it in RAM.

    Each pass permutes rows within chunks, writes the chunks to temporary
    ``<n>_chunk.csv`` files, concatenates them in random order, and then
    repeats on the output with a halved chunk size.

    Args:
        filename_in: path of the CSV to shuffle.
        filename_out: path of the shuffled output.
        header: when True, the first row is a header and stays first.
        iterations: number of shuffle passes.
        CHUNK_SIZE: rows per chunk on the first pass (halved each pass).
        SEP: field separator for rewritten chunks.
    """
    for _ in range(iterations):
        with open(filename_in, 'rb') as R:
            iterator = pd.read_csv(R, chunksize=CHUNK_SIZE)
            for n, df in enumerate(iterator):
                if n == 0 and header:
                    header_cols = SEP.join(df.columns) + '\n'
                # Permute this chunk's rows and dump them to a temp file.
                df.iloc[np.random.permutation(len(df))].to_csv(
                    str(n) + '_chunk.csv', index=False, header=False, sep=SEP)
        ordering = list(range(0, n + 1))
        shuffle(ordering)
        with open(filename_out, 'w') as W:
            if header:
                W.write(header_cols)
            for f in ordering:
                with open(str(f) + '_chunk.csv', 'r') as R:
                    for line in R:
                        W.write(line)
                os.remove(str(f) + '_chunk.csv')
        filename_in = filename_out
        CHUNK_SIZE = int(CHUNK_SIZE / 2)


if __name__ == '__main__':
    local_path = os.getcwd()
    source = os.path.join('bikesharing', 'hour.csv')
    disk_shuffle(filename_in=os.path.join(local_path, source),
                 filename_out=os.path.join(local_path, 'bikesharing',
                                           'shuffled_hour.csv'),
                 header=True)

实验总结:
用了两种方法来对文件打乱顺序,一种是将整个文件打乱顺序,第二种是将文件分成几个小文件,打乱小文件的顺序后再随机顺序合成。