python生成训练集和测试集,原数据以txt给出
源数据:
每行各列以空格分隔,第二列为id号;
将以txt存储的图片列表划分为train集和test集:对每个id,按其图片总数的7:3分配到train和test(图片数少于20的id不参与划分),实现代码如下所示:
from collections import Counter

# Fill these in before running the script.
path = ''             # input list: one image per line, id in the second column
save_train_path = ''  # output list for the training split
save_val_path = ''    # output list for the validation/test split


def _image_id(line):
    """Return the id (second whitespace-separated column) of *line*, or None."""
    fields = line.split()
    return fields[1] if len(fields) > 1 else None


def split_train_val(path, save_train_path, save_val_path,
                    min_images=20, train_ratio=0.7, encoding='utf-8'):
    """Split an image list file into train and val lists, per id.

    Each input line describes one image; its second whitespace-separated
    column is the id (label). For every id that owns at least *min_images*
    lines, the first ``int(count * train_ratio)`` lines (in file order) are
    written to *save_train_path* and the remaining ones to *save_val_path*.
    Lines of ids with fewer images, and lines without a second column, are
    dropped from both outputs.

    Lines are copied verbatim (a missing final newline is added), so the
    output files keep the input format.

    Args:
        path: input list file.
        save_train_path: file that receives the training lines.
        save_val_path: file that receives the validation lines.
        min_images: minimum number of images an id needs to be kept.
        train_ratio: fraction of each id's images sent to the training set.
        encoding: text encoding used for all three files.

    Returns:
        A ``(n_train, n_val)`` tuple with the number of lines written to
        each output file.
    """
    with open(path, 'r', encoding=encoding) as f:
        lines = f.readlines()

    ids = [_image_id(line) for line in lines]

    # One pass over the data instead of list.count() per candidate label.
    totals = Counter(i for i in ids if i is not None)
    # Number of images of each kept id that go to the training set.
    train_quota = {
        i: int(total * train_ratio)
        for i, total in totals.items()
        if total >= min_images
    }

    seen = Counter()  # images of each id visited so far
    n_train = n_val = 0
    with open(save_train_path, 'w', encoding=encoding) as f_train, \
            open(save_val_path, 'w', encoding=encoding) as f_val:
        for line, image_id in zip(lines, ids):
            if image_id not in train_quota:
                continue
            if not line.endswith('\n'):
                # Only the last input line can lack a newline; normalize it.
                line += '\n'
            seen[image_id] += 1
            if seen[image_id] <= train_quota[image_id]:
                f_train.write(line)
                n_train += 1
            else:
                f_val.write(line)
                n_val += 1
    return n_train, n_val


if __name__ == '__main__':
    if not (path and save_train_path and save_val_path):
        raise SystemExit(
            'Set path, save_train_path and save_val_path before running.')
    train_total, val_total = split_train_val(
        path, save_train_path, save_val_path)
    print('train: %d, val: %d' % (train_total, val_total))