Python只保存一行数据
问题描述:
# NOTE(review): the indentation of this snippet was lost when it was pasted,
# so the nesting of the statements cannot be read off the text. The comments
# below describe the statements in order and hedge wherever the nesting matters.
#
# Purpose: fetch start_url, select the post nodes from the returned HTML, copy
# the fields of each node into self.data, and append self.data to 'a.txt' as
# one JSON line.
def get_user_data(self,start_url):
# Download the page with the instance's session, headers and cookies and keep
# the response's .content (presumably a requests session -- TODO confirm).
html = self.session.get(url=start_url,headers=self.headers,cookies=self.cookies).content
# Parse the payload with an HTML parser configured for UTF-8.
selector = etree.fromstring(html,etree.HTMLParser(encoding='utf-8'))
# Every <div> whose class contains "c" and whose id contains "M".
all_user = selector.xpath('//div[contains(@class,"c") and contains(@id,"M")]')
for i in all_user:
# Each [0] below raises IndexError when the XPath matches nothing.
user_id = i.xpath('./div[1]/a[@class="nk"]/@href')[0]
content = i.xpath('./div[1]/span[1]')[0]
# string(.) yields the text of the span together with its descendants.
contents = content.xpath('string(.)')
times = i.xpath('./div/span[@class="ct"]/text()')[0]
# Pick the XPaths for imgages / praise_num / transmit_num according to which
# child <div>s exist: ./div[3] is probed first, then ./div[2], else ./div[1].
if len(i.xpath('./div[3]')):
imgages = i.xpath('./div[2]/a/img/@src')
praise_num = i.xpath('./div[3]/a[2]/text()')
transmit_num = i.xpath('./div[3]/a[3]/text()')
elif len(i.xpath('./div[2]')):
imgages = i.xpath('./div[2]/a/img/@src')
praise_num = i.xpath('./div[2]/a[3]/text()')
transmit_num = i.xpath('./div[2]/a[4]/text()')
else :
# No image lookup in this branch; note that imgages is a str here but an
# XPath result in the two branches above.
imgages = ''
praise_num = i.xpath('./div[1]/a[2]/text()')
transmit_num = i.xpath('./div[1]/a[3]/text()')
try:
# encode().decode('utf-8') round-trips the str and yields the same text.
if re.search('from',times.encode().decode('utf-8')):
# Split on whitespace into at most three parts; fewer than three parts raises
# ValueError, which the except below catches.
month_day, time, device = times.split(maxsplit=2)
self.data['mobile_phone'] = device
else:
# NOTE(review): this path assigns only time and device. month_day is not
# assigned here, yet 'create_time' below reads it -- if that line runs on this
# path it raises NameError on the first node or reuses the value left over
# from an earlier node. 'mobile_phone' is also not updated on this path.
time,device = times.split(maxsplit=1)
self.data['month_day'] = ''
self.data['create_time'] = month_day + ' ' + time
except Exception as e:
# Any exception raised inside the try is printed and processing continues.
print('failure:',e)
# Timestamp of the crawl, formatted from datetime.now().
self.data['crawl_time'] = datetime.strftime(datetime.now(),'%Y-%m-%d %H:%M:%S')
# NOTE(review): self.data is a single object shared by every node, so each
# pass over these assignments overwrites the values stored for the previous node.
self.data['user_id'] = user_id
# Remove zero-width spaces (U+200B); the encode/decode pair is a no-op round trip.
self.data['contents'] = contents.encode().decode('utf-8').replace('\u200b','')
self.data['imgages'] = imgages
self.data['praise_num'] = praise_num
self.data['transmit_num'] = transmit_num
# Append self.data to a.txt as one JSON line.
# NOTE(review): nesting unknown -- if this block sits after the for loop, it
# runs once per call and only the last node's values reach the file.
with open('a.txt','a',encoding='utf-8') as f:
f.write(json.dumps(self.data)+'\n')
我想抓取每一页上的全部数据并保存到 data 中。但是我的写法有问题:每一页只有一条数据被保存到了 'a.txt'。我应该怎么写,才能把每一页的数据都正确地保存到 'a.txt' 中?
Python只保存一行数据
答
写入操作位于 for 循环之外,这就是为什么只有最后一次迭代的数据被保存到文件中:
# In the question these two lines come after the `for` loop has finished, so
# they run once and write only the values left in self.data by the last
# iteration. Move them into the loop body to write one line per post.
with open('a.txt','a',encoding='utf-8') as f:
    f.write(json.dumps(self.data)+'\n')
答
你在循环的每次迭代中都覆盖了 self.data 中的各个值。
相反,self.data 应该是一个列表。你应该在每次迭代中创建一个新字典,并在该次迭代的末尾将其追加到 self.data 中。
# Sketch: collect one dict per post in a list instead of overwriting a single
# shared dict. The `...` lines stand for the extraction code from the question.
self.data = []
for i in all_user:
    # Fresh dict on every iteration so earlier posts are not overwritten.
    values = {}
    ...
    values['crawl_time'] = ...
    values['user_id'] = ...
    ...
    # Append once all fields of this post have been filled in.
    self.data.append(values)