通过链接解码Json

通过链接解码Json

问题描述:

我需要的 JSON 中包含土耳其语字符。我使用的编程语言是 Python 3.5。为了从 URL 读取 JSON,我尝试了两种不同的方法,并得到了不同的错误。首先,我用读取 URL 的方法,代码如下:

import pprint 
from urllib.request import urlopen 
from bs4 import BeautifulSoup 

# First attempt: download the response and let BeautifulSoup render it.
url = "http://finanspano.mynet.com/index/index/?config[service]=finanspano&config[moderation]=1&config[item_alias]=6784eb9d038057a0821a7c905fd5f263&config[item_category]=Ym9yc2E=&config[item_title]=QUtCTks=&config[item_url]=aHR0cDovL2ZpbmFucy5teW5ldC5jb20vYm9yc2EvaGlzc2VsZXIvYWtibmstYWtiYW5rLw==&config[profile]=0&config[share_email]=1&config[share_fb]=1&config[share_tw]=1&config[profile_pattern]=Iw==&config[pagination]=1&config[pagination_pattern]=aHR0cDovL2ZpbmFuc3Bhbm8ubXluZXQuY29tL2NsaWVudC5waHA/cGFnZT17UEFHRX0=&config[comment_per_page]=5&config[page]=2&config[reply_count]=2&config[title]=yorumlar&config[hash]=5a8cadfa04b533f95ae83f0b9e530091&data[orderBy]=c.created&data[ordering]=desc&orderChanged=1"

# Read the whole response body as raw bytes.
response = urlopen(url)
raw_body = response.read()

# Decode the bytes as UTF-8, then parse the resulting text with the lxml parser.
markup = raw_body.decode('utf-8')
soup = BeautifulSoup(markup, "lxml")

print(soup)

但是我得到的输出中,字母没有被解码,例如 \u00f6、\u0131。输出如下:

<html><body><p>mynetComment.render({"config": 

{"service":"finanspano","moderation":"1","item_alias":"6784eb9d038057a0821a7c905fd5f263","item_category":"Ym9yc2E=","item_title":"QUtCTks=","item_url":"aHR0cDovL2ZpbmFucy5teW5ldC5jb20vYm9yc2EvaGlzc2VsZXIvYWtibmstYWtiYW5rLw==","profile":"0","share_email":"1","share_fb":"1","share_tw":"1","profile_pattern":"Iw==","pagination":"1","pagination_pattern":"aHR0cDovL2ZpbmFuc3Bhbm8ubXluZXQuY29tL2NsaWVudC5waHA\/cGFnZT17UEFHRX0=","comment_per_page":"5","page":"2","reply_count":"2","title":"yorumlar","hash":"5a8cadfa04b533f95ae83f0b9e530091"},"data":{"mynetUsername":null,"ordering":"desc","orderBy":"c.created","items":[{"id":"4037034","parent_id":"0","child":"0","item_id":"448","comment":"para\u015f\u00fctlerinizi tak\u0131n her ihtimale kar\u015f\u0131","can_reply":"1","share":"1.0.0","status":"1","created":"2017-06-30 11:45:36","user":"sekmentx_2014","clike":"2","cdislike":"0","ip":"1372981766","clikeTotal":"2","ctotal":"2","timeDiff":"2541843","like":"+2","timePast":"4 hafta \u00f6nce"},{"id":"4034275","parent_id":"0","child":"0","item_id":"448","comment":"a\u015fa\u011f\u0131lardan almas\u0131n\u0131 bilene yukar\u0131dan satmas\u0131n\u0131 bilene g\u00fczel ortamlar olu\u015fuyor","can_reply":"1","share":"1.0.0","status":"1","created":"2017-06-29 15:45:37","user":"sekmentx_2014","clike":"1","cdislike":"0","ip":"1372981766","clikeTotal":"1","ctotal":"1","timeDiff":"2613842","like":"+1","timePast":"1 ay \u00f6nce"},{"id":"4033970","parent_id":"0","child":"0","item_id":"448","comment":"kar cebe yak\u0131\u015f\u0131r ak\u0131ll\u0131 olanlara","can_reply":"1","share":"1.0.0","status":"1","created":"2017-06-29 14:58:55","user":"sekmentx_2014","clike":"1","cdislike":"0","ip":"1372981766","clikeTotal":"1","ctotal":"1","timeDiff":"2616644","like":"+1","timePast":"1 ay \u00f6nce"},{"id":"4032505","parent_id":"0","child":"0","item_id":"448","comment":"en g\u00fczeli satmak nazlana nazlana \u00e7\u0131k\u0131yor ne dersiniz i\u015flem hacimleri iyice 
d\u00fc\u015ft\u00fc","can_reply":"1","share":"1.0.0","status":"1","created":"2017-06-29 11:04:45","user":"erdal_1972_pknez","clike":"1","cdislike":"0","ip":"1372981766","clikeTotal":"1","ctotal":"1","timeDiff":"2630694","like":"+1","timePast":"1 ay \u00f6nce"},{"id":"4023515","parent_id":"0","child":"0","item_id":"448","comment":"Akbank \u00e7\u00f6ken sistemi ile iyi bir zarar edecek bug\u00fcn \u00f6yle g\u00f6r\u00fcn\u00fcyor. yaz\u0131klar olsun bu devirde bilgi i\u015flem sistemin \u00e7\u00f6k\u00fcyor yahu.","can_reply":"1","share":"1.0.0","status":"1","created":"2017-06-22 15:36:53","user":"ekin_yildirim_2015","clike":"1","cdislike":"0","ip":"3578451480","clikeTotal":"1","ctotal":"1","timeDiff":"3219166","like":"+1","timePast":"1 ay \u00f6nce"}],"total":"908","totalPage":182}});</p></body></html> 

其次,我使用了下面的方法:

import urllib.request, json 

# Second attempt: request the same endpoint (with different item parameters)
# and pass the UTF-8-decoded response text straight to json.loads().
url = "http://finanspano.mynet.com/index/index/?config[service]=finanspano&config[moderation]=1&config[item_alias]=f89e64e27edc887b8ed3314fe8562eb2&config[item_category]=Ym9yc2E=&config[item_title]=R0FSQU4=&config[item_url]=aHR0cDovL2ZpbmFucy5teW5ldC5jb20vYm9yc2EvaGlzc2VsZXIvZ2FyYW4tZ2FyYW50aS1iYW5rYXNpLw==&config[profile]=0&config[share_email]=1&config[share_fb]=1&config[share_tw]=1&config[profile_pattern]=Iw==&config[pagination]=1&config[pagination_pattern]=aHR0cDovL2ZpbmFuc3Bhbm8ubXluZXQuY29tL2NsaWVudC5waHA/cGFnZT17UEFHRX0=&config[comment_per_page]=5&config[page]=2&config[reply_count]=2&config[title]=yorumlar&config[hash]=e80cdd0e7a3dd9f4bbc393517386781c&data[orderBy]=c.created&data[ordering]=desc&orderChanged=1" 
# This is the statement that raises the JSONDecodeError quoted in the traceback
# ("Expecting value: line 1 column 1 (char 0)").
# NOTE(review): presumably the body is not bare JSON but wrapped in a
# mynetComment.render(...) call, as in the output of the first attempt -- confirm.
data = json.loads(urllib.request.urlopen(url).read().decode('utf-8')) 
print(data) 

我收到以下错误:

Traceback (most recent call last): 
    File "G:/Internship/quantsol-text/web-crawler/mynet_new/date_gaining.py", line 17, in <module> 
    data = json.loads(urllib.request.urlopen(url).read().decode('utf-8')) 
    File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\json\__init__.py", line 319, in loads 
    return _default_decoder.decode(s) 
    File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\json\decoder.py", line 339, in decode 
    obj, end = self.raw_decode(s, idx=_w(s, 0).end()) 
    File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\json\decoder.py", line 357, in raw_decode 
    raise JSONDecodeError("Expecting value", s, err.value) from None 
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) 
+0

你收到的数据并不是应有的 JSON 格式。请先把数据打印出来看看。 –

+0

我已经打印了数据,输出现已包含在问题中 –

由于该网页返回的输出并不是所需的纯 JSON,我们需要先从文本中提取出 JSON,然后再进行转换。
下面的答案是专门针对这个问题的:使用 re 从文本中提取 JSON 形式的内容。代码如下:

import urllib.request, json, re 

# The response is not bare JSON (json.loads() on the raw text fails at char 0),
# so the JSON object has to be cut out of the surrounding text before parsing.
url = "http://finanspano.mynet.com/index/index/?config[service]=finanspano&config[moderation]=1&config[item_alias]=f89e64e27edc887b8ed3314fe8562eb2&config[item_category]=Ym9yc2E=&config[item_title]=R0FSQU4=&config[item_url]=aHR0cDovL2ZpbmFucy5teW5ldC5jb20vYm9yc2EvaGlzc2VsZXIvZ2FyYW4tZ2FyYW50aS1iYW5rYXNpLw==&config[profile]=0&config[share_email]=1&config[share_fb]=1&config[share_tw]=1&config[profile_pattern]=Iw==&config[pagination]=1&config[pagination_pattern]=aHR0cDovL2ZpbmFuc3Bhbm8ubXluZXQuY29tL2NsaWVudC5waHA/cGFnZT17UEFHRX0=&config[comment_per_page]=5&config[page]=2&config[reply_count]=2&config[title]=yorumlar&config[hash]=e80cdd0e7a3dd9f4bbc393517386781c&data[orderBy]=c.created&data[ordering]=desc&orderChanged=1"

# Use the response as a context manager so the HTTP connection is closed as
# soon as the body has been read.
with urllib.request.urlopen(url) as response:
    data = response.read().decode('utf-8')

# Greedy match from the first "{" to the last "}".  The pattern is a raw string
# with escaped braces so they are always read as literal characters, and
# re.DOTALL lets "." cross line breaks, so a payload spread over several lines
# is still captured as one piece instead of one fragment per line.
json_candidates = re.findall(r'\{.*\}', data, re.DOTALL)
if not json_candidates:
    # Fail with a clear message instead of an IndexError on an empty list.
    raise ValueError("no JSON object found in the response from " + url)

# re.findall() returns a list; the greedy pattern yields a single element.
json_type_string = json_candidates[0]

# json.loads() converts the \uXXXX escapes into the actual characters.
json_data = json.loads(json_type_string)
print(json_data)

这个正则表达式基本上就是提取第一个左花括号 { 与最后一个右花括号 } 之间的内容。

+0

第6行返回以下错误:TypeError:JSON对象必须是str,而不是'list' –

+0

@NihadAzimli 我的错!我刚刚已经改正了:因为 `re.findall` 返回的是一个列表,所以只需取出其中唯一的元素即可。 –