Декодирование JSON, полученного по URL
Нужный мне JSON содержит турецкие символы. Я использую Python 3.5. Чтобы прочитать JSON по URL, я попробовал два разных способа, и в каждом случае столкнулся со своей проблемой. Сначала я прочитал ответ методом read() и разобрал его с помощью BeautifulSoup:
import pprint
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Comment-feed endpoint; all query parameters are already encoded in the string.
url = "http://finanspano.mynet.com/index/index/?config[service]=finanspano&config[moderation]=1&config[item_alias]=6784eb9d038057a0821a7c905fd5f263&config[item_category]=Ym9yc2E=&config[item_title]=QUtCTks=&config[item_url]=aHR0cDovL2ZpbmFucy5teW5ldC5jb20vYm9yc2EvaGlzc2VsZXIvYWtibmstYWtiYW5rLw==&config[profile]=0&config[share_email]=1&config[share_fb]=1&config[share_tw]=1&config[profile_pattern]=Iw==&config[pagination]=1&config[pagination_pattern]=aHR0cDovL2ZpbmFuc3Bhbm8ubXluZXQuY29tL2NsaWVudC5waHA/cGFnZT17UEFHRX0=&config[comment_per_page]=5&config[page]=2&config[reply_count]=2&config[title]=yorumlar&config[hash]=5a8cadfa04b533f95ae83f0b9e530091&data[orderBy]=c.created&data[ordering]=desc&orderChanged=1"

# Download the raw response body and decode the bytes as UTF-8 text.
response = urlopen(url)
raw_bytes = response.read()
page_text = raw_bytes.decode('utf-8')

# Parse the text as HTML with the lxml backend and print the resulting tree.
# This only handles markup: JSON "\uXXXX" escape sequences inside the text
# are not interpreted by the HTML parser and show up unchanged in the output.
soup = BeautifulSoup(page_text, "lxml")
print(soup)
Однако в выводе буквы остаются недекодированными: например, вместо турецких символов ö и ı выводятся последовательности \u00f6 и \u0131. Вывод следующий:
<html><body><p>mynetComment.render({"config":
{"service":"finanspano","moderation":"1","item_alias":"6784eb9d038057a0821a7c905fd5f263","item_category":"Ym9yc2E=","item_title":"QUtCTks=","item_url":"aHR0cDovL2ZpbmFucy5teW5ldC5jb20vYm9yc2EvaGlzc2VsZXIvYWtibmstYWtiYW5rLw==","profile":"0","share_email":"1","share_fb":"1","share_tw":"1","profile_pattern":"Iw==","pagination":"1","pagination_pattern":"aHR0cDovL2ZpbmFuc3Bhbm8ubXluZXQuY29tL2NsaWVudC5waHA\/cGFnZT17UEFHRX0=","comment_per_page":"5","page":"2","reply_count":"2","title":"yorumlar","hash":"5a8cadfa04b533f95ae83f0b9e530091"},"data":{"mynetUsername":null,"ordering":"desc","orderBy":"c.created","items":[{"id":"4037034","parent_id":"0","child":"0","item_id":"448","comment":"para\u015f\u00fctlerinizi tak\u0131n her ihtimale kar\u015f\u0131","can_reply":"1","share":"1.0.0","status":"1","created":"2017-06-30 11:45:36","user":"sekmentx_2014","clike":"2","cdislike":"0","ip":"1372981766","clikeTotal":"2","ctotal":"2","timeDiff":"2541843","like":"+2","timePast":"4 hafta \u00f6nce"},{"id":"4034275","parent_id":"0","child":"0","item_id":"448","comment":"a\u015fa\u011f\u0131lardan almas\u0131n\u0131 bilene yukar\u0131dan satmas\u0131n\u0131 bilene g\u00fczel ortamlar olu\u015fuyor","can_reply":"1","share":"1.0.0","status":"1","created":"2017-06-29 15:45:37","user":"sekmentx_2014","clike":"1","cdislike":"0","ip":"1372981766","clikeTotal":"1","ctotal":"1","timeDiff":"2613842","like":"+1","timePast":"1 ay \u00f6nce"},{"id":"4033970","parent_id":"0","child":"0","item_id":"448","comment":"kar cebe yak\u0131\u015f\u0131r ak\u0131ll\u0131 olanlara","can_reply":"1","share":"1.0.0","status":"1","created":"2017-06-29 14:58:55","user":"sekmentx_2014","clike":"1","cdislike":"0","ip":"1372981766","clikeTotal":"1","ctotal":"1","timeDiff":"2616644","like":"+1","timePast":"1 ay \u00f6nce"},{"id":"4032505","parent_id":"0","child":"0","item_id":"448","comment":"en g\u00fczeli satmak nazlana nazlana \u00e7\u0131k\u0131yor ne dersiniz i\u015flem hacimleri iyice 
d\u00fc\u015ft\u00fc","can_reply":"1","share":"1.0.0","status":"1","created":"2017-06-29 11:04:45","user":"erdal_1972_pknez","clike":"1","cdislike":"0","ip":"1372981766","clikeTotal":"1","ctotal":"1","timeDiff":"2630694","like":"+1","timePast":"1 ay \u00f6nce"},{"id":"4023515","parent_id":"0","child":"0","item_id":"448","comment":"Akbank \u00e7\u00f6ken sistemi ile iyi bir zarar edecek bug\u00fcn \u00f6yle g\u00f6r\u00fcn\u00fcyor. yaz\u0131klar olsun bu devirde bilgi i\u015flem sistemin \u00e7\u00f6k\u00fcyor yahu.","can_reply":"1","share":"1.0.0","status":"1","created":"2017-06-22 15:36:53","user":"ekin_yildirim_2015","clike":"1","cdislike":"0","ip":"3578451480","clikeTotal":"1","ctotal":"1","timeDiff":"3219166","like":"+1","timePast":"1 ay \u00f6nce"}],"total":"908","totalPage":182}});</p></body></html>
Затем я попробовал второй способ:
import urllib.request, json
# Comment-feed endpoint; all query parameters are already encoded in the string.
url = "http://finanspano.mynet.com/index/index/?config[service]=finanspano&config[moderation]=1&config[item_alias]=f89e64e27edc887b8ed3314fe8562eb2&config[item_category]=Ym9yc2E=&config[item_title]=R0FSQU4=&config[item_url]=aHR0cDovL2ZpbmFucy5teW5ldC5jb20vYm9yc2EvaGlzc2VsZXIvZ2FyYW4tZ2FyYW50aS1iYW5rYXNpLw==&config[profile]=0&config[share_email]=1&config[share_fb]=1&config[share_tw]=1&config[profile_pattern]=Iw==&config[pagination]=1&config[pagination_pattern]=aHR0cDovL2ZpbmFuc3Bhbm8ubXluZXQuY29tL2NsaWVudC5waHA/cGFnZT17UEFHRX0=&config[comment_per_page]=5&config[page]=2&config[reply_count]=2&config[title]=yorumlar&config[hash]=e80cdd0e7a3dd9f4bbc393517386781c&data[orderBy]=c.created&data[ordering]=desc&orderChanged=1"
# Fetch the response body, decode it as UTF-8 and pass the whole text to
# json.loads in a single expression.
# NOTE(review): json.loads requires the entire text to be one JSON document.
# If the endpoint wraps the payload in a JavaScript call (e.g.
# `mynetComment.render({...});`), parsing fails at the very first character
# with "Expecting value" -- confirm against the actual response body.
data = json.loads(urllib.request.urlopen(url).read().decode('utf-8'))
print(data)
Я получаю следующую ошибку:
Traceback (most recent call last):
File "G:/Internship/quantsol-text/web-crawler/mynet_new/date_gaining.py", line 17, in <module>
data = json.loads(urllib.request.urlopen(url).read().decode('utf-8'))
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\json\__init__.py", line 319, in loads
return _default_decoder.decode(s)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\json\decoder.py", line 339, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\json\decoder.py", line 357, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
1 ответ
Дело в том, что страница по этому адресу возвращает не чистый JSON, который вам нужен, а текст, в который JSON вложен.
Поэтому JSON нужно сначала извлечь из текста. Решение, подходящее именно для этой задачи, — с помощью модуля re вырезать из ответа фрагмент, похожий на JSON, и уже его передать в json.loads.
import urllib.request, json, re
# Comment-feed endpoint; all query parameters are already encoded in the string.
url = "http://finanspano.mynet.com/index/index/?config[service]=finanspano&config[moderation]=1&config[item_alias]=f89e64e27edc887b8ed3314fe8562eb2&config[item_category]=Ym9yc2E=&config[item_title]=R0FSQU4=&config[item_url]=aHR0cDovL2ZpbmFucy5teW5ldC5jb20vYm9yc2EvaGlzc2VsZXIvZ2FyYW4tZ2FyYW50aS1iYW5rYXNpLw==&config[profile]=0&config[share_email]=1&config[share_fb]=1&config[share_tw]=1&config[profile_pattern]=Iw==&config[pagination]=1&config[pagination_pattern]=aHR0cDovL2ZpbmFuc3Bhbm8ubXluZXQuY29tL2NsaWVudC5waHA/cGFnZT17UEFHRX0=&config[comment_per_page]=5&config[page]=2&config[reply_count]=2&config[title]=yorumlar&config[hash]=e80cdd0e7a3dd9f4bbc393517386781c&data[orderBy]=c.created&data[ordering]=desc&orderChanged=1"


def extract_json(text):
    """Return the JSON object embedded in *text*, parsed into Python objects.

    The text may wrap the object in other content, for example a JavaScript
    call such as ``callback({...});``. Everything from the first ``{`` to the
    last ``}`` is taken as the JSON document.

    Args:
        text: Decoded response text that contains one JSON object.

    Returns:
        The value produced by ``json.loads`` for the extracted object.

    Raises:
        ValueError: If *text* contains no ``{...}`` span, or if the span is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    # re.DOTALL lets "." cross line breaks, so a payload that spans several
    # lines is still captured as a whole; the greedy ".*" runs to the last "}".
    match = re.search(r'\{.*\}', text, re.DOTALL)
    if match is None:
        raise ValueError("no JSON object found in response text")
    return json.loads(match.group(0))


if __name__ == "__main__":
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode('utf-8')
    json_data = extract_json(data)
    print(json_data)
Регулярное выражение здесь, по сути, извлекает всё, что находится между первой открывающей фигурной скобкой { и последней закрывающей фигурной скобкой }, включая сами скобки.