Некоторые поля не имеют значения при преобразовании результата Google Cloud в json с помощью Protobuf
Когда я пытаюсь преобразовать вывод сообщений из API Google Cloud Natural Language в json с использованием protobuf, поля чувств и значений оказываются совсем без значений. Если я не использую его и просто распечатаю, у них будут значения. Как это можно исправить? Я попробовал это с обоими json_results = MessageToJson(result, preserving_proto_field_name=True)
а также json_results = MessageToJson(result)
и я не уверен, почему это явление происходит.
Вот пример результирующего файла:
$ cat 10.json
{
"entities": [
{
"name": "Professor",
"type": "PERSON",
"salience": 0.47092151641845703,
"mentions": [
{
"text": {
"content": "Professor",
"begin_offset": 47
},
"type": "COMMON",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "Did U of California Fire Tenured Riverside",
"type": "ORGANIZATION",
"salience": 0.2889040410518646,
"mentions": [
{
"text": {
"content": "Did U of California Fire Tenured Riverside",
"begin_offset": 4
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "Ted Cruz",
"type": "PERSON",
"metadata": {
"wikipedia_url": "https://en.wikipedia.org/wiki/Ted_Cruz",
"mid": "/m/07j6ty"
},
"salience": 0.1294257491827011,
"mentions": [
{
"text": {
"content": "Ted Cruz",
"begin_offset": 60
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "some_url",
"type": "OTHER",
"salience": 0.0676858201622963,
"mentions": [
{
"text": {
"content": "some_url",
"begin_offset": 92
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "Higher Ed",
"type": "OTHER",
"metadata": {
"wikipedia_url": "https://en.wikipedia.org/wiki/Higher_education",
"mid": "/m/03r55"
},
"salience": 0.043062858283519745,
"mentions": [
{
"text": {
"content": "Higher Ed",
"begin_offset": 73
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
}
],
"language": "en"
}
Вот код:
# copyright 2016 Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This application demonstrates how to perform basic operations with the
Google Cloud Natural Language API
For more information, the documentation at
https://cloud.google.com/natural-language/docs.
"""
import argparse
import sys
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
import six
import json
from google.protobuf.json_format import MessageToDict, MessageToJson
# [START def_sentiment_text]
def sentiment_text(text):
"""Detects sentiment in the text."""
client = language.LanguageServiceClient()
if isinstance(text, six.binary_type):
text = text.decode('utf-8')
# Instantiates a plain text document.
# [START migration_document_text]
# [START migration_analyze_sentiment]
document = types.Document(
content=text,
type=enums.Document.Type.PLAIN_TEXT)
# [END migration_document_text]
# Detects sentiment in the document. You can also analyze HTML with:
# document.type == enums.Document.Type.HTML
sentiment = client.analyze_sentiment(document).document_sentiment
print('Score: {}'.format(sentiment.score))
print('Magnitude: {}'.format(sentiment.magnitude))
# [END migration_analyze_sentiment]
# [END def_sentiment_text]
# [START def_sentiment_file]
def sentiment_file(gcs_uri):
"""Detects sentiment in the file located in Google Cloud Storage."""
client = language.LanguageServiceClient()
# Instantiates a plain text document.
# [START migration_document_gcs_uri]
document = types.Document(
gcs_content_uri=gcs_uri,
type=enums.Document.Type.PLAIN_TEXT)
# [END migration_document_gcs_uri]
# Detects sentiment in the document. You can also analyze HTML with:
# document.type == enums.Document.Type.HTML
sentiment = client.analyze_sentiment(document).document_sentiment
print('Score: {}'.format(sentiment.score))
print('Magnitude: {}'.format(sentiment.magnitude))
# [END def_sentiment_file]
# [START def_entities_text]
def entities_text(text):
"""Detects entities in the text."""
client = language.LanguageServiceClient()
if isinstance(text, six.binary_type):
text = text.decode('utf-8')
# Instantiates a plain text document.
# [START migration_analyze_entities]
document = types.Document(
content=text,
type=enums.Document.Type.PLAIN_TEXT)
# Detects entities in the document. You can also analyze HTML with:
# document.type == enums.Document.Type.HTML
entities = client.analyze_entities(document).entities
# entity types from enums.Entity.Type
entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')
for entity in entities:
print('=' * 20)
print(u'{:<16}: {}'.format('name', entity.name))
print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
print(u'{:<16}: {}'.format('metadata', entity.metadata))
print(u'{:<16}: {}'.format('salience', entity.salience))
print(u'{:<16}: {}'.format('wikipedia_url',
entity.metadata.get('wikipedia_url', '-')))
# [END migration_analyze_entities]
# [END def_entities_text]
# [START def_entities_file]
def entities_file(gcs_uri):
"""Detects entities in the file located in Google Cloud Storage."""
client = language.LanguageServiceClient()
# Instantiates a plain text document.
document = types.Document(
gcs_content_uri=gcs_uri,
type=enums.Document.Type.PLAIN_TEXT)
# Detects sentiment in the document. You can also analyze HTML with:
# document.type == enums.Document.Type.HTML
entities = client.analyze_entities(document).entities
# entity types from enums.Entity.Type
entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')
for entity in entities:
print('=' * 20)
print(u'{:<16}: {}'.format('name', entity.name))
print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
print(u'{:<16}: {}'.format('metadata', entity.metadata))
print(u'{:<16}: {}'.format('salience', entity.salience))
print(u'{:<16}: {}'.format('wikipedia_url',
entity.metadata.get('wikipedia_url', '-')))
# [END def_entities_file]
# [START def_syntax_text]
def syntax_text(text):
"""Detects syntax in the text."""
client = language.LanguageServiceClient()
if isinstance(text, six.binary_type):
text = text.decode('utf-8')
# Instantiates a plain text document.
# [START migration_analyze_syntax]
document = types.Document(
content=text,
type=enums.Document.Type.PLAIN_TEXT)
# Detects syntax in the document. You can also analyze HTML with:
# document.type == enums.Document.Type.HTML
tokens = client.analyze_syntax(document).tokens
# part-of-speech tags from enums.PartOfSpeech.Tag
pos_tag = ('UNKNOWN', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
'PRON', 'PRT', 'PUNCT', 'VERB', 'X', 'AFFIX')
for token in tokens:
print(u'{}: {}'.format(pos_tag[token.part_of_speech.tag],
token.text.content))
# [END migration_analyze_syntax]
# [END def_syntax_text]
# [START def_syntax_file]
def syntax_file(gcs_uri):
"""Detects syntax in the file located in Google Cloud Storage."""
client = language.LanguageServiceClient()
# Instantiates a plain text document.
document = types.Document(
gcs_content_uri=gcs_uri,
type=enums.Document.Type.PLAIN_TEXT)
# Detects syntax in the document. You can also analyze HTML with:
# document.type == enums.Document.Type.HTML
tokens = client.analyze_syntax(document).tokens
# part-of-speech tags from enums.PartOfSpeech.Tag
pos_tag = ('UNKNOWN', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
'PRON', 'PRT', 'PUNCT', 'VERB', 'X', 'AFFIX')
for token in tokens:
print(u'{}: {}'.format(pos_tag[token.part_of_speech.tag],
token.text.content))
# [END def_syntax_file]
# [START def_entity_sentiment_text]
def entity_sentiment_text(text, line_number):
"""Detects entity sentiment in the provided text."""
client = language.LanguageServiceClient()
if isinstance(text, six.binary_type):
text = text.decode('utf-8')
document = types.Document(
content=text.encode('utf-8'),
type=enums.Document.Type.PLAIN_TEXT)
# Detect and send native Python encoding to receive correct word offsets.
encoding = enums.EncodingType.UTF32
if sys.maxunicode == 65535:
encoding = enums.EncodingType.UTF16
result = client.analyze_entity_sentiment(document, encoding)
json_results = MessageToJson(result, preserving_proto_field_name=True)
print(type(result))
'''
for entity in result.entities:
print('Mentions: ')
print(u'Name: "{}"'.format(entity.name))
for mention in entity.mentions:
print(u' Begin Offset : {}'.format(mention.text.begin_offset))
print(u' Content : {}'.format(mention.text.content))
print(u' Magnitude : {}'.format(mention.sentiment.magnitude))
print(u' Sentiment : {}'.format(mention.sentiment.score))
print(u' Type : {}'.format(mention.type))
print(u'Salience: {}'.format(entity.salience))
print(u'Sentiment: {}\n'.format(entity.sentiment))
'''
print(result)
file_name = str(line_number)+".json"
with open(file_name, 'w') as outfile:
outfile.write(json_results)
#json.dump(result, outfile)
#for entity in result.entities:
# for mention in entity.mentions:
# [END def_entity_sentiment_text]
def entity_sentiment_file(gcs_uri):
"""Detects entity sentiment in a Google Cloud Storage file."""
client = language.LanguageServiceClient()
document = types.Document(
gcs_content_uri=gcs_uri,
type=enums.Document.Type.PLAIN_TEXT)
# Detect and send native Python encoding to receive correct word offsets.
encoding = enums.EncodingType.UTF32
if sys.maxunicode == 65535:
encoding = enums.EncodingType.UTF16
result = client.analyze_entity_sentiment(document, encoding)
for entity in result.entities:
print(u'Name: "{}"'.format(entity.name))
for mention in entity.mentions:
print(u' Begin Offset : {}'.format(mention.text.begin_offset))
print(u' Content : {}'.format(mention.text.content))
print(u' Magnitude : {}'.format(mention.sentiment.magnitude))
print(u' Sentiment : {}'.format(mention.sentiment.score))
print(u' Type : {}'.format(mention.type))
print(u'Salience: {}'.format(entity.salience))
print(u'Sentiment: {}\n'.format(entity.sentiment))
# [START def_classify_text]
def classify_text(text):
"""Classifies content categories of the provided text."""
client = language.LanguageServiceClient()
if isinstance(text, six.binary_type):
text = text.decode('utf-8')
document = types.Document(
content=text.encode('utf-8'),
type=enums.Document.Type.PLAIN_TEXT)
categories = client.classify_text(document).categories
for category in categories:
print(u'=' * 20)
print(u'{:<16}: {}'.format('name', category.name))
print(u'{:<16}: {}'.format('confidence', category.confidence))
# [END def_classify_text]
# [START def_classify_file]
def classify_file(gcs_uri):
"""Classifies content categories of the text in a Google Cloud Storage
file.
"""
client = language.LanguageServiceClient()
document = types.Document(
gcs_content_uri=gcs_uri,
type=enums.Document.Type.PLAIN_TEXT)
categories = client.classify_text(document).categories
for category in categories:
print(u'=' * 20)
print(u'{:<16}: {}'.format('name', category.name))
print(u'{:<16}: {}'.format('confidence', category.confidence))
# [END def_classify_file]
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
subparsers = parser.add_subparsers(dest='command')
classify_text_parser = subparsers.add_parser(
'classify-text', help=classify_text.__doc__)
classify_text_parser.add_argument('text')
classify_text_parser = subparsers.add_parser(
'classify-file', help=classify_file.__doc__)
classify_text_parser.add_argument('gcs_uri')
sentiment_entities_text_parser = subparsers.add_parser(
'sentiment-entities-text', help=entity_sentiment_text.__doc__)
sentiment_entities_text_parser.add_argument('text')
#added by mona
sentiment_entities_text_parser.add_argument('line_number')
sentiment_entities_file_parser = subparsers.add_parser(
'sentiment-entities-file', help=entity_sentiment_file.__doc__)
sentiment_entities_file_parser.add_argument('gcs_uri')
sentiment_text_parser = subparsers.add_parser(
'sentiment-text', help=sentiment_text.__doc__)
sentiment_text_parser.add_argument('text')
sentiment_file_parser = subparsers.add_parser(
'sentiment-file', help=sentiment_file.__doc__)
sentiment_file_parser.add_argument('gcs_uri')
entities_text_parser = subparsers.add_parser(
'entities-text', help=entities_text.__doc__)
entities_text_parser.add_argument('text')
entities_file_parser = subparsers.add_parser(
'entities-file', help=entities_file.__doc__)
entities_file_parser.add_argument('gcs_uri')
syntax_text_parser = subparsers.add_parser(
'syntax-text', help=syntax_text.__doc__)
syntax_text_parser.add_argument('text')
syntax_file_parser = subparsers.add_parser(
'syntax-file', help=syntax_file.__doc__)
syntax_file_parser.add_argument('gcs_uri')
args = parser.parse_args()
if args.command == 'sentiment-text':
sentiment_text(args.text)
elif args.command == 'sentiment-file':
sentiment_file(args.gcs_uri)
elif args.command == 'entities-text':
entities_text(args.text)
elif args.command == 'entities-file':
entities_file(args.gcs_uri)
elif args.command == 'syntax-text':
syntax_text(args.text)
elif args.command == 'syntax-file':
syntax_file(args.gcs_uri)
elif args.command == 'sentiment-entities-text':
entity_sentiment_text(args.text, args.line_number)
elif args.command == 'sentiment-entities-file':
entity_sentiment_file(args.gcs_uri)
elif args.command == 'classify-text':
classify_text(args.text)
elif args.command == 'classify-file':
classify_file(args.gcs_uri)
Вот скрипт, который я использую для запуска кода:
#!/bin/bash
n=1
while read -u 3 -r line; do
echo $n "${line::30}"
python entity_sentiment.py sentiment-entities-text "$line" $n
((n++))
done 3< 10tweets.txt
а потом
bash -x runjob.sh
также, 10tweets.txt это:
$ cat 10tweets.txt
Trump on the other hand goes all in on water boarding AND some. #GOPDebate
RT @wpjenna Donald Trump promises that he will not touch the 2nd amendment -- "unless we're going to make it stronger."
Trump 23%, Rubio 19%, Kasich & Bush 14%, Christie 10%, Cruz 9% #NHPrimary
@realDonaldTrump Thank you for saying you won't use vulger language anymore. Talk about Sanders & Clinton. Take Cruz as VP. Mexican votes!!!
RT @SurfPHX Mr. Trump @realDonaldTrump tweeted 25 minutes ago. You all do realize, that our future President hardly sleeps. He's a Fighter and a Worker!
go, Bernie #DemDebate
Sanders calls out Clinton on taking Foreign Policy advice from Warmonger Henry Kissinger some_url via @YouTube
Cruz, Rubio, and the Moral Bankruptcy of Progressive Identity Politics some_url via @NRO
RT @scarylawyerguy "Who does Bernie Sanders listen to on foreign policy." - A question Hillary had to raise b/c the media will not. #DemDebate
Why Did U of California Fire Tenured Riverside Professor? / Ted Cruz and Higher Ed -- ... - some_url
Если я просто напечатаю результаты, он покажет величину и настроение, как показано ниже:
$ cat 10.json
Mentions:
Name: "Professor"
Begin Offset : 47
Content : Professor
Magnitude : 0.0
Sentiment : 0.0
Type : 2
Salience: 0.47092151641845703
Sentiment:
Mentions:
Name: "Did U of California Fire Tenured Riverside"
Begin Offset : 4
Content : Did U of California Fire Tenured Riverside
Magnitude : 0.0
Sentiment : 0.0
Type : 1
Salience: 0.2889040410518646
Sentiment:
Mentions:
Name: "Ted Cruz"
Begin Offset : 60
Content : Ted Cruz
Magnitude : 0.0
Sentiment : 0.0
Type : 1
Salience: 0.1294257491827011
Sentiment:
Mentions:
Name: "some_url"
Begin Offset : 92
Content : some_url
Magnitude : 0.0
Sentiment : 0.0
Type : 1
Salience: 0.0676858201622963
Sentiment:
Mentions:
Name: "Higher Ed"
Begin Offset : 73
Content : Higher Ed
Magnitude : 0.0
Sentiment : 0.0
Type : 1
Salience: 0.043062858283519745
Sentiment:
По сути, поле sentiment пустое, поэтому нет способа извлечь из него оценку и величину, как я делал это ранее, используя print:
print(u' Magnitude : {}'.format(mention.sentiment.magnitude))
print(u' Sentiment : {}'.format(mention.sentiment.score))
2 ответа
Когда числовые значения равны нулю, они не получают поля в json. https://github.com/gogo/protobuf/issues/218 похоже, пока что нет исправления. Но вы можете проверить, не существуют ли эти поля, это означает, что их значения равны нулю. Вот пример, в котором некоторые из значений / величин равны нулю, следовательно, не существуют в файле json, а остальные существуют:
{
"entities": [
{
"name": "RT @scarylawyerguy",
"type": "OTHER",
"salience": 0.4150770902633667,
"mentions": [
{
"text": {
"content": "RT @scarylawyerguy"
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "foreign policy",
"type": "OTHER",
"salience": 0.19249163568019867,
"mentions": [
{
"text": {
"content": "foreign policy",
"begin_offset": 57
},
"type": "COMMON",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "Bernie Sanders",
"type": "PERSON",
"metadata": {
"wikipedia_url": "https://en.wikipedia.org/wiki/Bernie_Sanders",
"mid": "/m/01_gbv"
},
"salience": 0.13153041899204254,
"mentions": [
{
"text": {
"content": "Bernie Sanders",
"begin_offset": 29
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "question",
"type": "OTHER",
"salience": 0.08613643795251846,
"mentions": [
{
"text": {
"content": "question",
"begin_offset": 78
},
"type": "COMMON",
"sentiment": {
"magnitude": 0.10000000149011612,
"score": -0.10000000149011612
}
}
],
"sentiment": {
"magnitude": 0.10000000149011612,
"score": -0.10000000149011612
}
},
{
"name": "media",
"type": "OTHER",
"salience": 0.0647100880742073,
"mentions": [
{
"text": {
"content": "media",
"begin_offset": 116
},
"type": "COMMON",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "Hillary",
"type": "PERSON",
"metadata": {
"wikipedia_url": "https://en.wikipedia.org/wiki/Hillary_Clinton",
"mid": "/m/0d06m5"
},
"salience": 0.054773446172475815,
"mentions": [
{
"text": {
"content": "Hillary",
"begin_offset": 87
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "b/c",
"type": "OTHER",
"salience": 0.028641967102885246,
"mentions": [
{
"text": {
"content": "b/c",
"begin_offset": 108
},
"type": "COMMON",
"sentiment": {}
}
],
"sentiment": {}
},
{
"name": "DemDebate",
"type": "OTHER",
"salience": 0.026638930663466454,
"mentions": [
{
"text": {
"content": "DemDebate",
"begin_offset": 133
},
"type": "PROPER",
"sentiment": {}
}
],
"sentiment": {}
}
],
"language": "en"
}
и исходный результат для этого:
<class 'google.cloud.language_v1.types.AnalyzeEntitySentimentResponse'>
entities {
name: "RT @scarylawyerguy"
type: OTHER
salience: 0.4150770902633667
mentions {
text {
content: "RT @scarylawyerguy"
}
type: PROPER
sentiment {
}
}
sentiment {
}
}
entities {
name: "foreign policy"
type: OTHER
salience: 0.19249163568019867
mentions {
text {
content: "foreign policy"
begin_offset: 57
}
type: COMMON
sentiment {
}
}
sentiment {
}
}
entities {
name: "Bernie Sanders"
type: PERSON
metadata {
key: "mid"
value: "/m/01_gbv"
}
metadata {
key: "wikipedia_url"
value: "https://en.wikipedia.org/wiki/Bernie_Sanders"
}
salience: 0.13153041899204254
mentions {
text {
content: "Bernie Sanders"
begin_offset: 29
}
type: PROPER
sentiment {
}
}
sentiment {
}
}
entities {
name: "question"
type: OTHER
salience: 0.08613643795251846
mentions {
text {
content: "question"
begin_offset: 78
}
type: COMMON
sentiment {
magnitude: 0.10000000149011612
score: -0.10000000149011612
}
}
sentiment {
magnitude: 0.10000000149011612
score: -0.10000000149011612
}
}
entities {
name: "media"
type: OTHER
salience: 0.0647100880742073
mentions {
text {
content: "media"
begin_offset: 116
}
type: COMMON
sentiment {
}
}
sentiment {
}
}
entities {
name: "Hillary"
type: PERSON
metadata {
key: "mid"
value: "/m/0d06m5"
}
metadata {
key: "wikipedia_url"
value: "https://en.wikipedia.org/wiki/Hillary_Clinton"
}
salience: 0.054773446172475815
mentions {
text {
content: "Hillary"
begin_offset: 87
}
type: PROPER
sentiment {
}
}
sentiment {
}
}
entities {
name: "b/c"
type: OTHER
salience: 0.028641967102885246
mentions {
text {
content: "b/c"
begin_offset: 108
}
type: COMMON
sentiment {
}
}
sentiment {
}
}
entities {
name: "DemDebate"
type: OTHER
salience: 0.026638930663466454
mentions {
text {
content: "DemDebate"
begin_offset: 133
}
type: PROPER
sentiment {
}
}
sentiment {
}
}
language: "en"
Как и другие комментаторы, это ожидаемое поведение. Вы можете изменить его, если хотите, установив including_default_value_fields
параметр для True
, В частности, измените это:
json_results = MessageToJson(result, preserving_proto_field_name=True)
к этому:
json_results = MessageToJson(
message=result,
preserving_proto_field_name=True,
including_default_value_fields=True,
)