Извлечение данных из PDF с помощью Adobe PDF Extract API в Databricks
Я пытаюсь запустить пример кода для извлечения данных из PDF с помощью Python SDK для Adobe PDF Services API в среде Databricks. Кластер состоит только из одного узла-драйвера. Однако я столкнулся с проблемой при доступе к файлам конфигурации, загруженным в папку DBFS.
Подскажите, пожалуйста, как решить эту проблему.
Вот фрагмент кода
import logging
import os.path
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import \
ExtractRenditionsElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.table_structure_type import TableStructureType
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
#logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
# Build the service-account credentials from the JSON file referenced under
# /dbfs/FileStore (per the surrounding description, uploaded to DBFS).
credentials = Credentials.service_account_credentials_builder() \
    .from_file("/dbfs/FileStore/pdfservices_api_credentials.json") \
    .build()

# Create the execution context from the credentials and a fresh Extract PDF
# operation instance.
execution_context = ExecutionContext.create(credentials)
extract_pdf_operation = ExtractPDFOperation.create_new()

# Use the PDF stored under /dbfs/FileStore as the operation input.
source = FileRef.create_from_local_file("/dbfs/FileStore/form.pdf")
extract_pdf_operation.set_input(source)

# Build ExtractPDF options and set them into the operation
extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
    .with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES]) \
    .with_element_to_extract_renditions(ExtractRenditionsElementType.TABLES) \
    .with_table_structure_format(TableStructureType.CSV) \
    .build()
extract_pdf_operation.set_options(extract_pdf_options)

# Execute the operation.
result: FileRef = extract_pdf_operation.execute(execution_context)

# `base_path` was referenced without being defined in this snippet, which
# would raise NameError as soon as the operation succeeded. Define it next to
# the input files used above.
# NOTE(review): adjust if a different output location is intended.
base_path = "/dbfs/FileStore"
result.save_as(base_path + "/output/ExtractTextInfoFromPDF.zip")
Вот подробности ошибки:
INFO:adobe.pdfservices.operation.pdfops.extract_pdf_operation:All validations successfully done. Beginning ExtractPDF operation execution
INFO:py4j.java_gateway:Received command c on object id p0
INFO:py4j.java_gateway:Received command c on object id p0
INFO:py4j.java_gateway:Received command c on object id p0
INFO:py4j.java_gateway:Received command c on object id p0
SdkException: description =Exception in fetching access token, requestTrackingId=(<class 'AttributeError'>, AttributeError("'str' object has no attribute 'get'"), <traceback object at 0x7f7572a3fd00>)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/local_disk0/.ephemeral_nfs/envs/pythonEnv-150d96ca-003d-4671-a6d9-ab8e566616d1/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/auth/jwt_authenticator.py in refresh_token(self)
62 data=access_token_request_payload, headers={})
---> 63 response = http_client.process_request(http_request=http_request, success_status_codes=[HTTPStatus.OK],
64 error_response_handler=self.handle_ims_failure)
/local_disk0/.ephemeral_nfs/envs/pythonEnv-150d96ca-003d-4671-a6d9-ab8e566616d1/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py in process_request(http_request, success_status_codes, error_response_handler)
37 response = _execute_request(http_request)
---> 38 if _handle_response_and_retry(response, success_status_codes,
39 error_response_handler, not http_request.authenticator, http_request.request_key) and http_request.retryable:
/local_disk0/.ephemeral_nfs/envs/pythonEnv-150d96ca-003d-4671-a6d9-ab8e566616d1/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py in _handle_response_and_retry(response, success_status_codes, error_response_handler, is_ims_api, request_key)
94 "Failure response code {error_code} encountered from backend".format(error_code=response.status_code))
---> 95 should_retry = ResponseUtil.handle_api_failures(response, request_key, is_ims_api)
96 return should_retry if should_retry else error_response_handler(response)