Чтение раздела Parquet, записанного через s3a, из S3 в pandas DataFrame (Python)
Я пытаюсь прочитать конкретный раздел (partition) Parquet из S3 в Python, но при каждом подходе получаю разные ошибки. Может кто-нибудь подсказать, что здесь не так?
Сначала я проверил, существует ли файл, — да, он есть. Также я выполнил aws s3api list-objects --bucket my-bucket,
чтобы убедиться, что раздел записан: команда перечисляет все объекты, так что проблем с доступом нет.
Может быть, дело в том, что я записал файлы через s3a?
Есть ли другой способ читать файлы, записанные через s3a?
Вот как разделы Parquet записываются скриптом PySpark:
# Repartition by "date", then append Parquet output partitioned by "date" under the s3a:// prefix.
data_processed.repartition("date").write.mode("append").partitionBy("date").parquet("s3a://my-bucket/data-source/data-report/")
Файлы записываются в таком виде:
my-bucket/data-source/data-report/date=2018-07-20/part-***.snappy.parquet
my-bucket/data-source/data-report/date=2018-07-21/part-***.snappy.parquet
my-bucket/data-source/data-report/date=2018-07-22/part-***.snappy.parquet
my-bucket/data-source/data-report/date=2018-07-23/part-***.snappy.parquet
Я пытаюсь прочитать только один раздел.
Сначала попробовал так:
def dataread_s3_parquet(filepath):
    """Open ``filepath`` through s3fs and read it with pandas' fastparquet engine.

    Args:
        filepath: Path handed to ``S3FileSystem.open`` (the caller passes
            a "bucket/key" style string).

    Returns:
        Whatever ``pd.read_parquet`` returns for the opened file object.
    """
    # anon=False asks s3fs for credentialed (non-anonymous) access.
    fs = s3fs.S3FileSystem(anon=False)
    with fs.open(filepath) as f:
        # Note that an open file object, not a path string, is passed to pandas here.
        data = pd.read_parquet(f,engine ='fastparquet')
    return data
# NOTE(review): 'part-***' looks like a placeholder for the real file name;
# if it is passed literally it will not name an existing object — confirm.
parqfilepath = 'my-bucket/data-source/data-report/date=2018-07-20/part-***.snappy.parquet'
test_par = dataread_s3_parquet(parqfilepath)
Ошибка -
305 abs_prefix = ''
306 if path and path[0]:
--> 307 if path[0][0] == '/':
308 abs_prefix = '/'
309 path = list(path)
TypeError: 'S3File' object does not support indexing
Затем попробовал так:
import s3fs
import pyarrow.parquet as pq
from pyarrow.filesystem import S3FSWrapper
# Default-constructed s3fs filesystem, handed to pyarrow below.
fs = s3fs.S3FileSystem()
bucket = "my-bucket"
path = "data-source/data-report/date=2018-07-20/part-***.snappy.parquet"
# Build an s3:// URL from bucket and key and wrap it in a ParquetDataset.
p_dataset = pq.ParquetDataset("s3://{0}/{1}".format(bucket, path),filesystem=fs)
# Read the dataset and convert the result to pandas.
df = p_dataset.read().to_pandas()
Ошибка -
/lib/python2.7/site-packages/pyarrow/parquet.pyc in validate_schemas(self)
752 self.schema = open_file(self.common_metadata_path).schema
753 else:
--> 754 self.schema = self.pieces[0].get_metadata(open_file).schema
755 elif self.schema is None:
756 self.schema = self.metadata.schema
IndexError: list index out of range
Затем попробовал так:
import boto3
import io
import pandas as pd
# In-memory buffer that receives the downloaded object bytes.
buffer = io.BytesIO()
s3 = boto3.resource('s3')
# NOTE(review): the name `object` shadows the Python builtin of the same name.
object = s3.Object('my-bucket','data-source/data-report/date=2018-07-20/part-****.snappy.parquet')
object.download_fileobj(buffer)
# NOTE(review): the buffer is not rewound (no seek(0)) before being read — confirm the reader handles that.
df = pd.read_parquet(buffer)
Ошибка -
lib/python2.7/site-packages/s3transfer/futures.pyc in result(self)
231 # final result.
232 if self._exception:
--> 233 raise self._exception
234 return self._result
235
ClientError: An error occurred (403) when calling the HeadObject operation: Forbidden
Затем попробовал так:
import boto3
import pandas as pd
client = boto3.client('s3')
# Fetch the object by bucket and key; the key is written with a literal 'part-***' segment.
obj = client.get_object(Bucket = 'my-bucket',Key = 'data-source/data-report/date=2018-07-20/part-***.snappy.parquet')
# The 'Body' entry of the response is passed straight to pandas.
grid_sizes = pd.read_parquet(obj['Body'])
Ошибка -
---> 10 obj = client.get_object(Bucket = 'my-bucket',Key = 'data-source/data-report/date=2018-07-20/part-***.snappy.parquet')
11 grid_sizes = pd.read_parquet(obj['Body'])
/Users/myname/anaconda/lib/python2.7/site-packages/botocore/client.pyc in _api_call(self, *args, **kwargs)
251 "%s() only accepts keyword arguments." % py_operation_name)
252 # The "self" in this scope is referring to the BaseClient.
--> 253 return self._make_api_call(operation_name, kwargs)
254
255 _api_call.__name__ = str(py_operation_name)
/Users/myname/anaconda/lib/python2.7/site-packages/botocore/client.pyc in _make_api_call(self, operation_name, api_params)
555 error_code = parsed_response.get("Error", {}).get("Code")
556 error_class = self.exceptions.from_code(error_code)
--> 557 raise error_class(parsed_response, operation_name)
558 else:
559 return parsed_response
ClientError: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied
Ещё попробовал так:
import s3fs
import fastparquet as fp
# Two separate S3FileSystem instances are created: `s3` supplies the opener, `fs` runs the glob.
s3 = s3fs.S3FileSystem()
fs = s3fs.core.S3FileSystem()
s3_path = "my-bucket/data-source/data-report/date=2018-07-20/part-**.snappy.parquet"
# Expand the glob pattern; the result is passed to fastparquet as the file list.
all_paths_from_s3 = fs.glob(path=s3_path)
myopen = s3.open
#use s3fs as the filesystem
fp_obj = fp.ParquetFile(all_paths_from_s3,open_with=myopen)
#convert to pandas dataframe
df = fp_obj.to_pandas()
Ошибка -
---------------------------------------------------------------------------
ClientError Traceback (most recent call last)
<ipython-input-31-9c8c3dd2667b> in <module>()
1 myopen = s3.open
2 #use s3fs as the filesystem
----> 3 fp_obj = fp.ParquetFile(all_paths_from_s3,open_with=myopen)
4 #convert to pandas dataframe
5 df = fp_obj.to_pandas()
/Users/myname/anaconda/lib/python2.7/site-packages/fastparquet/api.pyc in __init__(self, fn, verify, open_with, root, sep)
81 if isinstance(fn, (tuple, list)):
82 basepath, fmd = metadata_from_many(fn, verify_schema=verify,
---> 83 open_with=open_with, root=root)
84 if basepath:
85 self.fn = join_path(basepath, '_metadata') # effective file
/Users/myname/anaconda/lib/python2.7/site-packages/fastparquet/util.pyc in metadata_from_many(file_list, verify_schema, open_with, root)
117 file_list = [pf.fn for pf in pfs]
118 elif all(not isinstance(pf, api.ParquetFile) for pf in file_list):
--> 119 pfs = [api.ParquetFile(fn, open_with=open_with) for fn in file_list]
120 else:
121 raise ValueError("Merge requires all PaquetFile instances or none")
/Users/myname/anaconda/lib/python2.7/site-packages/fastparquet/api.pyc in __init__(self, fn, verify, open_with, root, sep)
98 self.fn = join_path(fn)
99 with open_with(fn, 'rb') as f:
--> 100 self._parse_header(f, verify)
101 self.open = open_with
102 self.sep = sep
/Users/myname/anaconda/lib/python2.7/site-packages/fastparquet/api.pyc in _parse_header(self, f, verify)
108 assert f.read(4) == b'PAR1'
109 f.seek(-8, 2)
--> 110 head_size = struct.unpack('<i', f.read(4))[0]
111 if verify:
112 assert f.read() == b'PAR1'
/Users/myname/anaconda/lib/python2.7/site-packages/s3fs/core.pyc in read(self, length)
1014 if self.closed:
1015 raise ValueError('I/O operation on closed file.')
-> 1016 self._fetch(self.loc, self.loc + length)
1017 out = self.cache[self.loc - self.start:
1018 self.loc - self.start + length]
/Users/myname/anaconda/lib/python2.7/site-packages/s3fs/core.pyc in _fetch(self, start, end)
978 self.end = end + self.blocksize
979 self.cache = _fetch_range(self.s3.s3, self.bucket, self.key,
--> 980 start, self.end, req_kw=self.s3.req_kw)
981 if start < self.start:
982 if not self.fill_cache and end + self.blocksize < self.start:
/Users/myname/anaconda/lib/python2.7/site-packages/s3fs/core.pyc in _fetch_range(client, bucket, key, start, end, max_attempts, req_kw)
1169 resp = client.get_object(Bucket=bucket, Key=key,
1170 Range='bytes=%i-%i' % (start, end - 1),
-> 1171 **req_kw)
1172 return resp['Body'].read()
1173 except S3_RETRYABLE_ERRORS as e:
/Users/myname/anaconda/lib/python2.7/site-packages/botocore/client.pyc in _api_call(self, *args, **kwargs)
251 "%s() only accepts keyword arguments." % py_operation_name)
252 # The "self" in this scope is referring to the BaseClient.
--> 253 return self._make_api_call(operation_name, kwargs)
254
255 _api_call.__name__ = str(py_operation_name)
/Users/myname/anaconda/lib/python2.7/site-packages/botocore/client.pyc in _make_api_call(self, operation_name, api_params)
555 error_code = parsed_response.get("Error", {}).get("Code")
556 error_class = self.exceptions.from_code(error_code)
--> 557 raise error_class(parsed_response, operation_name)
558 else:
559 return parsed_response
ClientError: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied
ОБНОВЛЕНИЕ: После некоторого исследования я заметил, что у меня нет файлов метаданных Parquet, потому что я записываю данные через s3a. Есть ли способ прочитать такие файлы?