Python spark: IndexError: индекс кортежа вне диапазона
Я работаю со Spark и Python.
Когда я вызываю любое действие над CSV-файлом, я получаю IndexError: tuple index out of range
вот фрагмент кода.
test_rdd = sc.textFile("/mapr/data/airflow-test.csv").map(lambda line: line.split())
print(test_rdd.first())
print(test_rdd.count())
Вот полный Traceback.
File "pysparktask.py", line 15, in <module>
print(test_rdd.first())
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/rdd.py", line 1328, in first
rs = self.take(1)
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/rdd.py", line 1310, in take
res = self.context.runJob(self, takeUpToNumLeft, p)
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/context.py", line 933, in runJob
port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/rdd.py", line 2403, in _jrdd
self._jrdd_deserializer, profiler)
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/rdd.py", line 2336, in _wrap_function
pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/rdd.py", line 2315, in _prepare_for_python_RDD
pickled_command = ser.dumps(command)
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/serializers.py", line 428, in dumps
return cloudpickle.dumps(obj, 2)
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/cloudpickle.py", line 657, in dumps
cp.dump(obj)
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/cloudpickle.py", line 107, in dump
return Pickler.dump(self, obj)
File "/anaconda3/lib/python3.6/pickle.py", line 409, in dump
self.save(obj)
File "/anaconda3/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/anaconda3/lib/python3.6/pickle.py", line 751, in save_tuple
save(element)
File "/anaconda3/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/cloudpickle.py", line 204, in save_function
self.save_function_tuple(obj)
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/cloudpickle.py", line 241, in save_function_tuple
save((code, closure, base_globals))
File "/anaconda3/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/anaconda3/lib/python3.6/pickle.py", line 736, in save_tuple
save(element)
File "/anaconda3/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/anaconda3/lib/python3.6/pickle.py", line 781, in save_list
self._batch_appends(obj)
File "/anaconda3/lib/python3.6/pickle.py", line 805, in _batch_appends
save(x)
File "/anaconda3/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/cloudpickle.py", line 204, in save_function
self.save_function_tuple(obj)
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/cloudpickle.py", line 241, in save_function_tuple
save((code, closure, base_globals))
File "/anaconda3/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/anaconda3/lib/python3.6/pickle.py", line 736, in save_tuple
save(element)
File "/anaconda3/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/anaconda3/lib/python3.6/pickle.py", line 781, in save_list
self._batch_appends(obj)
File "/anaconda3/lib/python3.6/pickle.py", line 808, in _batch_appends
save(tmp[0])
File "/anaconda3/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/cloudpickle.py", line 204, in save_function
self.save_function_tuple(obj)
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/cloudpickle.py", line 234, in save_function_tuple
code, f_globals, defaults, closure, dct, base_globals = self.extract_func_data(func)
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/cloudpickle.py", line 296, in extract_func_data
func_global_refs = self.extract_code_globals(code)
File "/opt/mapr/spark/spark-2.0.1/python/pyspark/cloudpickle.py", line 278, in extract_code_globals
out_names.add(names[oparg])
IndexError: tuple index out of range
какие-либо предложения?
1 ответ
Решение
Вы используете неподдерживаемую версию Python. На сегодняшний день ни одна выпущенная версия Spark, включая Spark 2.0.1, не поддерживает Python 3.6.
Вы должны понизить версию Python до 3.5 или применить патч https://github.com/apache/spark/pull/17374
Подробное описание проблемы можно найти в SPARK-19019.