"An error occurred while calling o266.fit" when fitting the training dataset in a Spark NLP pipeline
Calling fit() on the Spark NLP pipeline below fails with "An error occurred while calling o266.fit" (full traceback at the end).
Environment:
openjdk version "1.8.0_252"
Spark NLP version: 2.5.0
Apache Spark version: 2.4.4
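For completeness, these versions can be confirmed at runtime (a minimal check using the standard sparknlp and PySpark APIs, run after the sparknlp.start() call shown below):
import sparknlp
# prints the Spark NLP and Apache Spark versions of the running session
print("Spark NLP version:", sparknlp.version())
print("Apache Spark version:", spark.version)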
The training data is the train.csv file from the Kaggle competition at https://www.kaggle.com/c/nlp-getting-started.
Code:
import sparknlp
spark = sparknlp.start()
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd
trainDataset = spark.read \
    .option("header", True) \
    .csv("/content/drive/My Drive/Dataset/train.csv")
trainDataset.show(10, truncate=50)
trainDatasetcleaned = trainDataset.drop('keyword').drop('location')
trainDatasetr = trainDatasetcleaned.selectExpr("id as id", "text as text", "target as out")
document = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document") \
    .setCleanupMode("shrink")
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")
# StopWordsCleaner stage used in the pipeline below; input/output columns
# assumed from the surrounding stages (the lemmatizer reads "cleanTokens")
stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(["normalized"]) \
    .setOutputCol("cleanTokens")
lemma = LemmatizerModel.pretrained('lemma_antbnc') \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")
word_embeddings = BertEmbeddings \
    .pretrained('bert_base_cased', 'en') \
    .setInputCols(["document", "lemma"]) \
    .setOutputCol("embeddings") \
    .setPoolingLayer(0)
embeddingsSentence = SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")
classifierdl = ClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("class") \
    .setLabelColumn("out") \
    .setMaxEpochs(5) \
    .setEnableOutputLogs(True)
clf_pipeline = Pipeline(
    stages=[
        document,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        lemma,
        word_embeddings,
        embeddingsSentence,
        classifierdl
    ])
clf_pipelineModel = clf_pipeline.fit(trainDatasetr)
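For reference, the label column that ClassifierDLApproach trains on (the "target" column renamed to "out" above) can be inspected with a quick sanity check like the following (a minimal sketch using standard PySpark DataFrame calls, not part of the failing run):
# schema and label distribution of the training DataFrame
trainDatasetr.printSchema()
trainDatasetr.groupBy("out").count().show(truncate=False)
# number of rows whose label is null
print(trainDatasetr.filter("out is null").count())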
Error:
Py4JJavaError Traceback (most recent call last)
<ipython-input-36-9f19ac3525f1> in <module>()
----> 1 clf_pipelineModel = clf_pipeline.fit(trainDatasetr,params=None)
/usr/local/lib/python3.6/dist-packages/pyspark/ml/base.py in fit(self, dataset, params)
130 return self.copy(params)._fit(dataset)
131 else:
--> 132 return self._fit(dataset)
133 else:
134 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
/usr/local/lib/python3.6/dist-packages/pyspark/ml/pipeline.py in _fit(self, dataset)
107 dataset = stage.transform(dataset)
108 else: # must be an Estimator
--> 109 model = stage.fit(dataset)
110 transformers.append(model)
111 if i < indexOfLastEstimator:
/usr/local/lib/python3.6/dist-packages/pyspark/ml/base.py in fit(self, dataset, params)
130 return self.copy(params)._fit(dataset)
131 else:
--> 132 return self._fit(dataset)
133 else:
134 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
/usr/local/lib/python3.6/dist-packages/pyspark/ml/wrapper.py in _fit(self, dataset)
293
294 def _fit(self, dataset):
--> 295 java_model = self._fit_java(dataset)
296 model = self._create_model(java_model)
297 return self._copyValues(model)
/usr/local/lib/python3.6/dist-packages/pyspark/ml/wrapper.py in _fit_java(self, dataset)
290 """
291 self._transfer_params_to_java()
--> 292 return self._java_obj.fit(dataset._jdf)
293
294 def _fit(self, dataset):
/usr/local/lib/python3.6/dist-packages/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/usr/local/lib/python3.6/dist-packages/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/usr/local/lib/python3.6/dist-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o266.fit.
: java.lang.NullPointerException
at com.johnsnowlabs.nlp.annotators.classifier.dl.ClassifierDLApproach$$anonfun$1.apply(ClassifierDLApproach.scala:252)
at com.johnsnowlabs.nlp.annotators.classifier.dl.ClassifierDLApproach$$anonfun$1.apply(ClassifierDLApproach.scala:252)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at com.johnsnowlabs.nlp.annotators.classifier.dl.ClassifierDLApproach.train(ClassifierDLApproach.scala:252)
at com.johnsnowlabs.nlp.annotators.classifier.dl.ClassifierDLApproach.train(ClassifierDLApproach.scala:39)
at com.johnsnowlabs.nlp.AnnotatorApproach._fit(AnnotatorApproach.scala:55)
at com.johnsnowlabs.nlp.AnnotatorApproach.fit(AnnotatorApproach.scala:61)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)