I am very new to Spark (I have only just started learning it), and I have run into a recursion error in a very simple piece of code.
Background:
Spark Version 3.5.7
Java Version 11.0.29 (Eclipse Adoptium)
Python 3.14
Everything is installed on a single local machine (i.e. Spark runs in "local mode").
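In case it helps, the versions above can be confirmed from inside the notebook like this (just a quick sanity check; the last two lines only work once the session from cell 2 below exists):

import sys
import pyspark

print(sys.version)          # Python interpreter driving the notebook
print(pyspark.__version__)  # PySpark package installed in this environment
# After cell 2 has created the session:
# print(spark.version)              # Spark version reported by the JVM
# print(spark.sparkContext.master)  # shows local[*] when running in local mode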
I have the following code in my Jupyter notebook.
#cell1
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType
#cell2
spark = SparkSession.builder.appName("Test").getOrCreate()
#cell3. Test1 (runs fine, shows the top 5 rows of myRange)
myRange = spark.range(1000).toDF("number")
myRange.show(5)
#cell4. Test2 (throws a recursion error)
myManualSchema = StructType([
    StructField("some", StringType(), True),
    StructField("col", StringType(), True),
    StructField("names", LongType(), False)
])
myRow = Row("hello", None, 1)
df = spark.createDataFrame([myRow], myManualSchema)
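For reference, the same data can also be passed as a plain tuple instead of a Row; as far as I understand, this should go through the same createDataFrame serialization path, so I assume the Row object itself is not the issue:

#cell5. Same call with a plain tuple instead of a Row (assumed equivalent)
df2 = spark.createDataFrame([("hello", None, 1)], myManualSchema)
df2.show()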
Basically, creating a simple Spark DataFrame throws an error:
PicklingError Traceback (most recent call last)
Cell In[5], line 1
----> 1 df = spark.createDataFrame([myRow], myManualSchema)
File C:\spark\spark_env\Lib\site-packages\pyspark\sql\session.py:1443, in SparkSession.createDataFrame(self, data, schema, samplingRatio, verifySchema)
1438 if has_pandas and isinstance(data, pd.DataFrame):
1439 # Create a DataFrame from pandas DataFrame.
1440 return super(SparkSession, self).createDataFrame( # type: ignore[call-overload]
1441 data, schema, samplingRatio, verifySchema
1442 )
-> 1443 return self._create_dataframe(
1444 data, schema, samplingRatio, verifySchema # type: ignore[arg-type]
1445 )
File C:\spark\spark_env\Lib\site-packages\pyspark\sql\session.py:1487, in SparkSession._create_dataframe(self, data, schema, samplingRatio, verifySchema)
1485 rdd, struct = self._createFromLocal(map(prepare, data), schema)
1486 assert self._jvm is not None
-> 1487 jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
1488 jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), struct.json())
1489 df = DataFrame(jdf, self)
File C:\spark\spark_env\Lib\site-packages\pyspark\rdd.py:4918, in RDD._to_java_object_rdd(self)
4915 rdd = self._pickled()
4916 assert self.ctx._jvm is not None
-> 4918 return self.ctx._jvm.SerDeUtil.pythonToJava(rdd._jrdd, True)
File C:\spark\spark_env\Lib\site-packages\pyspark\rdd.py:5470, in PipelinedRDD._jrdd(self)
5467 else:
5468 profiler = None
-> 5470 wrapped_func = _wrap_function(
5471 self.ctx, self.func, self._prev_jrdd_deserializer, self._jrdd_deserializer, profiler
5472 )
5474 assert self.ctx._jvm is not None
5475 python_rdd = self.ctx._jvm.PythonRDD(
5476 self._prev_jrdd.rdd(), wrapped_func, self.preservesPartitioning, self.is_barrier
5477 )
File C:\spark\spark_env\Lib\site-packages\pyspark\rdd.py:5268, in _wrap_function(sc, func, deserializer, serializer, profiler)
5266 assert serializer, "serializer should not be empty"
5267 command = (func, profiler, deserializer, serializer)
-> 5268 pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
5269 assert sc._jvm is not None
5270 return sc._jvm.SimplePythonFunction(
5271 bytearray(pickled_command),
5272 env,
(...)
5277 sc._javaAccumulator,
5278 )
File C:\spark\spark_env\Lib\site-packages\pyspark\rdd.py:5251, in _prepare_for_python_RDD(sc, command)
5248 def _prepare_for_python_RDD(sc: "SparkContext", command: Any) -> Tuple[bytes, Any, Any, Any]:
5249 # the serialized command will be compressed by broadcast
5250 ser = CloudPickleSerializer()
-> 5251 pickled_command = ser.dumps(command)
5252 assert sc._jvm is not None
5253 if len(pickled_command) > sc._jvm.PythonUtils.getBroadcastThreshold(sc._jsc): # Default 1M
5254 # The broadcast will have same life cycle as created PythonRDD
File C:\spark\spark_env\Lib\site-packages\pyspark\serializers.py:469, in CloudPickleSerializer.dumps(self, obj)
467 msg = "Could not serialize object: %s: %s" % (e.__class__.__name__, emsg)
468 print_exec(sys.stderr)
--> 469 raise pickle.PicklingError(msg)
PicklingError: Could not serialize object: RecursionError: Stack overflow (used 2912 kB)
I would appreciate any help or advice.