You can create a UDF like so:
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType
def gini(list_of_values):
    """Compute a single number from the values collected for one group.

    Template only: the actual computation is left to the reader and must
    assign ``number_output`` before it is returned.

    Args:
        list_of_values: The values handed in by the caller; in this example
            that is the ``event_duration_list`` column built with
            ``collect_list``.

    Returns:
        The computed number. NOTE(review): the function is registered with
        ``FloatType()``, so this should presumably be a plain Python
        ``float`` -- confirm against the PySpark UDF documentation.
    """
    # TODO: compute number_output from list_of_values here.
    return number_output
# Wrap the plain Python function as a Spark UDF with a float return type.
udf_gini = F.udf(gini, returnType=FloatType())

# Per activity: gather the durations into one list column, then apply the
# UDF to that list to produce the "gini" column.
(
    df.groupby("activity")
    .agg(
        F.collect_list("mean_event_duration_in_hours").alias(
            "event_duration_list"
        )
    )
    .withColumn("gini", udf_gini(F.col("event_duration_list")))
)
Alternatively, define gini as a UDF directly by using the decorator form:
@F.udf(returnType=FloatType())
def gini(list_of_values):
    """Compute a single number from the values collected for one group.

    The decorator turns this function into a Spark UDF declared to return
    ``FloatType()``, so the name ``gini`` can be applied to columns
    directly. ``F.udf`` is used because only ``pyspark.sql.functions as F``
    is imported; a bare ``udf`` name is not in scope.

    Template only: the actual computation is left to the reader and must
    assign ``number_output`` before it is returned.

    Args:
        list_of_values: The values handed in by the caller; in this example
            that is the ``event_duration_list`` column built with
            ``collect_list``.

    Returns:
        The computed number. NOTE(review): given the declared
        ``FloatType()``, this should presumably be a plain Python
        ``float`` -- confirm against the PySpark UDF documentation.
    """
    # TODO: compute number_output from list_of_values here.
    return number_output
# Per activity: gather the durations into one list column, then apply the
# decorated UDF to that list to produce the "gini" column.
(
    df.groupby("activity")
    .agg(
        F.collect_list("mean_event_duration_in_hours").alias(
            "event_duration_list"
        )
    )
    .withColumn("gini", gini(F.col("event_duration_list")))
)