I have a use case where I want to replace nulls in struct type columns with empty values.
Below is sample which you can use to recreate the scenario:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
data = [(1, [1, 'Aman']), (2, [2, "Raman"]),(3, [3, "Baman"]),(4,None)]
schema = (StructType([
StructField("ID",IntegerType(),True),
StructField("Name",StructType([
StructField("NameID",IntegerType(),True),
StructField("FirstName",StringType(),True),
]),True)
]))
df = spark.createDataFrame(data, schema)
df.show()
I want to know how we can replace nulls in Name column.
>Solution :
Using when
expression:
from pyspark.sql import functions as F
df1 = df.withColumn(
"Name",
F.when(
F.col("Name").isNull(),
F.struct(
F.lit(None).alias("NameID"), #replace None by default ID value if needed
F.lit("").alias("FirstName")
)
).otherwise(F.col("Name"))
)
df1.show()
#+---+----------+
#| ID| Name|
#+---+----------+
#| 1| {1, Aman}|
#| 2|{2, Raman}|
#| 3|{3, Baman}|
#| 4| {null, }|
#+---+----------+