Working with Structs and Nested Fields in PySpark
In this PySpark tutorial, we’ll explore how to work with StructType and nested fields in DataFrames using powerful functions like getField(), getItem(), withField(), and dropFields().
Sample Data
# Column helpers used by the struct examples that follow.
from pyspark.sql.functions import col, lit

# Each row is (id, full_name); the dict supplies the fields of the struct column.
data = [
    (1, {"first": "Aamir", "last": "Shahzad", "country": "Pakistan"}),
    (2, {"first": "Ali", "last": "Raza", "country": "USA"}),
    (3, {"first": "Lisa", "last": "Brown", "country": "UK"}),
]

# A struct type in a DDL schema string has to spell out its fields; the field
# names here match the dictionary keys used in `data`.
schema = "id INT, full_name STRUCT<first: STRING, last: STRING, country: STRING>"

# NOTE: assumes an active SparkSession is already bound to `spark`
# (as in a notebook or the pyspark shell).
df = spark.createDataFrame(data, schema=schema)
df.show(truncate=False)
getField() - Access specific nested field
# Pull individual fields out of the `full_name` struct by name.
# `country` is stored inside the struct (see the sample data), so it is read
# with getField() as well rather than as a top-level column.
df.select(
    col("full_name").getField("first").alias("first_name"),
    col("full_name").getField("last").alias("last_name"),
    col("full_name").getField("country").alias("country"),
).show()
getItem() - Access fields like a dictionary
# Same lookups using getItem(), which addresses the struct fields by key.
# `country` is a field of the struct (see the sample data), so it is fetched
# through getItem() too rather than as a top-level column.
df.select(
    col("full_name").getItem("first").alias("first_name_item"),
    col("full_name").getItem("last").alias("last_name_item"),
    col("full_name").getItem("country").alias("country"),
).show()
withField() - Add or update nested field
# withField() returns the struct with the named field added (or replaced if it
# already exists); withColumn() then overwrites `full_name` with that result.
df_with_middle = df.withColumn(
    "full_name",
    col("full_name").withField("middle", lit("MiddleName.")),
)

# `country` is a field inside `full_name`, not a separate column, so selecting
# the struct already displays it.
df_with_middle.select("id", "full_name").show(truncate=False)
dropFields() - Remove a nested field
# dropFields() returns the struct without the named field; the remaining
# fields of `full_name` are kept as they are.
df_dropped = df_with_middle.withColumn(
    "full_name",
    col("full_name").dropFields("last"),
)

# `country` is a field inside `full_name`, not a separate column, so selecting
# the struct already displays it.
df_dropped.select("id", "full_name").show(truncate=False)



No comments:
Post a Comment
Note: Only a member of this blog may post a comment.