Working with Structs and Nested Fields in PySpark
In this PySpark tutorial, we’ll explore how to work with StructType columns and nested fields in DataFrames using Column methods such as getField(), getItem(), withField(), and dropFields().
Sample Data
# Sample rows laid out as (id, full_name, country).
# - full_name is a struct, supplied as a dict keyed by the struct's field names.
# - country is a top-level column, because the examples that follow select it
#   directly with col("country") / select(..., "country").
# Assumes `spark` is an active SparkSession (e.g. a notebook or pyspark shell).
data = [
    (1, {"first": "Aamir", "last": "Shahzad"}, "Pakistan"),
    (2, {"first": "Ali", "last": "Raza"}, "USA"),
    (3, {"first": "Lisa", "last": "Brown"}, "UK"),
]

# DDL schema string. A bare `STRUCT` with no field list is not a valid type,
# so the nested fields and their types are spelled out explicitly.
schema = "id INT, full_name STRUCT<first: STRING, last: STRING>, country STRING"

df = spark.createDataFrame(data, schema=schema)
df.show(truncate=False)
getField() - Access specific nested field
# Pull individual fields out of the full_name struct by name with getField(),
# giving each extracted column a flat, readable alias.
first_name_field = col("full_name").getField("first").alias("first_name")
last_name_field = col("full_name").getField("last").alias("last_name")

# country is selected alongside the extracted fields as an ordinary column.
df.select(first_name_field, last_name_field, col("country")).show()
getItem() - Access fields like a dictionary
# Same extraction as above, but using getItem() with the field name as the key,
# which reads like a dictionary lookup on the struct column.
first_name_item = col("full_name").getItem("first").alias("first_name_item")
last_name_item = col("full_name").getItem("last").alias("last_name_item")

df.select(first_name_item, last_name_item, col("country")).show()
withField() - Add or update nested field
# Build a new full_name struct that carries an extra "middle" field holding a
# constant literal, then overwrite the existing full_name column with it.
middle_value = lit("MiddleName.")
full_name_with_middle = col("full_name").withField("middle", middle_value)
df_with_middle = df.withColumn("full_name", full_name_with_middle)

# Show the struct in full so the added field is visible.
df_with_middle.select("id", "full_name", "country").show(truncate=False)
dropFields() - Remove a nested field
# Starting from the DataFrame that already has the "middle" field, rebuild the
# full_name struct without its "last" field and replace the column with it.
full_name_without_last = col("full_name").dropFields("last")
df_dropped = df_with_middle.withColumn("full_name", full_name_without_last)

# Show the struct in full so the removed field is visibly absent.
df_dropped.select("id", "full_name", "country").show(truncate=False)
No comments:
Post a Comment
Note: Only a member of this blog may post a comment.