🔥 PySpark Tutorial: Flatten Arrays and Structs
Learn how to use explode(), inline(), and struct() in PySpark to work with nested array and struct data efficiently.
📦 Sample Data
# Toy dataset: each record is (person name, list of fruits for that person).
data_array = [
    ("Aamir", ["apple", "banana", "cherry"]),
    ("Sara", ["orange", "grape"]),
    ("John", ["melon", "kiwi", "pineapple"]),
    ("Lina", ["pear", "peach"]),
]
# NOTE: `spark` is not defined in this snippet; it is assumed to be an
# already-active SparkSession (e.g. the one a notebook provides).
# Only column names are given, so the column types are left to Spark.
df_array = spark.createDataFrame(data_array, schema=["name", "fruits"])
df_array.show()
Output:
+-----+------------------------+
| name|                  fruits|
+-----+------------------------+
|Aamir| [apple, banana, cherry]|
| Sara|         [orange, grape]|
| John|[melon, kiwi, pineapple]|
| Lina|           [pear, peach]|
+-----+------------------------+
💥 explode() – Flatten Arrays
# explode() must be imported before use; without this line the snippet
# fails with NameError.
from pyspark.sql.functions import explode

# Turn each element of the `fruits` array into its own row, keeping the
# person's `name` alongside it; the generated column is named `fruit`.
df_exploded = df_array.select("name", explode("fruits").alias("fruit"))
df_exploded.show()
Output:
+-----+---------+
| name|    fruit|
+-----+---------+
|Aamir|    apple|
|Aamir|   banana|
|Aamir|   cherry|
| Sara|   orange|
| Sara|    grape|
| John|    melon|
| John|     kiwi|
| John|pineapple|
| Lina|     pear|
| Lina|    peach|
+-----+---------+
🧱 struct() – Create Struct from Columns
# struct() must be imported before use; without this line the snippet
# fails with NameError.
from pyspark.sql.functions import struct

# Wrap the `fruits` column in a single struct column named `fruit_struct`,
# selected next to the original `name` column.
df_struct = df_array.select("name", struct("fruits").alias("fruit_struct"))
df_struct.show(truncate=False)
🧩 inline() – Flatten Array of Structs
from pyspark.sql.functions import inline
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

# Each record is (person name, list of {fruit, color} entries).
data_struct = [
    (
        "Aamir",
        [
            {"fruit": "apple", "color": "red"},
            {"fruit": "banana", "color": "yellow"},
        ],
    ),
    (
        "Sara",
        [
            {"fruit": "orange", "color": "orange"},
        ],
    ),
    (
        "John",
        [
            {"fruit": "melon", "color": "green"},
            {"fruit": "kiwi", "color": "brown"},
        ],
    ),
    (
        "Lina",
        [
            {"fruit": "pear", "color": "green"},
            {"fruit": "peach", "color": "pink"},
        ],
    ),
]

# Explicit schema: `name` is a nullable string and `fruits` is a nullable
# array whose elements are structs with nullable string fields `fruit` and
# `color`. Built with StructType.add(), which appends one field per call.
schema = (
    StructType()
    .add(StructField("name", StringType(), True))
    .add(
        StructField(
            "fruits",
            ArrayType(
                StructType()
                .add(StructField("fruit", StringType(), True))
                .add(StructField("color", StringType(), True))
            ),
            True,
        )
    )
)

# NOTE: `spark` is not defined in this snippet; it is assumed to be an
# already-active SparkSession.
df_struct = spark.createDataFrame(data_struct, schema)

# inline() expands the array of structs: one output row per array element,
# with the struct's fields (`fruit`, `color`) becoming top-level columns.
df_inline = df_struct.select("name", inline("fruits"))
df_inline.show()


No comments:
Post a Comment
Note: Only a member of this blog may post a comment.