I have the Spark DataFrame below, which I am using to create another DataFrame with a defined schema.
+----------+-----------------------+----------------------+-----------+------+-------------+------------+
|TECHNOLOGY|KPI_NAME |FUNCTIONS |DESCRIPTION|ACTION|FORMULA_VALID|VALIDITY_LOG|
+----------+-----------------------+----------------------+-----------+------+-------------+------------+
|GSM |Cell_Availability_test3|{SUM, SUM, NULL, NULL}|NULL |ADD |true |[] |
+----------+-----------------------+----------------------+-----------+------+-------------+------------+
It has the following schema:
root
|-- TECHNOLOGY: string (nullable = true)
|-- KPI_NAME: string (nullable = true)
|-- FUNCTIONS: struct (nullable = true)
| |-- fun_temporal: string (nullable = true)
| |-- fun_regional: string (nullable = true)
| |-- fun_temporal_unit: map (nullable = true)
| | |-- key: string
| | |-- value: string (valueContainsNull = true)
| |-- fun_regional_unit: map (nullable = true)
| | |-- key: string
| | |-- value: string (valueContainsNull = true)
|-- DESCRIPTION: string (nullable = true)
|-- ACTION: string (nullable = true)
|-- FORMULA_VALID: boolean (nullable = false)
|-- VALIDITY_LOG: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- key: string (nullable = true)
| | |-- value: string (nullable = true)
// Target schema that is applied to the selected rows via spark.createDataFrame.
// In each ArrayType(..., false) the flag is containsNull; the flag that closes each
// StructField(...) is that field's nullable setting.
// NOTE(review): fun_temporal_unit, fun_regional_unit and VALIDITY_LOG are declared
// nullable = false here, while the printed source schema marks them nullable = true and
// the sample row shows NULL for both *_unit fields.
// NOTE(review): the printed source schema lists the *_unit fields as map<string,string>,
// whereas this schema declares them as array<struct<key,value>> - confirm the rows are
// converted before this schema is applied.
val outputTypeTest: StructType = StructType(Seq(
StructField("TECHNOLOGY", StringType, true),
StructField("KPI_NAME", StringType, true),
StructField("FUNCTIONS", StructType(Seq(
StructField("fun_temporal", StringType, true),
StructField("fun_regional", StringType, true),
// array of (key, value) string structs, declared non-nullable
StructField("fun_temporal_unit", ArrayType(StructType(Seq(
StructField("key", StringType, true),
StructField("value", StringType, true))), false), false),
// same shape and nullability as fun_temporal_unit
StructField("fun_regional_unit", ArrayType(StructType(Seq(
StructField("key", StringType, true),
StructField("value", StringType, true))), false), false))), true),
StructField("DESCRIPTION", StringType, true),
StructField("ACTION", StringType, true),
StructField("FORMULA_VALID", BooleanType, true),
// array of (key, value) string structs, declared non-nullable
StructField("VALIDITY_LOG", ArrayType(StructType(Seq(
StructField("key", StringType, true),
StructField("value", StringType, true))), false), false)))
// Pick the seven columns in the same order in which outputTypeTest lists its fields.
val formulaMappingOutputNotTypedTest= formulaMappingOutputNotTyped.select("TECHNOLOGY","KPI_NAME","FUNCTIONS","DESCRIPTION","ACTION","FORMULA_VALID","VALIDITY_LOG")
// Print the selected rows without truncating long cell values.
formulaMappingOutputNotTypedTest.show(truncate = false)
// Re-wrap the underlying RDD[Row] with the explicit schema.
// NOTE(review): the rows are passed through unchanged here (no NULL replacement, no
// map-to-array conversion), so they presumably have to satisfy outputTypeTest as-is - verify.
val formulaMappingOutput = spark.createDataFrame(formulaMappingOutputNotTypedTest.rdd, outputTypeTest)
This fails with the following error:
Caused by: java.lang.RuntimeException: The 2th field 'fun_temporal_unit' of input row cannot be null.
I have the Spark DataFrame below, which I am using to create another DataFrame with a defined schema.
+----------+-----------------------+----------------------+-----------+------+-------------+------------+
|TECHNOLOGY|KPI_NAME |FUNCTIONS |DESCRIPTION|ACTION|FORMULA_VALID|VALIDITY_LOG|
+----------+-----------------------+----------------------+-----------+------+-------------+------------+
|GSM |Cell_Availability_test3|{SUM, SUM, NULL, NULL}|NULL |ADD |true |[] |
+----------+-----------------------+----------------------+-----------+------+-------------+------------+
It has the following schema:
root
|-- TECHNOLOGY: string (nullable = true)
|-- KPI_NAME: string (nullable = true)
|-- FUNCTIONS: struct (nullable = true)
| |-- fun_temporal: string (nullable = true)
| |-- fun_regional: string (nullable = true)
| |-- fun_temporal_unit: map (nullable = true)
| | |-- key: string
| | |-- value: string (valueContainsNull = true)
| |-- fun_regional_unit: map (nullable = true)
| | |-- key: string
| | |-- value: string (valueContainsNull = true)
|-- DESCRIPTION: string (nullable = true)
|-- ACTION: string (nullable = true)
|-- FORMULA_VALID: boolean (nullable = false)
|-- VALIDITY_LOG: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- key: string (nullable = true)
| | |-- value: string (nullable = true)
// Target schema that is applied to the selected rows via spark.createDataFrame.
// In each ArrayType(..., false) the flag is containsNull; the flag that closes each
// StructField(...) is that field's nullable setting.
// NOTE(review): fun_temporal_unit, fun_regional_unit and VALIDITY_LOG are declared
// nullable = false here, while the printed source schema marks them nullable = true and
// the sample row shows NULL for both *_unit fields.
// NOTE(review): the printed source schema lists the *_unit fields as map<string,string>,
// whereas this schema declares them as array<struct<key,value>> - confirm the rows are
// converted before this schema is applied.
val outputTypeTest: StructType = StructType(Seq(
StructField("TECHNOLOGY", StringType, true),
StructField("KPI_NAME", StringType, true),
StructField("FUNCTIONS", StructType(Seq(
StructField("fun_temporal", StringType, true),
StructField("fun_regional", StringType, true),
// array of (key, value) string structs, declared non-nullable
StructField("fun_temporal_unit", ArrayType(StructType(Seq(
StructField("key", StringType, true),
StructField("value", StringType, true))), false), false),
// same shape and nullability as fun_temporal_unit
StructField("fun_regional_unit", ArrayType(StructType(Seq(
StructField("key", StringType, true),
StructField("value", StringType, true))), false), false))), true),
StructField("DESCRIPTION", StringType, true),
StructField("ACTION", StringType, true),
StructField("FORMULA_VALID", BooleanType, true),
// array of (key, value) string structs, declared non-nullable
StructField("VALIDITY_LOG", ArrayType(StructType(Seq(
StructField("key", StringType, true),
StructField("value", StringType, true))), false), false)))
// Pick the seven columns in the same order in which outputTypeTest lists its fields.
val formulaMappingOutputNotTypedTest= formulaMappingOutputNotTyped.select("TECHNOLOGY","KPI_NAME","FUNCTIONS","DESCRIPTION","ACTION","FORMULA_VALID","VALIDITY_LOG")
// Print the selected rows without truncating long cell values.
formulaMappingOutputNotTypedTest.show(truncate = false)
// Re-wrap the underlying RDD[Row] with the explicit schema.
// NOTE(review): the rows are passed through unchanged here (no NULL replacement, no
// map-to-array conversion), so they presumably have to satisfy outputTypeTest as-is - verify.
val formulaMappingOutput = spark.createDataFrame(formulaMappingOutputNotTypedTest.rdd, outputTypeTest)
This fails with the following error:
Caused by: java.lang.RuntimeException: The 2th field 'fun_temporal_unit' of input row cannot be null.
It looks like your `fun_temporal_unit` field is defined as non-nullable, but the corresponding value in your original DataFrame is null.
I'd suggest either replacing the null values with an empty array or defining the field as nullable - whichever is more appropriate for you.
Something like (not tested, and possibly not the most elegant)
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{coalesce, col, map_entries, struct, typedLit, when}

// Element layout the target schema expects for both *_unit fields.
val unitEntriesType = "array<struct<key:string,value:string>>"

/** Reads `FUNCTIONS.<field>` (a map<string,string> in the source DataFrame) as an array of
  * (key, value) structs, substituting an empty array when the map is NULL.
  *
  * @param field name of the map field inside the FUNCTIONS struct
  * @return a column aliased to `field`, so the rebuilt struct keeps the original field name
  */
def unitEntries(field: String): Column =
  coalesce(
    // map<string,string> -> array<struct<key,value>>
    map_entries(col(s"FUNCTIONS.$field")).cast(unitEntriesType),
    // lit(Array.empty[Row]) is not a supported literal, so build a typed empty array
    // and cast it to the same element layout as the first branch.
    typedLit(Seq.empty[(String, String)]).cast(unitEntriesType)
  ).as(field)

val formulaMappingOutputNotTypedTest =
  formulaMappingOutputNotTyped
    .select(
      "TECHNOLOGY", "KPI_NAME", "FUNCTIONS", "DESCRIPTION",
      "ACTION", "FORMULA_VALID", "VALIDITY_LOG"
    )
    // replace old field FUNCTIONS with a new one:
    .withColumn(
      "FUNCTIONS",
      // `when` without `otherwise` yields NULL, so a NULL FUNCTIONS struct stays NULL
      when(
        col("FUNCTIONS").isNotNull,
        struct(
          col("FUNCTIONS.fun_temporal"),
          col("FUNCTIONS.fun_regional"),
          // both *_unit fields are non-nullable in the target schema, so both need a default
          unitEntries("fun_temporal_unit"),
          unitEntries("fun_regional_unit")
        )
      )
    )
or
// Alternative: keep the NULLs and relax the schema instead.
StructField("fun_temporal_unit", ArrayType(StructType(Seq(
StructField("key", StringType, true),
StructField("value", StringType, true))), false), true),
// ^ here: the final argument (this StructField's nullable flag) is now true instead of false.
// NOTE(review): fun_regional_unit is declared the same way in the question and is also NULL
// in the sample row, so it presumably needs the same change - confirm.
key
being nullable=undefined in this struct? – mazaneicha Commented Feb 3 at 15:49