pyspark max of two columns

from pyspark.sql.types import *

schema = StructType([
    StructField("ClientId", IntegerType(), True),
    StructField("m_ant21", IntegerType(), True),
    StructField("m_ant22", IntegerType(), True),
    StructField("m_ant23", IntegerType(), True),
    StructField("m_ant24", IntegerType(), True)
])

df = spark\
    .createDataFrame(
        data=[(0, None, None, None, None),
             (1, 23, 13, 17, 99),
             (2, 0, 0, 0, 1),
             (3, 0, None, 1, 0)],
        schema=schema)

import pyspark.sql.functions as F

def agg_to_list(m21,m22,m23,m24):
    return [m21,m22,m23,m24]

u_agg_to_list = F.udf(agg_to_list, ArrayType(IntegerType()))

df2 = df.withColumn('all_values', u_agg_to_list('m_ant21', 'm_ant22', 'm_ant23', 'm_ant24'))\
        .withColumn('max', F.sort_array("all_values", False)[0])\
        .select('ClientId', 'max')

df2.show()

Posted by: Guest on April-14-2021

Source

Code answers related to "pyspark max of two columns"

Code answers related to "Whatever"

Browse Popular Code Answers by Language

Answers for "pyspark max of two columns"

Code answers related to "pyspark max of two columns"

Code answers related to "Whatever"

Browse Popular Code Answers by Language

Popular Programming Languages

Advertisements

Company

Compilers

Help

Connect with us