# use findspark to locate the local Spark installation before importing pyspark
import findspark
findspark.init('/home/yoshi-1/spark-3.1.1-bin-hadoop2.7')
from pyspark.sql import SparkSession
# create spark session
spark = SparkSession.builder.appName('myFirstSparkSession').getOrCreate()
df = spark.read.csv(
'/home/yoshi-1/ダウンロード/spark-with-python-master/employee.csv',
header=True,
inferSchema=True)
df.show()
df.columns
df.printSchema()
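# Sketch (assumption): instead of inferSchema, the schema can be declared explicitly
# with StructType, which avoids an extra pass over the file. The column order and
# types below are assumptions based on how the columns are used in this script.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
employee_schema = StructType([
    StructField('employee_id', IntegerType(), True),
    StructField('employee_name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('location', StringType(), True),
    StructField('hours', IntegerType(), True),
])
df_typed = spark.read.csv(
    '/home/yoshi-1/ダウンロード/spark-with-python-master/employee.csv',
    header=True,
    schema=employee_schema)
df_typed.printSchema()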
df.select(['employee_name', 'age', 'location']).show()
df_new = df.withColumn('overtime_time', df.hours * 3)
df_new.show()
df_rename = df.withColumnRenamed('hours', 'working_hours')
df_rename.show()
df_new = df_rename.drop("working_hours")  # drop the renamed column (the original df has no 'working_hours' column)
df.head(3)
df.describe().show()
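# Sketch: summary() (Spark 2.3+) is a flexible alternative to describe()
# that also accepts percentiles.
df.summary('count', 'mean', 'min', '50%', 'max').show()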
# create a Spark session
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()
# load the data
df = spark.read.csv(
'/home/yoshi-1/ダウンロード/spark-with-python-master/employee.csv',
header=True,
inferSchema=True)
# register df as a temporary view named "associates" so it can be queried with SQL
df.createOrReplaceTempView("associates")
# query the view with SQL
sql_result_1 = spark.sql("SELECT * FROM associates")
sql_result_1.show()
# SQL query 2: filter rows with a WHERE clause
sql_result_2 = spark.sql("SELECT * FROM associates WHERE age BETWEEN 45 AND 60 AND location='California'")
sql_result_2.show()
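# Sketch: the same filter expressed with the DataFrame API instead of SQL.
df.filter(df['age'].between(45, 60) & (df['location'] == 'California')).show()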
df.show()
# add a derived column with SQL
sql = '''
SELECT
employee_id,
employee_name,
age,
hours,
hours + 100 as overtime_work
FROM
associates
'''
sql_result_3 = spark.sql(sql)
sql_result_3.show()
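# Sketch: the equivalent column addition with withColumn() instead of SQL.
df.withColumn('overtime_work', df['hours'] + 100).show()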
spark = SparkSession.builder.appName("SparkFilter").getOrCreate()
df = spark.read.csv(
'/home/yoshi-1/ダウンロード/spark-with-python-master/items_bought.csv',
header=True,
inferSchema=True)
df.show(4)
df.filter("total_amount > 1500").show()
df.filter((df["item_price"]>1000) & (df['tax_amount']>500)).show()
result_data = df.filter((df['total_amount'] == 1924.74)).collect()
type(result_data)
result_data
len(result_data)
result_data[0].asDict()
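# Sketch: collect() returns a list of Row objects, which support both attribute
# and dictionary-style access; toPandas() converts a whole result to a pandas
# DataFrame (assumes pandas is installed).
first_row = result_data[0]
print(first_row['item_name'], first_row['total_amount'])
pandas_df = df.filter(df['total_amount'] > 1500).toPandas()
print(pandas_df.head())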
spark = SparkSession.builder.appName('sparkGroupBy&Agg').getOrCreate()
df = spark.read.csv(
'/home/yoshi-1/ダウンロード/spark-with-python-master/company_product_revenue.csv',
header=True,
inferSchema=True)
df.show()
df.groupBy('company_name').sum().show()
df.agg({'revenue_sales': 'sum'}).show()
df.groupBy('company_name').max().show()
df.groupBy('company_name').agg({'revenue_sales': 'max'}).show()
df.orderBy('revenue_sales').show()
df.orderBy(df['revenue_sales'].desc()).show()
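# Sketch: several aggregations can be combined in one agg() call using
# pyspark.sql.functions, with readable column aliases.
from pyspark.sql import functions as F
df.groupBy('company_name').agg(
    F.sum('revenue_sales').alias('total_revenue'),
    F.max('revenue_sales').alias('top_product_revenue')
).orderBy(F.col('total_revenue').desc()).show()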
from pyspark.sql.functions import mean, avg, format_number
spark = SparkSession.builder.appName("SparkInbuildFunctions").getOrCreate()
df = spark.read.csv(
'/home/yoshi-1/ダウンロード/spark-with-python-master/company_product_revenue.csv',
header=True,
inferSchema=True
)
df.select(mean("revenue_sales").alias('Mean Revenue Sales')).show()
result_avg = df.select(avg("revenue_sales").alias("Average Revenue Sales"))
print("Average Revenue Sales value is {0}".format(result_avg.head()[0]))
result_avg.show()
result_avg.select(format_number("Average Revenue Sales", 2).alias("Formatted Average")).show()
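# Sketch: the aggregation and the formatting can also be done in a single select.
df.select(format_number(avg('revenue_sales'), 2).alias('Average (2 dp)')).show()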
spark = SparkSession.builder.appName("SparkMisingData").getOrCreate()
df = spark.read.csv(
'/home/yoshi-1/ダウンロード/spark-with-python-master/employee_data.csv',
header=True,
inferSchema=True)
df.show()
print("Data after dropping the rows having null values")
df.na.drop().show()
print('Data after dropping the rows with fewer than 4 non-null values (thresh=4)')
df.na.drop(thresh=4).show()
print('Data after dropping the rows having null values in hours column')
df.na.drop(subset='hours').show()
print('Data after filling the null values in the hours column')
df.na.fill(12, subset='hours').show()
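# Sketch: na.fill() also accepts a dict, so different columns can be filled with
# different defaults in one call (the column/value pairs here are assumptions).
df.na.fill({'hours': 12, 'employee_name': 'unknown'}).show()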
from pyspark.sql.functions import mean
mean_value = df.select(mean('hours'))
mean_value.collect()[0]
mean_value.collect()[0][0]
mean_value = mean_value.collect()[0][0]
df.na.fill(mean_value, subset='hours').show()
df.na.replace("Pichai", "Sundar", subset="employee_name").show()
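# Sketch (assumption): pyspark.ml.feature.Imputer performs the mean-fill above in
# one step; it requires a double/float input column, hence the cast first.
from pyspark.ml.feature import Imputer
df_double = df.withColumn('hours', df['hours'].cast('double'))
imputer = Imputer(inputCols=['hours'], outputCols=['hours_filled'], strategy='mean')
imputer.fit(df_double).transform(df_double).show()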
spark = SparkSession.builder.appName("SparkDateTime").getOrCreate()
df = spark.read.csv('/home/yoshi-1/ダウンロード/spark-with-python-master/items_bought.csv',
header=True,
inferSchema=True)
df.show()
df.printSchema()
from pyspark.sql.functions import unix_timestamp, from_unixtime, to_date
# use the cast() method to convert the column type (string -> timestamp -> date)
updated_df = df.withColumn('formatted_date', to_date(unix_timestamp(df['date'], 'dd-MM-yyyy').cast('timestamp')))
print("Schema with date column string datetype converted to date datatype")
updated_df.show()
updated_df.printSchema()
updated_df = updated_df.drop('date')
updated_df.show(3)
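# Sketch: since Spark 2.2, to_date() accepts the format string directly,
# which avoids the round trip through unix_timestamp().
df.withColumn('formatted_date', to_date(df['date'], 'dd-MM-yyyy')).printSchema()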
from pyspark.sql.functions import weekofyear, dayofmonth, month, year, date_format
print("Data Extraction from dates")
final_df = updated_df.select(updated_df["item_name"],
updated_df["total_amount"],
weekofyear(updated_df["formatted_date"]).alias("week_number"),
dayofmonth(updated_df["formatted_date"]).alias("day_number"),
month(updated_df["formatted_date"]).alias("month"),
year(updated_df["formatted_date"]).alias("year"))
final_df.show()
# convert the date column back to a string in a different date format
date_string_value = updated_df.select(updated_df["item_name"], date_format(updated_df["formatted_date"], 'MM/dd/yyyy'))
date_string_value.show()
date_string_value.printSchema()
final_df.show()
final_df.groupBy("year").sum().show()
print("Usecae - Total amount of items purchased in that particular year")
final_format = final_df.groupBy("year").sum().select(["year", "sum(total_amount)"])
final_format.withColumnRenamed("sum(total_amount)", "Total Expenditure").show()
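# Sketch: persisting the yearly summary as a single CSV file; the output path
# '/tmp/yearly_expenditure' is only an example.
final_format.withColumnRenamed("sum(total_amount)", "total_expenditure") \
    .coalesce(1) \
    .write.mode('overwrite').csv('/tmp/yearly_expenditure', header=True)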