CHAPTER 4 - Basics of Spark DataFrames

In [1]:
# Locate the local Spark installation and put pyspark on sys.path.
# NOTE(review): hardcoded absolute path — breaks on any other machine;
# consider reading it from an environment variable (e.g. SPARK_HOME).
import findspark
findspark.init('/home/yoshi-1/spark-3.1.1-bin-hadoop2.7')
In [2]:
from pyspark.sql import SparkSession
In [3]:
# Create (or reuse) the Spark session. getOrCreate() returns the one
# existing session per JVM, so later builder.appName(...) calls in this
# notebook do NOT actually create new sessions or change the app name.
spark = SparkSession.builder.appName('myFirstSparkSession').getOrCreate()
In [4]:
# Load the employee CSV. header=True takes the first row as column names;
# inferSchema=True makes Spark scan the file to guess column types.
employee_csv_path = '/home/yoshi-1/ダウンロード/spark-with-python-master/employee.csv'
df = spark.read.csv(employee_csv_path, inferSchema=True, header=True)
In [5]:
# Render the DataFrame as an ASCII table (all 4 rows here).
df.show()
+-----------+-------------+---+----------+-----+
|employee_id|employee_name|age|  location|hours|
+-----------+-------------+---+----------+-----+
|       G001|       Pichai| 47|California|   14|
|       M002|         Bill| 64|Washington|   10|
|       A003|         Jeff| 56|Washington|   11|
|       A004|         Cook| 59|California|   12|
+-----------+-------------+---+----------+-----+

In [6]:
# List of column names (plain Python list of str).
df.columns
Out[6]:
['employee_id', 'employee_name', 'age', 'location', 'hours']
In [7]:
# Print the inferred schema (column name, type, nullability).
df.printSchema()
root
 |-- employee_id: string (nullable = true)
 |-- employee_name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- hours: integer (nullable = true)

In [8]:
# Project three columns; select() accepts column names as varargs too.
df.select('employee_name', 'age', 'location').show()
+-------------+---+----------+
|employee_name|age|  location|
+-------------+---+----------+
|       Pichai| 47|California|
|         Bill| 64|Washington|
|         Jeff| 56|Washington|
|         Cook| 59|California|
+-------------+---+----------+

In [9]:
# Add a derived column (3x the hours value); bracket access avoids
# clashes with DataFrame attribute names.
df_new = df.withColumn('overtime_time', df['hours'] * 3)
In [10]:
# Show the frame with the new overtime_time column.
df_new.show()
+-----------+-------------+---+----------+-----+-------------+
|employee_id|employee_name|age|  location|hours|overtime_time|
+-----------+-------------+---+----------+-----+-------------+
|       G001|       Pichai| 47|California|   14|           42|
|       M002|         Bill| 64|Washington|   10|           30|
|       A003|         Jeff| 56|Washington|   11|           33|
|       A004|         Cook| 59|California|   12|           36|
+-----------+-------------+---+----------+-----+-------------+

In [11]:
# Rename hours -> working_hours (returns a new DataFrame; df is untouched).
df_rename = df.withColumnRenamed('hours', 'working_hours')
In [12]:
# Show the renamed frame.
df_rename.show()
+-----------+-------------+---+----------+-------------+
|employee_id|employee_name|age|  location|working_hours|
+-----------+-------------+---+----------+-------------+
|       G001|       Pichai| 47|California|           14|
|       M002|         Bill| 64|Washington|           10|
|       A003|         Jeff| 56|Washington|           11|
|       A004|         Cook| 59|California|           12|
+-----------+-------------+---+----------+-------------+

In [13]:
# Drop the working_hours column. BUG in original: it called
# df.drop("working_hours"), but `df` has no such column (it is named
# "hours" there), and Spark silently no-ops when dropping a missing
# column — so nothing was dropped. The renamed frame `df_rename` is the
# one that actually carries "working_hours".
df_new = df_rename.drop("working_hours")
In [14]:
# First 3 rows as a list of Row objects (driver-side).
df.head(3)
Out[14]:
[Row(employee_id='G001', employee_name='Pichai', age=47, location='California', hours=14),
 Row(employee_id='M002', employee_name='Bill', age=64, location='Washington', hours=10),
 Row(employee_id='A003', employee_name='Jeff', age=56, location='Washington', hours=11)]
In [15]:
# Summary stats (count/mean/stddev/min/max); min/max also apply to strings.
df.describe().show()
+-------+-----------+-------------+----------------+----------+-----------------+
|summary|employee_id|employee_name|             age|  location|            hours|
+-------+-----------+-------------+----------------+----------+-----------------+
|  count|          4|            4|               4|         4|                4|
|   mean|       null|         null|            56.5|      null|            11.75|
| stddev|       null|         null|7.14142842854285|      null|1.707825127659933|
|    min|       A003|         Bill|              47|California|               10|
|    max|       M002|       Pichai|              64|Washington|               14|
+-------+-----------+-------------+----------------+----------+-----------------+

SQLによる抽出

In [16]:
# Create the Spark session (getOrCreate() reuses the existing session,
# so the new appName has no effect here).
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()
In [17]:
# Load the employee data (header row as names, types inferred).
df = spark.read.csv(
    '/home/yoshi-1/ダウンロード/spark-with-python-master/employee.csv',
    header=True,
    inferSchema=True)
In [18]:
# Register df as a temporary SQL view named "associates" so it can be
# queried with spark.sql(); the view lives only for this session.
df.createOrReplaceTempView("associates")
In [19]:
# Extract with SQL: select every column from the temp view.
# (Original read "SELECT* FROM" — Spark's parser tolerates the missing
# space, but it is non-standard; fixed for readability and consistency
# with the query in In[21].)
sql_result_1 = spark.sql("SELECT * FROM associates")
In [20]:
# Show the SQL query result.
sql_result_1.show()
+-----------+-------------+---+----------+-----+
|employee_id|employee_name|age|  location|hours|
+-----------+-------------+---+----------+-----+
|       G001|       Pichai| 47|California|   14|
|       M002|         Bill| 64|Washington|   10|
|       A003|         Jeff| 56|Washington|   11|
|       A004|         Cook| 59|California|   12|
+-----------+-------------+---+----------+-----+

In [21]:
# SQL extraction with a WHERE clause: ages 45-60 (inclusive) in California.
sql_result_2 = spark.sql("SELECT * FROM associates WHERE age BETWEEN 45 AND 60 AND location='California'")
In [22]:
# Show the filtered result (2 matching rows).
sql_result_2.show()
+-----------+-------------+---+----------+-----+
|employee_id|employee_name|age|  location|hours|
+-----------+-------------+---+----------+-----+
|       G001|       Pichai| 47|California|   14|
|       A004|         Cook| 59|California|   12|
+-----------+-------------+---+----------+-----+

In [23]:
# Confirm the original df is unchanged by the SQL queries.
df.show()
+-----------+-------------+---+----------+-----+
|employee_id|employee_name|age|  location|hours|
+-----------+-------------+---+----------+-----+
|       G001|       Pichai| 47|California|   14|
|       M002|         Bill| 64|Washington|   10|
|       A003|         Jeff| 56|Washington|   11|
|       A004|         Cook| 59|California|   12|
+-----------+-------------+---+----------+-----+

In [24]:
# Add a derived column via SQL: hours + 100, aliased as overtime_work.
sql = '''
    SELECT
        employee_id,
        employee_name,
        age,
        hours,
        hours + 100 as overtime_work
    FROM
        associates
'''
In [25]:
# Run the multi-line query defined above.
sql_result_3 = spark.sql(sql)
In [26]:
# Show the result with the computed overtime_work column.
sql_result_3.show()
+-----------+-------------+---+-----+-------------+
|employee_id|employee_name|age|hours|overtime_work|
+-----------+-------------+---+-----+-------------+
|       G001|       Pichai| 47|   14|          114|
|       M002|         Bill| 64|   10|          110|
|       A003|         Jeff| 56|   11|          111|
|       A004|         Cook| 59|   12|          112|
+-----------+-------------+---+-----+-------------+

In [ ]:
 

Filtering

In [28]:
# Reuses the existing session (appName is ignored after the first create).
spark = SparkSession.builder.appName("SparkFilter").getOrCreate()
In [29]:
# Load the purchases CSV and preview the first 4 rows.
df = spark.read.csv(
        '/home/yoshi-1/ダウンロード/spark-with-python-master/items_bought.csv',
        header=True,
        inferSchema=True)
df.show(4)
+----------+---------+----------+--------+----------+------------+
|      date|item_name|item_price|quantity|tax_amount|total_amount|
+----------+---------+----------+--------+----------+------------+
|11-10-2018|     Beer|     110.5|       2|     53.04|      163.54|
|14-02-2018|   Whisky|    1250.0|       1|     300.0|      1550.0|
|23-03-2020|   Whisky|    1300.5|       2|    624.24|     1924.74|
|05-10-2018|      Rum|     550.0|       2|     264.0|       814.0|
+----------+---------+----------+--------+----------+------------+
only showing top 4 rows

In [30]:
# filter() accepts a SQL expression string as the predicate.
df.filter("total_amount > 1500").show()
+----------+---------+----------+--------+----------+------------+
|      date|item_name|item_price|quantity|tax_amount|total_amount|
+----------+---------+----------+--------+----------+------------+
|14-02-2018|   Whisky|    1250.0|       1|     300.0|      1550.0|
|23-03-2020|   Whisky|    1300.5|       2|    624.24|     1924.74|
+----------+---------+----------+--------+----------+------------+

In [31]:
# Compound filter: both Column conditions must hold. Note the bitwise &
# (not `and`) and the parentheses around each comparison.
price_over_1000 = df['item_price'] > 1000
tax_over_500 = df['tax_amount'] > 500
df.filter(price_over_1000 & tax_over_500).show()
+----------+---------+----------+--------+----------+------------+
|      date|item_name|item_price|quantity|tax_amount|total_amount|
+----------+---------+----------+--------+----------+------------+
|23-03-2020|   Whisky|    1300.5|       2|    624.24|     1924.74|
+----------+---------+----------+--------+----------+------------+

In [32]:
# collect() pulls the matching rows to the driver as a Python list of Rows.
result_data = df.filter((df['total_amount'] == 1924.74)).collect()
In [33]:
# collect() returns a plain Python list.
type(result_data)
Out[33]:
list
In [34]:
# Inspect the collected Row objects.
result_data
Out[34]:
[Row(date='23-03-2020', item_name='Whisky', item_price=1300.5, quantity=2, tax_amount=624.24, total_amount=1924.74)]
In [35]:
# Number of rows that matched the filter.
len(result_data)
Out[35]:
1
In [36]:
# Convert a Row to an ordinary dict for plain-Python processing.
result_data[0].asDict()
Out[36]:
{'date': '23-03-2020',
 'item_name': 'Whisky',
 'item_price': 1300.5,
 'quantity': 2,
 'tax_amount': 624.24,
 'total_amount': 1924.74}
In [ ]:
 

groupby / aggregate / orderby

In [37]:
# Reuses the existing session; the appName string has no effect here.
spark = SparkSession.builder.appName('sparkGroupBy&Agg').getOrCreate()
In [38]:
# Load per-product revenue data (company_name, product_name, revenue_sales).
df = spark.read.csv(
        '/home/yoshi-1/ダウンロード/spark-with-python-master/company_product_revenue.csv',
        header=True,
        inferSchema=True)
In [39]:
# Show all 14 rows.
df.show()
+-------------+------------+-------------+
| company_name|product_name|revenue_sales|
+-------------+------------+-------------+
|         Audi|          A4|          450|
|Mercedes Benz|     G Class|         1200|
|          BMW|          X1|          425|
|     Mahindra|     XUV 500|          850|
|          Kia|      Seltos|          690|
|         Audi|          A6|          600|
|         Audi|          Q5|          725|
|         Audi|          Q7|          500|
|          BMW|          X3|          700|
|          BMW|          X5|          850|
|Mercedes Benz|     C Class|          470|
|Mercedes Benz|         GLS|          900|
|          Kia|    Carnival|          450|
|     Mahindra|     XUV 300|          790|
+-------------+------------+-------------+

In [40]:
# Total revenue per company; sum() applies to every numeric column.
df.groupBy('company_name').sum().show()
+-------------+------------------+
| company_name|sum(revenue_sales)|
+-------------+------------------+
|          Kia|              1140|
|         Audi|              2275|
|     Mahindra|              1640|
|          BMW|              1975|
|Mercedes Benz|              2570|
+-------------+------------------+

In [41]:
# Whole-table aggregate (no grouping): grand total of revenue_sales.
df.agg({'revenue_sales': 'sum'}).show()
+------------------+
|sum(revenue_sales)|
+------------------+
|              9600|
+------------------+

In [42]:
# Highest revenue per company across all numeric columns.
df.groupBy('company_name').max().show()
+-------------+------------------+
| company_name|max(revenue_sales)|
+-------------+------------------+
|          Kia|               690|
|         Audi|               725|
|     Mahindra|               850|
|          BMW|               850|
|Mercedes Benz|              1200|
+-------------+------------------+

In [43]:
# Same result as above, but agg() lets you target a specific column.
df.groupBy('company_name').agg({'revenue_sales': 'max'}).show()
+-------------+------------------+
| company_name|max(revenue_sales)|
+-------------+------------------+
|          Kia|               690|
|         Audi|               725|
|     Mahindra|               850|
|          BMW|               850|
|Mercedes Benz|              1200|
+-------------+------------------+

In [44]:
# Sort ascending by revenue_sales (default order).
df.orderBy('revenue_sales').show()
+-------------+------------+-------------+
| company_name|product_name|revenue_sales|
+-------------+------------+-------------+
|          BMW|          X1|          425|
|         Audi|          A4|          450|
|          Kia|    Carnival|          450|
|Mercedes Benz|     C Class|          470|
|         Audi|          Q7|          500|
|         Audi|          A6|          600|
|          Kia|      Seltos|          690|
|          BMW|          X3|          700|
|         Audi|          Q5|          725|
|     Mahindra|     XUV 300|          790|
|          BMW|          X5|          850|
|     Mahindra|     XUV 500|          850|
|Mercedes Benz|         GLS|          900|
|Mercedes Benz|     G Class|         1200|
+-------------+------------+-------------+

In [45]:
# Sort descending via the Column.desc() modifier.
df.orderBy(df['revenue_sales'].desc()).show()
+-------------+------------+-------------+
| company_name|product_name|revenue_sales|
+-------------+------------+-------------+
|Mercedes Benz|     G Class|         1200|
|Mercedes Benz|         GLS|          900|
|     Mahindra|     XUV 500|          850|
|          BMW|          X5|          850|
|     Mahindra|     XUV 300|          790|
|         Audi|          Q5|          725|
|          BMW|          X3|          700|
|          Kia|      Seltos|          690|
|         Audi|          A6|          600|
|         Audi|          Q7|          500|
|Mercedes Benz|     C Class|          470|
|         Audi|          A4|          450|
|          Kia|    Carnival|          450|
|          BMW|          X1|          425|
+-------------+------------+-------------+

Using Standard Functions

In [46]:
from pyspark.sql.functions import mean, avg, format_number
In [47]:
# Reuses the existing session (appName ignored after first create).
spark = SparkSession.builder.appName("SparkInbuildFunctions").getOrCreate()
In [48]:
# Reload the revenue data for the built-in functions examples.
df = spark.read.csv(
    '/home/yoshi-1/ダウンロード/spark-with-python-master/company_product_revenue.csv',
    header=True,
    inferSchema=True
    )
In [49]:
# mean() from pyspark.sql.functions; alias() renames the result column.
df.select(mean("revenue_sales").alias('Mean Revenue Sales')).show()
+------------------+
|Mean Revenue Sales|
+------------------+
| 685.7142857142857|
+------------------+

In [50]:
# avg() is an alias of mean(); keep the one-row result frame for reuse.
result_avg = df.select(avg("revenue_sales").alias("Average Revenue Sales"))
In [51]:
# head() pulls the single Row to the driver; [0] extracts the scalar value.
print("Average Revenue Sales value is {0}".format(result_avg.head()[0]))
Average Revenue Sales value is 685.7142857142857
In [52]:
# Show the one-row aggregate frame.
result_avg.show()
+---------------------+
|Average Revenue Sales|
+---------------------+
|    685.7142857142857|
+---------------------+

In [53]:
# format_number(col, 2) rounds to 2 decimal places and returns a string.
result_avg.select(format_number("Average Revenue Sales", 2).alias("Formatted Average")).show()
+-----------------+
|Formatted Average|
+-----------------+
|           685.71|
+-----------------+

In [ ]:
 

Dealing with Missing Data in Spark DataFrames

Dropping the rows or data points that contain null values

In [54]:
# Fix typo in the app name ("SparkMisingData" -> "SparkMissingData").
# getOrCreate() reuses the existing session anyway, so this is cosmetic.
spark = SparkSession.builder.appName("SparkMissingData").getOrCreate()
In [55]:
# Load employee data that deliberately contains nulls.
df = spark.read.csv(
    '/home/yoshi-1/ダウンロード/spark-with-python-master/employee_data.csv',
    header=True,
    inferSchema=True)
In [56]:
# Show the data — three of the four rows contain nulls.
df.show()
+-----------+-------------+----+----------+-----+
|employee_id|employee_name| age|  location|hours|
+-----------+-------------+----+----------+-----+
|       G001|       Pichai|  47|California|   14|
|       M002|         Bill|  64|Washington| null|
|       A003|         Jeff|  56|      null| null|
|       A004|         null|null|      null|   12|
+-----------+-------------+----+----------+-----+

In [57]:
# na.drop() with defaults (how='any') removes every row containing any null.
print("Data after dropping the rows having null values")
df.na.drop().show()
Data after dropping the rows having null values
+-----------+-------------+---+----------+-----+
|employee_id|employee_name|age|  location|hours|
+-----------+-------------+---+----------+-----+
|       G001|       Pichai| 47|California|   14|
+-----------+-------------+---+----------+-----+

In [58]:
# thresh=4 KEEPS rows with at least 4 non-null values and drops the rest.
# The original message said the opposite ("dropping the rows having
# atleast 4 non-null values") and contained typos — fixed.
print('Data after keeping only the rows having at least 4 non-null values')
df.na.drop(thresh=4).show()
Data after droppingthe rows having atleast 4 non-null values
+-----------+-------------+---+----------+-----+
|employee_id|employee_name|age|  location|hours|
+-----------+-------------+---+----------+-----+
|       G001|       Pichai| 47|California|   14|
|       M002|         Bill| 64|Washington| null|
+-----------+-------------+---+----------+-----+

In [60]:
# Drop only rows whose 'hours' value is null (other nulls are kept).
# NOTE(review): the API documents subset as a list of column names;
# a bare string works here but a list is the documented form.
print('Data after dropping the rows having null values in hours column')
df.na.drop(subset='hours').show()
Data after dropping the rows having null values in hours column
+-----------+-------------+----+----------+-----+
|employee_id|employee_name| age|  location|hours|
+-----------+-------------+----+----------+-----+
|       G001|       Pichai|  47|California|   14|
|       A004|         null|null|      null|   12|
+-----------+-------------+----+----------+-----+

In [59]:
# Fill nulls in the 'hours' column with a constant (12).
# Fixed the "Datya" typo in the printed message.
print('Data after filling the rows having null values in hours columns')
df.na.fill(12, subset='hours').show()
Datya after filling the rows having null values in hours columns
+-----------+-------------+----+----------+-----+
|employee_id|employee_name| age|  location|hours|
+-----------+-------------+----+----------+-----+
|       G001|       Pichai|  47|California|   14|
|       M002|         Bill|  64|Washington|   12|
|       A003|         Jeff|  56|      null|   12|
|       A004|         null|null|      null|   12|
+-----------+-------------+----+----------+-----+

In [ ]:
from pyspark.sql.functions import mean
In [61]:
# Lazy one-row DataFrame holding avg(hours) — not yet a number.
mean_value = df.select(mean('hours'))
In [62]:
# collect() triggers the computation; [0] is the single Row.
mean_value.collect()[0]
Out[62]:
Row(avg(hours)=13.0)
In [63]:
# Second index extracts the scalar (13.0) from the Row.
mean_value.collect()[0][0]
Out[63]:
13.0
In [64]:
# NOTE(review): reuses the name `mean_value` for a different kind of
# value (DataFrame -> float). Re-running this cell twice fails; a fresh
# name (e.g. mean_hours) would be safer, but later cells read this one.
mean_value = mean_value.collect()[0][0]
In [65]:
# Fill nulls in 'hours' with the mean; the float 13.0 is shown as 13
# because the column is an integer column.
df.na.fill(mean_value, subset='hours').show()
+-----------+-------------+----+----------+-----+
|employee_id|employee_name| age|  location|hours|
+-----------+-------------+----+----------+-----+
|       G001|       Pichai|  47|California|   14|
|       M002|         Bill|  64|Washington|   13|
|       A003|         Jeff|  56|      null|   13|
|       A004|         null|null|      null|   12|
+-----------+-------------+----+----------+-----+

In [66]:
# Replace a specific value (not nulls) in one column; returns a new frame.
df.na.replace("Pichai", "Sundar", subset="employee_name").show()
+-----------+-------------+----+----------+-----+
|employee_id|employee_name| age|  location|hours|
+-----------+-------------+----+----------+-----+
|       G001|       Sundar|  47|California|   14|
|       M002|         Bill|  64|Washington| null|
|       A003|         Jeff|  56|      null| null|
|       A004|         null|null|      null|   12|
+-----------+-------------+----+----------+-----+

In [ ]:
 

Working with Date & Time in Spark Dataframe

In [3]:
# Reuses the existing session (appName ignored after first create).
spark = SparkSession.builder.appName("SparkDateTime").getOrCreate()
In [4]:
# Reload the purchases CSV; the 'date' column is inferred as a string.
df = spark.read.csv('/home/yoshi-1/ダウンロード/spark-with-python-master/items_bought.csv',
                   header=True,
                   inferSchema=True)
In [5]:
# Show all 9 purchase rows.
df.show()
+----------+---------+----------+--------+----------+------------+
|      date|item_name|item_price|quantity|tax_amount|total_amount|
+----------+---------+----------+--------+----------+------------+
|11-10-2018|     Beer|     110.5|       2|     53.04|      163.54|
|14-02-2018|   Whisky|    1250.0|       1|     300.0|      1550.0|
|23-03-2020|   Whisky|    1300.5|       2|    624.24|     1924.74|
|05-10-2018|      Rum|     550.0|       2|     264.0|       814.0|
|07-05-2019|      Rum|     555.0|       2|     266.4|       821.4|
|06-10-2020|      Rum|     590.0|       3|     424.8|      1014.8|
|22-03-2020|      Gin|     400.0|       1|      96.0|       496.0|
|11-10-2019|     Beer|     130.0|       4|     124.8|       254.8|
|14-02-2020|    Vodka|     750.0|       2|     360.0|      1110.0|
+----------+---------+----------+--------+----------+------------+

In [6]:
# Confirm 'date' came in as string — we convert it below.
df.printSchema()
root
 |-- date: string (nullable = true)
 |-- item_name: string (nullable = true)
 |-- item_price: double (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- tax_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)

In [7]:
from pyspark.sql.functions import unix_timestamp, from_unixtime, to_date
In [8]:
# Convert the dd-MM-yyyy string column to a proper date column.
# Simplified from the original unix_timestamp(...).cast('timestamp')
# roundtrip: to_date(col, format) (Spark >= 2.2) parses the string
# directly and yields the same dates for valid inputs.
updated_df = df.withColumn('formatted_date', to_date(df['date'], 'dd-MM-yyyy'))
In [9]:
# Show the frame with the new date-typed column appended.
print("Schema with date column string datetype converted to date datatype")
updated_df.show()
Schema with date column string datetype converted to date datatype
+----------+---------+----------+--------+----------+------------+--------------+
|      date|item_name|item_price|quantity|tax_amount|total_amount|formatted_date|
+----------+---------+----------+--------+----------+------------+--------------+
|11-10-2018|     Beer|     110.5|       2|     53.04|      163.54|    2018-10-11|
|14-02-2018|   Whisky|    1250.0|       1|     300.0|      1550.0|    2018-02-14|
|23-03-2020|   Whisky|    1300.5|       2|    624.24|     1924.74|    2020-03-23|
|05-10-2018|      Rum|     550.0|       2|     264.0|       814.0|    2018-10-05|
|07-05-2019|      Rum|     555.0|       2|     266.4|       821.4|    2019-05-07|
|06-10-2020|      Rum|     590.0|       3|     424.8|      1014.8|    2020-10-06|
|22-03-2020|      Gin|     400.0|       1|      96.0|       496.0|    2020-03-22|
|11-10-2019|     Beer|     130.0|       4|     124.8|       254.8|    2019-10-11|
|14-02-2020|    Vodka|     750.0|       2|     360.0|      1110.0|    2020-02-14|
+----------+---------+----------+--------+----------+------------+--------------+

In [10]:
# formatted_date is now a true date column.
updated_df.printSchema()
root
 |-- date: string (nullable = true)
 |-- item_name: string (nullable = true)
 |-- item_price: double (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- tax_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- formatted_date: date (nullable = true)

In [11]:
# Drop the raw string column now that formatted_date replaces it.
updated_df = updated_df.drop('date')
In [12]:
# Quick 3-row preview after the drop.
updated_df.show(3)
+---------+----------+--------+----------+------------+--------------+
|item_name|item_price|quantity|tax_amount|total_amount|formatted_date|
+---------+----------+--------+----------+------------+--------------+
|     Beer|     110.5|       2|     53.04|      163.54|    2018-10-11|
|   Whisky|    1250.0|       1|     300.0|      1550.0|    2018-02-14|
|   Whisky|    1300.5|       2|    624.24|     1924.74|    2020-03-23|
+---------+----------+--------+----------+------------+--------------+
only showing top 3 rows

In [13]:
from pyspark.sql.functions import weekofyear, dayofmonth, month, year, date_format
In [22]:
# Pull week-of-year, day-of-month, month and year out of formatted_date.
print("Data Extraction from dates")
final_df = updated_df.select(updated_df["item_name"],
                            updated_df["total_amount"],
                            weekofyear(updated_df["formatted_date"]).alias("week_number"),
                            dayofmonth(updated_df["formatted_date"]).alias("day_number"),
                            month(updated_df["formatted_date"]).alias("month"),
                            year(updated_df["formatted_date"]).alias("year"))
Data Extraction from dates
In [23]:
# Show the extracted date-part columns.
final_df.show()
+---------+------------+-----------+----------+-----+----+
|item_name|total_amount|week_number|day_number|month|year|
+---------+------------+-----------+----------+-----+----+
|     Beer|      163.54|         41|        11|   10|2018|
|   Whisky|      1550.0|          7|        14|    2|2018|
|   Whisky|     1924.74|         13|        23|    3|2020|
|      Rum|       814.0|         40|         5|   10|2018|
|      Rum|       821.4|         19|         7|    5|2019|
|      Rum|      1014.8|         41|         6|   10|2020|
|      Gin|       496.0|         12|        22|    3|2020|
|     Beer|       254.8|         41|        11|   10|2019|
|    Vodka|      1110.0|          7|        14|    2|2020|
+---------+------------+-----------+----------+-----+----+

In [17]:
# converting date type to a different date format string
date_string_value = updated_df.select(df["item_name"], date_format(updated_df["formatted_date"], 'MM/dd/yyyy'))
In [18]:
# Show the reformatted date strings (column keeps the default long name).
date_string_value.show()
+---------+---------------------------------------+
|item_name|date_format(formatted_date, MM/dd/yyyy)|
+---------+---------------------------------------+
|     Beer|                             10/11/2018|
|   Whisky|                             02/14/2018|
|   Whisky|                             03/23/2020|
|      Rum|                             10/05/2018|
|      Rum|                             05/07/2019|
|      Rum|                             10/06/2020|
|      Gin|                             03/22/2020|
|     Beer|                             10/11/2019|
|    Vodka|                             02/14/2020|
+---------+---------------------------------------+

In [19]:
# date_format() returns a string column.
date_string_value.printSchema()
root
 |-- item_name: string (nullable = true)
 |-- date_format(formatted_date, MM/dd/yyyy): string (nullable = true)

In [24]:
# Re-display final_df before aggregating by year.
final_df.show()
+---------+------------+-----------+----------+-----+----+
|item_name|total_amount|week_number|day_number|month|year|
+---------+------------+-----------+----------+-----+----+
|     Beer|      163.54|         41|        11|   10|2018|
|   Whisky|      1550.0|          7|        14|    2|2018|
|   Whisky|     1924.74|         13|        23|    3|2020|
|      Rum|       814.0|         40|         5|   10|2018|
|      Rum|       821.4|         19|         7|    5|2019|
|      Rum|      1014.8|         41|         6|   10|2020|
|      Gin|       496.0|         12|        22|    3|2020|
|     Beer|       254.8|         41|        11|   10|2019|
|    Vodka|      1110.0|          7|        14|    2|2020|
+---------+------------+-----------+----------+-----+----+

In [25]:
# sum() over every numeric column — sum(month)/sum(year) etc. are
# meaningless; the next cell projects just the useful columns.
final_df.groupBy("year").sum().show()
+----+-----------------+----------------+---------------+----------+---------+
|year|sum(total_amount)|sum(week_number)|sum(day_number)|sum(month)|sum(year)|
+----+-----------------+----------------+---------------+----------+---------+
|2018|          2527.54|              88|             30|        22|     6054|
|2019|           1076.2|              60|             18|        15|     4038|
|2020|          4545.54|              73|             65|        18|     8080|
+----+-----------------+----------------+---------------+----------+---------+

In [26]:
# Use case: total spend per year, with a readable column name.
# Fixed the "Usecae" typo in the printed message.
print("Use case - Total amount of items purchased in that particular year")
final_format = final_df.groupBy("year").sum().select(["year", "sum(total_amount)"])
final_format.withColumnRenamed("sum(total_amount)", "Total Expenditure").show()
Usecae - Total amount of items purchased in that particular year
+----+-----------------+
|year|Total Expenditure|
+----+-----------------+
|2018|          2527.54|
|2019|           1076.2|
|2020|          4545.54|
+----+-----------------+

In [ ]: