# prepare for installation of pyspark by findspark
import findspark
findspark.init('/home/yoshi-1/spark-3.1.1-bin-hadoop2.7')
from pyspark.sql import SparkSession
# Start (or reuse) a local Spark session for the clustering job.
# (fixed typo in the app name: "Clustring" -> "Clustering")
spark = SparkSession.builder.appName('K-MeansClustering').getOrCreate()

# Load user coordinates; inferSchema parses latitude/longitude as numeric columns.
# Assumes the CSV has 'latitude' and 'longitude' headers -- TODO confirm against the file.
data = spark.read.csv('latitude_longitude.csv', header=True, inferSchema=True)
data.show(4)
# count() returns a value; the bare expression discarded it (notebook leftover),
# so print it explicitly for script use.
print("Number of records: {}".format(data.count()))
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

# Combine the two coordinate columns into a single 'features' vector column,
# the input format pyspark.ml estimators expect.
assembler = VectorAssembler(inputCols=['latitude', 'longitude'], outputCol='features')
final_assembled_data = assembler.transform(data)
print("Consolidated Data with features")
final_assembled_data.show(4)

# Since Initial Data is well scaled, we can pass it directly to our K-Means
kmeans = KMeans(featuresCol='features', k=3)
kmeans_model = kmeans.fit(final_assembled_data)
# transform() appends a 'prediction' column with each row's cluster index.
predictions = kmeans_model.transform(final_assembled_data)
# Print the header BEFORE the data it labels (it originally appeared after).
print("Prediction Data")
predictions.show(4)
# Determining the centroids of the clusters: each centre is a
# [latitude, longitude] array, i.e. a proposed tower location.
centres = kmeans_model.clusterCenters()
print("The company can setup 3 of their towers at these locations- latitudes and longitudes for optimal network coverage")
# enumerate() replaces the manual counter; start=1 keeps 1-based labels.
# (The original loop body was unindented -- an IndentationError -- and an
# unused 'cluster_list' has been dropped; typo "tghe" fixed in the message.)
for i, centre in enumerate(centres, start=1):
    print("{} - {}".format(i, centre))
print("\nDetermining the number of users that belongs to each clusters")
predictions.groupBy('prediction').count().show()
from pyspark.ml.evaluation import ClusteringEvaluator

# Silhouette score ranges from -1 to 1; values near 1 mean well-separated clusters.
evaluator_object = ClusteringEvaluator(predictionCol='prediction', featuresCol='features')
silhouette_score = evaluator_object.evaluate(predictions)
print("The Silhouette Score when k=3 is {}".format(silhouette_score))
# NOTE: KMeansModel.computeCost() was removed in Spark 3.x (this script targets
# 3.1.1), so the old WSSSE print could never run; use
# kmeans_model.summary.trainingCost if the within-set sum of squares is needed.
from pyspark.ml.feature import StandardScaler

# Standardise the feature vector (per-feature unit variance) before re-clustering.
# Renamed "scalar_*" -> "scaler_*": StandardScaler rescales features; it is not a scalar.
scaler = StandardScaler(inputCol='features', outputCol='ScaledFeatures')
scaler_model = scaler.fit(final_assembled_data)
final_scaled_data = scaler_model.transform(final_assembled_data)
print("Consolidated Data with Scaled Features")
final_scaled_data.show(4)

# Re-run K-Means on the scaled features with a larger k for comparison.
scaled_kmeans = KMeans(featuresCol='ScaledFeatures', k=5)
scaled_kmeans_model = scaled_kmeans.fit(final_scaled_data)
scaled_predictions = scaled_kmeans_model.transform(final_scaled_data)
print("Prediction Data")
scaled_predictions.select('latitude', 'longitude', 'ScaledFeatures', 'prediction').show(4)

scaled_centres = scaled_kmeans_model.clusterCenters()
print("Scaled Tower Locations \n{}".format(scaled_centres))

# Evaluate in the SAME feature space the model was trained on: the original
# code reused the k=3 evaluator bound to 'features', so the k=5 silhouette was
# computed on unscaled data even though clustering used 'ScaledFeatures'.
scaled_evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='ScaledFeatures')
scaled_silhouette_score = scaled_evaluator.evaluate(scaled_predictions)
print("\nThe Silhouette Score when k=5 is {}".format(scaled_silhouette_score))
print("Determining the number of users that belongs to each clusters")
scaled_predictions.groupBy('prediction').count().show()