I am using PySpark with Flask to build a web service.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from flask import Flask, jsonify
from pyspark import SparkFiles
from pyspark.ml import PipelineModel
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType, StringType
app = Flask(__name__)
# DEFINE SPARK SESSION
spark = SparkSession \
    .builder \
    .appName("app") \
    .master("<master>") \
    .config("spark.cores.max", 4) \
    .config("spark.executor.memory", "6g") \
    .getOrCreate()
# LOAD THE REQUIRED FILES
hdfsUrl = "<hdfs-url>"  # placeholder for the base HDFS path where the models and parquet files live
modelEnglish = PipelineModel.load(hdfsUrl + "model-english")
ticketEnglishDf = spark.read.parquet(hdfsUrl + "ticket-df-english.parquet").cache()
modelFrench = PipelineModel.load(hdfsUrl + "model-french")
ticketFrenchDf = spark.read.parquet(hdfsUrl + "ticket-df-french.parquet").cache()
def getSuggestions(ticketId, top=10):
    # ...
    return ...
@app.route("/suggest/<int:ticketId>")
def suggest(ticketId):
    response = {"id": ticketId, "suggestions": getSuggestions(ticketId)}
    return jsonify(response)
if __name__ == "__main__":
    app.run(debug=True, host="127.0.0.1", port=2793, threaded=True)
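For reference, the service is called like this, using the requests library (the ticket id 123 below is just an example):

import requests

# Query the running service; 123 is an example ticket id.
resp = requests.get("http://127.0.0.1:2793/suggest/123")
print(resp.json())  # {"id": 123, "suggestions": [...]}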
This works well when a request is sent to the server, but the Spark jobs are duplicated... and I don't know why.
I have already tried creating the Spark session inside the if __name__ == "__main__": block.
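Concretely, that attempt looked roughly like this (same imports as above, with the model loading also moved under the guard):

# Variant already tried: create the SparkSession only when the module
# is run directly, not at import time.
if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("app") \
        .master("<master>") \
        .config("spark.cores.max", 4) \
        .config("spark.executor.memory", "6g") \
        .getOrCreate()
    # ... load the models and DataFrames here, as above ...
    app.run(debug=True, host="127.0.0.1", port=2793, threaded=True)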