    import os
    import sys

    # Point this Python session at the Spark installation so pyspark imports.
    os.chdir("/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/bin")
    if 'SPARK_HOME' not in os.environ:
        os.environ['SPARK_HOME'] = '/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7'
    SPARK_HOME = os.environ['SPARK_HOME']
    sys.path.insert(0, os.path.join(SPARK_HOME, "python"))
    sys.path.insert(0, os.path.join(SPARK_HOME, "python", "lib"))
    sys.path.insert(0, os.path.join(SPARK_HOME, "python", "lib", "pyspark.zip"))
    sys.path.insert(0, os.path.join(SPARK_HOME, "python", "lib", "py4j-0.9-src.zip"))

    from pyspark import SparkContext
    from pyspark import SparkConf

    # Optionally configure Spark settings.
    conf = SparkConf()
    conf.set("spark.executor.memory", "1g")
    conf.set("spark.cores.max", "2")

    conf.setAppName("V2 Maestros")

    # Initialize the SparkContext. Run this only once, otherwise you get a
    # "multiple SparkContexts" error.
    sc = SparkContext('local', conf=conf)

    # Test to make sure everything works.
    lines = sc.textFile("auto-data.csv")
    lines.count()

This is the error that occurred. It was a simple program counting the number of entries in the file, but this error came up. I have placed the file in both locations mentioned in the code, yet the result is the same.

Py4JJavaError                             Traceback (most recent call last)
<ipython-input-6-5c9242495358> in <module>()
      1 lines = sc.textFile("auto-save.csv")
----> 2 lines.count()

/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/pyspark/rdd.pyc in count(self)
   1006         3
   1007         """
-> 1008         return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
   1009 
   1010     def stats(self):

/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/pyspark/rdd.pyc in sum(self)
    997         6.0
    998         """
--> 999         return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
   1000 
   1001     def count(self):

/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/pyspark/rdd.pyc in fold(self, zeroValue, op)
    871         # zeroValue provided to each partition is unique from the one provided
    872         # to the final reduce call
--> 873         vals = self.mapPartitions(func).collect()
    874         return reduce(op, vals, zeroValue)
    875 

/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/pyspark/rdd.pyc in collect(self)
    774         """
    775         with SCCallSiteSync(self.context) as css:
--> 776             port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
    777         return list(_load_from_socket(port, self._jrdd_deserializer))
    778 

/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
    931         answer = self.gateway_client.send_command(command)
    932         return_value = get_return_value(
--> 933             answer, self.gateway_client, self.target_id, self.name)
    934 
    935         for temp_arg in temp_args:

/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/pyspark/sql/utils.pyc in deco(*a, **kw)
     61     def deco(*a, **kw):
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()

/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/lib/py4j-0.10.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    310                 raise Py4JJavaError(
    311                     "An error occurred while calling {0}{1}{2}.\n".
--> 312                     format(target_id, ".", name), value)
    313             else:
    314                 raise Py4JError(

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/auto-save.csv
    at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:287)
    at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
    at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
    at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:200)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:246)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:246)
    at org.apache.spark.api.python.PythonRDD.getPartitions(PythonRDD.scala:53)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:246)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1911)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:893)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:892)
    at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:453)
    at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:497)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:280)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:211)
    at java.lang.Thread.run(Thread.java:745)
  • I think the error message is pretty self-explanatory. Spark is looking for a file on your local filesystem at /home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/auto-save.csv and it isn't finding one. In general, it's better to use an absolute path in the textFile function (see the sketch after these comments). Commented Oct 5, 2016 at 22:27
  • Any solution to this problem yet? I am facing the same issue! Commented Feb 1, 2019 at 11:08
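
Following the first comment's advice, here is a minimal sketch (assuming `sc` is the SparkContext created in the question, and that the CSV sits in the Spark bin directory per the question's os.chdir call; adjust the path to wherever the file really is):

    import os

    # The location is an assumption taken from the question's setup code.
    csv_path = "/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/bin/auto-data.csv"

    # Fail fast with a readable message instead of a deep Py4J traceback.
    if not os.path.exists(csv_path):
        raise IOError("CSV not found at %s" % csv_path)

    # An explicit file:// URI removes any ambiguity about which filesystem
    # Spark resolves the path against.
    lines = sc.textFile("file://" + csv_path)
    print(lines.count())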

3 Answers


I faced the same error and solved it. It occurs if you configure the Spark context with more cores as workers than your system has. For example, I have a 3-core system, but the code below fails because there is no 4th core.

Unsupported SparkContext configuration, for which I got the Py4JJavaError:

    from pyspark import SparkContext, SparkConf

    # Initialize the Spark context using 4 local cores as workers.
    conf = SparkConf().setAppName("Collinear Points").setMaster("local[4]")
    sc = SparkContext(conf=conf)
    from pyspark.rdd import RDD

Supported SparkContext configuration for any system, because it does not explicitly request a number of worker cores:

    from pyspark import SparkContext, SparkConf

    conf = SparkConf().setAppName("Collinear Points")
    sc = SparkContext('local', conf=conf)
    from pyspark.rdd import RDD
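
If you do want multiple local workers, one option (a sketch of an alternative, not part of this answer) is to derive the count from the machine instead of hard-coding it:

    from multiprocessing import cpu_count

    from pyspark import SparkContext, SparkConf

    # Never ask for more local workers than the machine actually has.
    conf = SparkConf().setAppName("Collinear Points")
    sc = SparkContext("local[%d]" % cpu_count(), conf=conf)

Spark also accepts the master string local[*], which uses every available core without naming a number.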



You should load the file as

    lines = sc.textFile("hdfs:///tmp/auto-data.csv")

or just

    lines = sc.textFile("/tmp/auto-data.csv")

The first form reads explicitly from HDFS; the second resolves against whatever default filesystem Spark is configured with.
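
A minimal sketch making that distinction explicit (the paths are illustrative, and `sc` is an existing SparkContext):

    # Qualify the path with a scheme so the read does not depend on
    # whichever filesystem happens to be the default.
    hdfs_lines = sc.textFile("hdfs:///tmp/auto-data.csv")   # explicitly HDFS
    local_lines = sc.textFile("file:///tmp/auto-data.csv")  # explicitly local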



The exception is self-explanatory: pass the absolute path of auto-save.csv to sc.textFile(), or move auto-save.csv into /home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/, the directory named in the error:

    Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
    : org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/auto-save.csv
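
For the second suggestion, a sketch that copies the file into the directory Spark is searching (the source location is an assumption; point it at wherever auto-save.csv really lives):

    import shutil

    src = "/home/hp/auto-save.csv"  # assumed current location of the file
    dst = "/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/auto-save.csv"
    shutil.copy(src, dst)

    # The relative name now resolves to the path shown in the error message.
    lines = sc.textFile("auto-save.csv")
    print(lines.count())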

