I customized Bitnami's JupyterHub package to add Python and R packages to the jupyter-base-notebook image. Below is the custom Dockerfile:
FROM bitnami/jupyter-base-notebook
USER root
# Install build essentials
RUN apt-get update && \
    apt-get install -y build-essential && \
    apt-get clean
# Install git, Java, R and the libraries needed for the Jupyter R kernel
RUN apt-get update && \
    apt-get install -y --no-install-recommends git-all r-base openjdk-17-jre-headless
# Install required python libraries
RUN pip install \
    'pyspark[sql,connect]' \
    'jedi' \
    'jupyterlab-lsp' \
    'python-lsp-server[all]' \
    'pylsp-rope' \
    'pandas' \
    'mlflow' \
    'delta-spark' \
    'jupyterlab-git'
# Install Jupyter R kernel
RUN R -e 'install.packages("IRkernel")'
RUN R -e 'IRkernel::installspec(user = FALSE)'
# Install R packages
RUN R -e 'install.packages("languageserver")'
# Set JAVA_HOME environment variable
ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
# Append JAVA_HOME to /etc/environment
RUN echo "JAVA_HOME=${JAVA_HOME}" >> /etc/environment && \
    echo 'export JAVA_HOME' >> /etc/environment
USER 1001
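The image builds without errors; for reference, I build it with something like this (the tag is just the name I use locally):
docker build -t custom-jupyter-base-notebook:latest .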
My issue is that PySpark doesn't work: when I run the snippet below in a notebook, I get this exception:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Create a SparkSession
spark = SparkSession.builder.appName("Testing PySpark Example").getOrCreate()
Exception in thread "main" java.lang.IllegalArgumentException: basedir must be absolute: ?/.ivy2/local
at org.apache.ivy.util.Checks.checkAbsolute(Checks.java:48)
at org.apache.ivy.plugins.repository.file.FileRepository.setBaseDir(FileRepository.java:137)
at org.apache.ivy.plugins.repository.file.FileRepository.<init>(FileRepository.java:44)
at org.apache.spark.deploy.SparkSubmitUtils$.createRepoResolvers(SparkSubmit.scala:1269)
at org.apache.spark.deploy.SparkSubmitUtils$.buildIvySettings(SparkSubmit.scala:1376)
at org.apache.spark.util.DependencyUtils$.resolveMavenDependencies(DependencyUtils.scala:182)
at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:334)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:964)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:194)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:217)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:91)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1120)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1129)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
---------------------------------------------------------------------------
PySparkRuntimeError Traceback (most recent call last)
Cell In[1], line 5
2 from pyspark.sql.functions import col
4 # Create a SparkSession
----> 5 spark = SparkSession.builder.appName("Testing PySpark Example").getOrCreate()
File /opt/bitnami/miniconda/lib/python3.8/site-packages/pyspark/sql/session.py:497, in SparkSession.Builder.getOrCreate(self)
495 sparkConf.set(key, value)
496 # This SparkContext may be an existing one.
--> 497 sc = SparkContext.getOrCreate(sparkConf)
498 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
499 # by all sessions.
500 session = SparkSession(sc, options=self._options)
File /opt/bitnami/miniconda/lib/python3.8/site-packages/pyspark/context.py:515, in SparkContext.getOrCreate(cls, conf)
513 with SparkContext._lock:
514 if SparkContext._active_spark_context is None:
--> 515 SparkContext(conf=conf or SparkConf())
516 assert SparkContext._active_spark_context is not None
517 return SparkContext._active_spark_context
File /opt/bitnami/miniconda/lib/python3.8/site-packages/pyspark/context.py:201, in SparkContext.__init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls, udf_profiler_cls, memory_profiler_cls)
195 if gateway is not None and gateway.gateway_parameters.auth_token is None:
196 raise ValueError(
197 "You are trying to pass an insecure Py4j gateway to Spark. This"
198 " is not allowed as it is a security risk."
199 )
--> 201 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
202 try:
203 self._do_init(
204 master,
205 appName,
(...)
215 memory_profiler_cls,
216 )
File /opt/bitnami/miniconda/lib/python3.8/site-packages/pyspark/context.py:436, in SparkContext._ensure_initialized(cls, instance, gateway, conf)
434 with SparkContext._lock:
435 if not SparkContext._gateway:
--> 436 SparkContext._gateway = gateway or launch_gateway(conf)
437 SparkContext._jvm = SparkContext._gateway.jvm
439 if instance:
File /opt/bitnami/miniconda/lib/python3.8/site-packages/pyspark/java_gateway.py:107, in launch_gateway(conf, popen_kwargs)
104 time.sleep(0.1)
106 if not os.path.isfile(conn_info_file):
--> 107 raise PySparkRuntimeError(
108 error_class="JAVA_GATEWAY_EXITED",
109 message_parameters={},
110 )
112 with open(conn_info_file, "rb") as info:
113 gateway_port = read_int(info)
PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.
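My reading of the "basedir must be absolute: ?/.ivy2/local" message is that the JVM cannot resolve a home directory for the notebook user (UID 1001 has no passwd entry, so the Java user.home property comes back as "?"), and Ivy then fails to turn ~/.ivy2 into an absolute path. As a sketch of a possible workaround (untested; /tmp/.ivy2 is just an arbitrary writable directory I picked), pointing Ivy at an absolute path when building the session seems plausible:
from pyspark.sql import SparkSession

# Point Ivy at an absolute, writable directory instead of the unresolved ~/.ivy2
spark = (
    SparkSession.builder
    .appName("Testing PySpark Example")
    .config("spark.jars.ivy", "/tmp/.ivy2")
    .getOrCreate()
)
But I would rather fix the image once than patch every notebook.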
I tried setting SPARK_HOME in the jupyter-base-notebook Dockerfile, without success:
ENV SPARK_HOME=/opt/bitnami/miniconda/lib/python3.8/site-packages/pyspark/
ENV PATH=$PATH:$SPARK_HOME/bin
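If the missing home directory really is the root cause, I assume something like this in the Dockerfile would address it (untested; /home/jupyter is a path I chose arbitrarily):
# Give the non-root notebook user a writable HOME so Ivy can resolve ~/.ivy2
RUN mkdir -p /home/jupyter && chown 1001:1001 /home/jupyter
ENV HOME=/home/jupyter
Is that the right approach, or is there a supported way to configure this in the Bitnami image?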