I am new to Apache Spark and am trying to create a test setup: I am using a NATS JetStream server as the message broker for my IoT messages. I found a GitHub repo (from NATS itself) with a Scala connector for Spark (repo: https://github.com/nats-io/nats-spark-connector/tree/main/load_balanced). According to the README, the connector can be used as follows:
val spark = SparkSession
  .builder()
  .master("spark://localhost:7077")
  .appName("NatsReaderTest")
  .config("spark.logConf", "false")
  .config("spark.jars",
    "/some_path_to_the_connector_jar/nats-spark-connector_2.12-0.1.jar,"
    + "/some_path_to_the_Nats_jar/jnats-2.13.2.jar"
  )
  .config("spark.executor.instances", "2")
  .config("spark.cores.max", "4")
  .config("spark.executor.memory", "2g")
  .getOrCreate()

val initDF = spark
  .readStream
  .format("nats")
  .option("nats.host", host)
  .option("nats.port", port)
  .option("nats.stream.name", "TestStream")
  .option("nats.stream.subjects", "subject1, subject2")
  // Wait 90 seconds for an ack before resending a message
  .option("nats.msg.ack.wait.secs", 90)
  .option("nats.num.listeners", 2)
  // Each listener will fetch 10 messages at a time
  .option("nats.msg.fetch.batch.size", 10)
  .load()
However, I would like to use PySpark, since my colleagues don't know Java or Scala. Is something similar possible from PySpark? Can anyone help me with a simple test example?
(Or does anyone have a good idea for how to use PySpark with NATS in general?)
I am using a Spark/Jupyter container in Docker as my environment.
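My understanding is that the connector JARs only need to be on the classpath before the first SparkSession is created; in a Jupyter container the usual way to do that is the PYSPARK_SUBMIT_ARGS environment variable. A minimal sketch of what I mean (the /opt/jars/ paths are placeholders for wherever the JARs actually live; in my attempt below I instead pass the JARs via spark.jars):

import os

# Hand the connector JARs to spark-submit before the first SparkSession
# is created; the trailing "pyspark-shell" token is required.
# NOTE: /opt/jars/ is a placeholder for the actual JAR locations.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--jars /opt/jars/nats-spark-connector-balanced-assembly-1.2.2.jar,"
    "/opt/jars/jnats-2.13.2.jar pyspark-shell"
)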
I have tried something very similar to the Scala example, but I get stuck on errors:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("spark://localhost:4040") \
    .appName("Nats Jetstream Concept") \
    .config("spark.logConf", "true") \
    .config("spark.jars",
            "./Nats_Spark_Connector_Scala/nats-spark-connector/load_balanced/target/scala-2.12/nats-spark-connector-balanced-assembly-1.2.2.jar,"
            + "./Nats_Spark_Connector_Scala/jnats-2.13.2.jar") \
    .config("spark.executor.instances", "1") \
    .config("spark.executor.memory", "1g") \
    .config("spark.cores.max", "1") \
    .getOrCreate()
initDF = spark.readStream.format("nats") \
.option("nats.host", "localhost") \
.option("nats.port", "4222") \
.option("nats.stream.name", "CalculationStream") \
.option("nats.stream.subjects", "calc.calcId123") \
.option("nats.msg.ack.wait.secs", 90) \
.option("nats.num.listeners", 1) \
.option("nats.msg.fetch.batch.size", 1) \
.load()
initDF.writeStream \
    .format("console") \
    .start() \
    .awaitTermination()
Running the writeStream cell fails with the following error:

Py4JJavaError Traceback (most recent call last)
Cell In[39], line 10
1 # initDF.writeStream \
2 # .format("nats") \
3 # .option("checkpointLocation", "Users/spark_checkpoint") \
4 # .option("nats.host", "0.0.0.0") \
5 # .option("nats.port", "4222") \
6 # .start
8 initDF.writeStream \
9 .format("console") \
---> 10 .start() \
11 .awaitTermination()
File /opt/conda/envs/vintecc/lib/python3.11/site-packages/pyspark/sql/streaming/readwriter.py:1527, in DataStreamWriter.start(self, path, format, outputMode, partitionBy, queryName, **options)
1525 self.queryName(queryName)
1526 if path is None:
-> 1527 return self._sq(self._jwrite.start())
1528 else:
1529 return self._sq(self._jwrite.start(path))
File /opt/conda/envs/vintecc/lib/python3.11/site-packages/py4j/java_gateway.py:1322, in JavaMember.__call__(self, *args)
1316 command = proto.CALL_COMMAND_NAME +\
1317 self.command_header +\
1318 args_command +\
1319 proto.END_COMMAND_PART
1321 answer = self.gateway_client.send_command(command)
-> 1322 return_value = get_return_value(
1323 answer, self.gateway_client, self.target_id, self.name)
1325 for temp_arg in temp_args:
1326 if hasattr(temp_arg, "_detach"):
File /opt/conda/envs/vintecc/lib/python3.11/site-packages/pyspark/errors/exceptions/captured.py:179, in capture_sql_exception.<locals>.deco(*a, **kw)
177 def deco(*a: Any, **kw: Any) -> Any:
178 try:
--> 179 return f(*a, **kw)
180 except Py4JJavaError as e:
181 converted = convert_exception(e.java_exception)
File /opt/conda/envs/vintecc/lib/python3.11/site-packages/py4j/protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling o332.start.
: java.lang.IllegalStateException: RpcEnv has been stopped
    at org.apache.spark.rpc.netty.Dispatcher.registerRpcEndpoint(Dispatcher.scala:60)
    at org.apache.spark.rpc.netty.NettyRpcEnv.setupEndpoint(NettyRpcEnv.scala:136)
    at org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef$.forDriver(StateStoreCoordinator.scala:72)
    at org.apache.spark.sql.streaming.StreamingQueryManager.<init>(StreamingQueryManager.scala:52)
    at org.apache.spark.sql.internal.BaseSessionStateBuilder.streamingQueryManager(BaseSessionStateBuilder.scala:339)
    at org.apache.spark.sql.internal.BaseSessionStateBuilder.$anonfun$build$4(BaseSessionStateBuilder.scala:377)
    at org.apache.spark.sql.internal.SessionState.streamingQueryManager$lzycompute(SessionState.scala:100)
    at org.apache.spark.sql.internal.SessionState.streamingQueryManager(SessionState.scala:100)
    at org.apache.spark.sql.streaming.DataStreamWriter.startQuery(DataStreamWriter.scala:422)
    at org.apache.spark.sql.streaming.DataStreamWriter.startInternal(DataStreamWriter.scala:410)
    at org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:251)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:568)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
    at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
    at java.base/java.lang.Thread.run(Thread.java:833)