I'm currently trying to import files from an SFTP server using AWS Glue with the official SFTP Connector from the AWS Marketplace.
I configured the connector, gave it a secret as required, and started the job. I'm using the visual console to create the Glue script; here is the generated one:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
# ---------------------------------------------------------------------------
# Job bootstrap: read the JOB_NAME argument from sys.argv, then build the
# Spark / Glue contexts and initialise the Job wrapper with that name.
# ---------------------------------------------------------------------------
args = getResolvedOptions(sys.argv, ["JOB_NAME"])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session

job = Job(glueContext)
job.init(args["JOB_NAME"], args)

# ---------------------------------------------------------------------------
# Source node (generated): SFTP Connector for AWS Glue 3.0
# Reads from the "~/haproxy" path through the "SFTP-Glue-3" Glue connection,
# using the marketplace Spark connector with fileFormat "text".
# ---------------------------------------------------------------------------
sftp_source_options = {
    "path": "~/haproxy",
    "fileFormat": "text",
    "connectionName": "SFTP-Glue-3",
}
sftp_source_ctx = "SFTPConnectorforAWSGlue30_node1688135479601"

SFTPConnectorforAWSGlue30_node1688135479601 = (
    glueContext.create_dynamic_frame.from_options(
        connection_type="marketplace.spark",
        connection_options=sftp_source_options,
        transformation_ctx=sftp_source_ctx,
    )
)

# ---------------------------------------------------------------------------
# Sink node (generated): S3 bucket
# Writes the frame produced by the SFTP source node to S3 as JSON, with no
# partition keys.
# ---------------------------------------------------------------------------
s3_sink_options = {"path": "s3://s8-glue-test-bucket2", "partitionKeys": []}

S3bucket_node3 = glueContext.write_dynamic_frame.from_options(
    frame=SFTPConnectorforAWSGlue30_node1688135479601,
    connection_type="s3",
    format="json",
    connection_options=s3_sink_options,
    transformation_ctx="S3bucket_node3",
)

# Finalise the job run.
job.commit()
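I've also tried the Glue 4.0 version of the connector, configured the same way as the Glue 3.0 one (the traceback below is from that Glue 4.0 run, which is why the node name in it differs from the script above).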
According to the logs, the job is able to retrieve the secret and so on, but as soon as it tries to connect, a NullPointerException occurs.
Here's the last log line before the connection attempt:
2023-06-30T15:54:31.067+02:00 23/06/30 13:54:31 INFO SFTPConnectionPoolV2: Read key file and connect
Here are the logs:
23/06/30 13:54:31 ERROR ProcessLauncher: Error from Python:Traceback (most recent call last):
File "/tmp/Sync SFTP.py", line 16, in <module>
SFTPConnectorforAWSGlue40_node1 = glueContext.create_dynamic_frame.from_options(
File "/opt/amazon/lib/python3.7/site-packages/awsglue/dynamicframe.py", line 609, in from_options
return self._glue_context.create_dynamic_frame_from_options(connection_type,
File "/opt/amazon/lib/python3.7/site-packages/awsglue/context.py", line 232, in create_dynamic_frame_from_options
source = self.getSource(connection_type, format, transformation_ctx, push_down_predicate, **connection_options)
File "/opt/amazon/lib/python3.7/site-packages/awsglue/context.py", line 104, in getSource
j_source = self._ssql_ctx.getSource(connection_type,
File "/opt/amazon/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1321, in __call__
return_value = get_return_value(
File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 190, in deco
return f(*a, **kw)
File "/opt/amazon/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o89.getSource.
: java.lang.NullPointerException
at com.amazonaws.services.glue.marketplace.connector.sftp.SFTPTableProvider.packPath(SFTPTableProvider.java:199)
at com.amazonaws.services.glue.marketplace.connector.sftp.SFTPTableProvider.getDataSource(SFTPTableProvider.java:136)
at com.amazonaws.services.glue.marketplace.connector.sftp.SFTPTableProvider.inferSchema(SFTPTableProvider.java:92)
at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.getTableFromProvider(DataSourceV2Utils.scala:90)
at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.loadV2Source(DataSourceV2Utils.scala:132)
at org.apache.spark.sql.DataFrameReader.$anonfun$load$1(DataFrameReader.scala:209)
at scala.Option.flatMap(Option.scala:271)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:207)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:171)
at com.amazonaws.services.glue.marketplace.connector.CustomDataSourceFactory$.loadSparkDataSource(CustomDataSourceFactory.scala:89)
at com.amazonaws.services.glue.marketplace.connector.CustomDataSourceFactory$.loadDataSource(CustomDataSourceFactory.scala:33)
at com.amazonaws.services.glue.GlueContext.getCustomSource(GlueContext.scala:176)
at com.amazonaws.services.glue.GlueContext.getCustomSourceWithConnection(GlueContext.scala:483)
at com.amazonaws.services.glue.GlueContext.getSourceInternal(GlueContext.scala:975)
at com.amazonaws.services.glue.GlueContext.getSource(GlueContext.scala:783)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:750)