AWS Glue with Official SFTP Connector

280 Views Asked by At

I'm currently trying to import files from an SFTP server using AWS Glue with the official SFTP Connector from the Marketplace.

I configured it, gave it a secret as required, and started it. I'm using the visual console to create the Glue scripts, but here is the generated one:

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Job bootstrap: resolve the JOB_NAME argument from the command line, create
# the Spark and Glue contexts, and initialise the Glue Job wrapper.
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
# Spark session taken from the Glue context; not referenced again in the
# script as posted.
spark = glueContext.spark_session
job = Job(glueContext)
# Must run before the source/sink nodes below; paired with job.commit() at the
# end of the script.
job.init(args["JOB_NAME"], args)

# Script generated for node SFTP Connector for AWS Glue 3.0
# Source node: builds a DynamicFrame from the SFTP server through the
# Marketplace connector ("marketplace.spark" connection type).
SFTPConnectorforAWSGlue30_node1688135479601 = (
    glueContext.create_dynamic_frame.from_options(
        connection_type="marketplace.spark",
        connection_options={
            # Remote location to read from.
            # NOTE(review): the reported NullPointerException is raised in
            # SFTPTableProvider.packPath; whether the connector expands "~" is
            # not confirmed -- an absolute remote path may be worth trying.
            "path": "~/haproxy",
            "fileFormat": "text",
            # Presumably the name of the Glue connection that holds the
            # connector configuration and secret -- verify in the console.
            "connectionName": "SFTP-Glue-3",
        },
        transformation_ctx="SFTPConnectorforAWSGlue30_node1688135479601",
    )
)

# Script generated for node S3 bucket
# Sink node: writes the DynamicFrame produced by the SFTP source node to the
# S3 path below as JSON, with no partition keys.
S3bucket_node3 = glueContext.write_dynamic_frame.from_options(
    frame=SFTPConnectorforAWSGlue30_node1688135479601,
    connection_type="s3",
    format="json",
    connection_options={"path": "s3://s8-glue-test-bucket2", "partitionKeys": []},
    transformation_ctx="S3bucket_node3",
)

# Finalise the job that was started with job.init(...) above.
job.commit()

I've tried the Glue 4.0 component in the same way as the Glue 3.0 component. According to the logs, it is able to retrieve the secret, but as soon as it tries to connect, a NullPointerException occurs.

Here's the last log line before the connection attempt:

2023-06-30T15:54:31.067+02:00   23/06/30 13:54:31 INFO SFTPConnectionPoolV2: Read key file and connect

Here are the logs:

23/06/30 13:54:31 ERROR ProcessLauncher: Error from Python:Traceback (most recent call last):
  File "/tmp/Sync SFTP.py", line 16, in <module>
    SFTPConnectorforAWSGlue40_node1 = glueContext.create_dynamic_frame.from_options(
  File "/opt/amazon/lib/python3.7/site-packages/awsglue/dynamicframe.py", line 609, in from_options
    return self._glue_context.create_dynamic_frame_from_options(connection_type,
  File "/opt/amazon/lib/python3.7/site-packages/awsglue/context.py", line 232, in create_dynamic_frame_from_options
    source = self.getSource(connection_type, format, transformation_ctx, push_down_predicate, **connection_options)
  File "/opt/amazon/lib/python3.7/site-packages/awsglue/context.py", line 104, in getSource
    j_source = self._ssql_ctx.getSource(connection_type,
  File "/opt/amazon/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1321, in __call__
    return_value = get_return_value(
  File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 190, in deco
    return f(*a, **kw)
  File "/opt/amazon/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o89.getSource.
: java.lang.NullPointerException
    at com.amazonaws.services.glue.marketplace.connector.sftp.SFTPTableProvider.packPath(SFTPTableProvider.java:199)
    at com.amazonaws.services.glue.marketplace.connector.sftp.SFTPTableProvider.getDataSource(SFTPTableProvider.java:136)
    at com.amazonaws.services.glue.marketplace.connector.sftp.SFTPTableProvider.inferSchema(SFTPTableProvider.java:92)
    at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.getTableFromProvider(DataSourceV2Utils.scala:90)
    at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.loadV2Source(DataSourceV2Utils.scala:132)
    at org.apache.spark.sql.DataFrameReader.$anonfun$load$1(DataFrameReader.scala:209)
    at scala.Option.flatMap(Option.scala:271)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:207)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:171)
    at com.amazonaws.services.glue.marketplace.connector.CustomDataSourceFactory$.loadSparkDataSource(CustomDataSourceFactory.scala:89)
    at com.amazonaws.services.glue.marketplace.connector.CustomDataSourceFactory$.loadDataSource(CustomDataSourceFactory.scala:33)
    at com.amazonaws.services.glue.GlueContext.getCustomSource(GlueContext.scala:176)
    at com.amazonaws.services.glue.GlueContext.getCustomSourceWithConnection(GlueContext.scala:483)
    at com.amazonaws.services.glue.GlueContext.getSourceInternal(GlueContext.scala:975)
    at com.amazonaws.services.glue.GlueContext.getSource(GlueContext.scala:783)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
    at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
    at java.lang.Thread.run(Thread.java:750)
0

There are 0 best solutions below