How to access mocked s3 bucket via DuckDB

242 Views Asked by At

I have serverless Python code that uses AWS S3 and the DuckDB API.

The code itself works fine; the problem is with its unit tests.

I am using unittest and moto python frameworks in order to test this code.

The issue is configuring the DuckDB connection to point at the mocked S3 bucket.

This is a code snippet of duckdb configurations:

            # Configure the DuckDB connection's S3 access: region from the AWS_REGION
            # env-var, credentials from self.tenant_ctx_aws_credentials['Credentials'].
            # Only region and credentials are set here; no s3_endpoint, s3_use_ssl or
            # s3_url_style is configured in this snippet.
            # NOTE(review): values are interpolated directly into the SQL text, so a
            # single quote inside any value would break the statement.
            self.db_conn.execute(query=f"SET s3_region='{os.environ['AWS_REGION']}';")
            self.db_conn.execute(query=f"SET s3_access_key_id='{self.tenant_ctx_aws_credentials['Credentials']['AccessKeyId']}';")
            self.db_conn.execute(query=f"SET s3_secret_access_key='{self.tenant_ctx_aws_credentials['Credentials']['SecretAccessKey']}';")
            self.db_conn.execute(query=f"SET s3_session_token='{self.tenant_ctx_aws_credentials['Credentials']['SessionToken']}';")
            # Cap DuckDB's memory usage at the configured limit.
            self.db_conn.execute(query=f"SET memory_limit='{self.memory_limit}';")

And this is from unit test file of the lambda function that uses moto in order to mock AWS services:

@mock_s3
@mock_glue
@mock_ssm
@mock_sts
@mock_iam
@mock.patch.dict(os.environ)
class TestLambdaFunction(unittest.TestCase):
    """Unit tests for the lambda function, decorated with the S3, Glue, SSM, STS and IAM mocks."""

    # Show full diffs on assertion failures instead of truncating them.
    maxDiff = None

    def setUp(self):
        """Create the S3 client, S3 resource and the test bucket."""
        try:
            # S3 setup:
            # NOTE(review): self.aws_region and self.dp_s3_bucket_name are not
            # assigned anywhere in this snippet; presumably defined elsewhere, confirm.
            self.s3_client = boto3.client('s3', region_name=self.aws_region)
            self.s3_resource = boto3.resource('s3', region_name=self.aws_region)
            self.s3_bucket = self.s3_resource.create_bucket(Bucket=self.dp_s3_bucket_name, CreateBucketConfiguration={
                'LocationConstraint': self.aws_region})

So I am trying to make DuckDB access this mocked S3 bucket.

When I run this test I receive the following error:

IO Error: HTTP GET error on 'https://{test_bucket_name}.s3.amazonaws.com/{s3_path}/test_file.parquet' (HTTP 400)

I tried to work with endpoint_url argument of boto3 client but it did not help.

Moreover, as far as I know, when moto runs in server mode it listens on localhost:5000, so I also tried pointing the DuckDB configuration at this domain:port, but that did not work for me either.

1

There is 1 best solution below

0
On

This is a pytest solution that works with moto[server]. It's not yet available in https://github.com/dazza-codes/pytest-aiomoto but I would welcome a PR on that if I don't get to adding support for duckdb in that project (it's not actively maintained).

@pytest.fixture()
def mock_aws_env(monkeypatch) -> None:
    """Replace the AWS-related environment with throwaway test values.

    Resets boto3's default session and the S3FileSystem instance cache,
    removes any AWS config/credential env-vars that may be set, then sets
    a region plus randomly suffixed fake credentials via ``monkeypatch``.
    """
    # Forget anything cached from a previous session or test.
    boto3.DEFAULT_SESSION = None
    S3FileSystem.clear_instance_cache()

    # Remove every AWS setting that could be present in the environment.
    stale_vars = (
        "AWS_CONFIG_FILE",
        "AWS_SHARED_CREDENTIALS_FILE",
        "AWS_PROFILE",
        "AWS_DEFAULT_PROFILE",
        "AWS_ACCOUNT",
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SECURITY_TOKEN",
        "AWS_SESSION_TOKEN",
    )
    for stale_var in stale_vars:
        monkeypatch.delenv(stale_var, raising=False)

    # Install the fake region and credentials (dict order == setenv order).
    fake_env = {
        "AWS_REGION": "us-east-1",
        "AWS_ACCESS_KEY_ID": f"AWS_ACCESS_KEY_ID-{uuid4()}",
        "AWS_SECRET_ACCESS_KEY": f"AWS_SECRET_ACCESS_KEY-{uuid4()}",
        "AWS_SECURITY_TOKEN": "testing",
        "AWS_SESSION_TOKEN": "testing",
    }
    for var_name, var_value in fake_env.items():
        monkeypatch.setenv(var_name, var_value)


@pytest.fixture
def mock_s3_server(mock_aws_env, monkeypatch) -> MotoService:
    """
    Start MotoService("s3") and export its address for DuckDB.

    Yields the running service. The service is reset before it is handed to
    the test and again after the test finishes. While it is running,
    DUCKDB_S3_ENDPOINT is set to the service's "host:port" and
    DUCKDB_S3_USE_SSL is set to "false".
    """
    with MotoService("s3") as svc:
        svc.reset()
        # duckdb requires a custom ENDPOINT env-var to find moto AWS services
        # https://duckdb.org/docs/extensions/httpfs/s3api.html
        # https://duckdb.org/docs/extensions/httpfs/s3api-legacy-authentication < 0.10
        endpoint_url = urlparse(svc.endpoint_url)
        # The endpoint value carries no scheme, only "host:port".
        duckdb_endpoint = f"{endpoint_url.hostname}:{endpoint_url.port}"
        monkeypatch.setenv("DUCKDB_S3_ENDPOINT", duckdb_endpoint)
        # Environment values must be strings: passing the bool False makes
        # pytest emit a PytestWarning and stringify it implicitly.
        monkeypatch.setenv("DUCKDB_S3_USE_SSL", "false")
        yield svc
        svc.reset()


#     f"""
#     CREATE SECRET mock_s3_secrets (
#         TYPE S3,
#         KEY_ID {aws_access_key_id},
#         SECRET {aws_secret_access_key},
#         REGION {aws_region},
#         ENDPOINT {moto_endpoint}
#     );
#     """
#     # duckdb < 0.10
#     """
#     SET s3_region = 'us-east-1';
#     SET s3_endpoint = '⟨domain⟩.⟨tld⟩:⟨port⟩';
#     SET s3_use_ssl = false;
#     SET s3_access_key_id = '⟨AWS access key id⟩';
#     SET s3_secret_access_key = '⟨AWS secret access key⟩';
#     """


@pytest.fixture()
def mock_s3_bucket(mock_s3_server, mocker, monkeypatch):
    """
    Create a uniquely named test bucket on the mock S3 server and yield it.

    The region comes from the AWS_REGION env-var (default "us-east-1") and
    the bucket is created through a boto3 resource bound to
    ``mock_s3_server.endpoint_url``.
    """
    # create the test bucket since this is all in a moto 'virtual' AWS account
    aws_region = os.getenv("AWS_REGION", "us-east-1")
    s3_data_bucket_name = f"test-app-data-{uuid4()}"

    create_bucket_kwargs = {
        "Bucket": s3_data_bucket_name,
        "ACL": "public-read-write",
    }
    # "us-east-1" is not a valid LocationConstraint: S3 rejects it with
    # InvalidLocationConstraint, so the configuration is only sent for other
    # regions (us-east-1 is the default when none is given).
    if aws_region != "us-east-1":
        create_bucket_kwargs["CreateBucketConfiguration"] = {
            "LocationConstraint": aws_region
        }

    with mock_s3():
        s3 = boto3.resource("s3", region_name=aws_region, endpoint_url=mock_s3_server.endpoint_url)
        s3_bucket_object = s3.create_bucket(**create_bucket_kwargs)
        assert s3_bucket_object.name == s3_data_bucket_name
        yield s3_bucket_object

(Caveat: the library imports needed by the code above are omitted.)

The MotoService is from https://github.com/dazza-codes/pytest-aiomoto

Also run SET s3_url_style='path'; in DuckDB, so that the bucket name is placed in the URL path rather than in the hostname.