I have recently switched from using Sedona 1.3.1 to 1.4.0.

Previously, the output of the spatial join included all columns from both input SparkDataFrames. However, now the output only includes the geometry columns from each SparkDataFrame.

I want to be able to spatially join two SparkDataFrames and convert the output to a new SparkDataFrame that has all columns of both input SparkDataFrames.

In the example below I create two SparkDataFrames and join them with Sedona. The output SparkDataFrame only has two geometry columns.

How can I change this join so that it returns a spark data frame with all columns from both input spark data frames?

import itertools
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from shapely import from_wkt

from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from pyspark.sql import functions as F
from sedona.utils.adapter import Adapter
from sedona.core.spatialOperator import JoinQueryRaw

import matplotlib.pyplot as plt

# Create a Spark session and register Sedona's spatial SQL functions/types.
spark = SparkSession.builder.getOrCreate()
SedonaRegistrator.registerAll(spark)

# --- Build two small spatial data sets ---------------------------------------
# Random integer coordinates on a 10x10 grid, turned into points and then
# buffered into circular polygons (radius 2).
coords = zip(np.random.choice(range(10), 10), np.random.choice(range(10), 10))
points = [Point(i) for i in coords]
circles = [p.buffer(2) for p in points]
circles_wkt = [c.wkt for c in circles]

# First five circles go to gdf1, last five to gdf2; each carries an id column
# plus the geometry serialized as WKT so it survives the trip into Spark.
gdf1 = gpd.GeoDataFrame({"id1": range(5), "wkt": circles_wkt[:5], "geometry": circles[:5]}, geometry="geometry")
gdf2 = gpd.GeoDataFrame({"id2": range(5, 10), "wkt": circles_wkt[5:], "geometry": circles[5:]}, geometry="geometry")

# Convert to Spark DataFrames (WKT + id columns only), then rebuild a true
# geometry column with Sedona's ST_GeomFromWKT.
sdf1 = spark.createDataFrame(gdf1[['wkt', 'id1']])
sdf2 = spark.createDataFrame(gdf2[['wkt', 'id2']])

sdf1 = sdf1.withColumn("geometry", F.expr("ST_GeomFromWKT(wkt)"))
sdf2 = sdf2.withColumn("geometry", F.expr("ST_GeomFromWKT(wkt)"))

#
# now use Sedona core (RDD) API to perform the spatial join
#

# Convert to SpatialRDDs. Naming the geometry column here also records the
# remaining columns in each RDD's `fieldNames` attribute, which is what lets
# us recover the non-geometry columns after the join (see the toDf call below).
rdd_left = Adapter.toSpatialRdd(sdf1, "geometry")
rdd_right = Adapter.toSpatialRdd(sdf2, "geometry")
rdd_left.analyze()
rdd_right.analyze()

# Partition both sides with the same partitioner so the join can run
# partition-locally.
rdd_left.spatialPartitioning(partitioning="KDBTREE", num_partitions=1)
rdd_right.spatialPartitioning(rdd_left.getPartitioner())


# Flat pair join: each result row pairs a geometry from the query-window RDD
# (rdd_left) with an intersecting geometry from the object RDD (rdd_right).
result_pair_rdd = JoinQueryRaw.SpatialJoinQueryFlat(
    rdd_right,
    rdd_left,
    useIndex=False,
    considerBoundaryIntersection=False
)

# FIX for the missing columns: since Sedona 1.4, Adapter.toDf(pair_rdd, spark)
# emits only 'leftgeometry' and 'rightgeometry'. Pass each side's saved field
# names explicitly to get back every non-geometry column from both inputs
# (leftgeometry, left fields..., rightgeometry, right fields...).
# NOTE(review): the "left" side of each pair is the query-window RDD, which is
# rdd_left in this call order — confirm against your Sedona version's docs.
df = Adapter.toDf(result_pair_rdd, rdd_left.fieldNames, rdd_right.fieldNames, spark)
df.columns
(This question currently has no posted answers.)