# Import necessary libraries
import great_expectations as ge
import datetime

# Load the Great Expectations context
context = ge.data_context.DataContext("../.")

# Load the JSON data into a Pandas DataFrame
data_file_path = "../../data/nested.json"
df = ge.read_json(data_file_path)

# Load the expectation suite FIRST so it can be attached to the batch below.
expectation_suite_name = 'nestedjson_expectations_suite'
suite = context.get_expectation_suite(expectation_suite_name)

# Create a batch of data with the suite attached.
# FIX: the original built the PandasDataset without an expectation suite and
# never used `suite`, so run_validation_operator validated the batch against
# an empty default suite. Passing expectation_suite= binds them together.
batch = ge.dataset.PandasDataset(df, expectation_suite=suite)

# Flatten nested values into new top-level columns.
# `or {}` guards records where "details.address" is missing or None, which
# would otherwise raise AttributeError on the chained .get().
batch["details_age"] = batch["details"].apply(lambda x: x.get("age"))
batch["details_address_city"] = batch["details"].apply(
    lambda x: (x.get("address") or {}).get("city"))
batch["details_address_state"] = batch["details"].apply(
    lambda x: (x.get("address") or {}).get("state"))

# Validate the batch against the attached expectation suite
results = context.run_validation_operator(
    "action_list_operator",
    assets_to_validate=[batch],
    run_name="abcd1",
    run_time=datetime.datetime.now(datetime.timezone.utc),
)

# Print the validation results and publish/open Data Docs for them
print(results)
context.build_data_docs()
context.open_data_docs(resource_identifier=results.list_validation_result_identifiers()[0])
I am trying to work with nested JSON, which I need to flatten before I can work with it. In the code above, as you can see, I have modified the batch, but I'm unsure how to pass this batch to the validator alongside my expectation suite.
Looking at the code documentation here, run_validation_operator expects assets_to_validate, which can either be a list of batches (which is what I am already trying) or a list of tuples:
assets_to_validate: a list that specifies the data assets that the operator will validate.
The members of the list can be either batches, or a tuple that will allow the operator to fetch
the batch: (batch_kwargs, expectation_suite_name)
Where batch_kwargs is keyword arguments used to request a batch directly from a Datasource.
How am I supposed to pass a custom batch alongside an expectation suite to validate against?
As an alternative, I tested my batch without an expectation suite, like so:
# Import necessary libraries
import great_expectations as ge
import datetime

# Load the Great Expectations context
context = ge.data_context.DataContext("../.")

# Load the JSON data into a Pandas DataFrame
data_file_path = "../../data/nested.json"
df = ge.read_json(data_file_path)

# Create a batch of data (expectations are declared inline below instead of
# loading a stored suite).
batch = ge.dataset.PandasDataset(df)

# Flatten nested values into new top-level columns.
# `or {}` guards records where "details.address" is missing or None, which
# would otherwise raise AttributeError on the chained .get().
batch["details_age"] = batch["details"].apply(lambda x: x.get("age"))
batch["details_address_city"] = batch["details"].apply(
    lambda x: (x.get("address") or {}).get("city"))
batch["details_address_state"] = batch["details"].apply(
    lambda x: (x.get("address") or {}).get("state"))

# Define expectations for the 'id' column
batch.expect_column_values_to_be_between('id', min_value=1, max_value=100)
batch.expect_column_values_to_be_unique('id')

# Define expectations for the 'name' column
batch.expect_column_values_to_match_regex('name', r'^[A-Za-z\s]+$')
batch.expect_column_values_to_not_be_null('name')

# Define expectations for the flattened nested fields
batch.expect_column_values_to_be_between('details_age', min_value=0, max_value=120)
batch.expect_column_values_to_match_regex('details_address_city', r'^[A-Za-z\s]+$')
batch.expect_column_values_to_match_regex('details_address_state', r'^[A-Za-z\s]+$')

# Validate the batch against the expectations accumulated on it above
results = context.run_validation_operator(
    "action_list_operator",
    assets_to_validate=[batch],
    run_name="abcd1",
    run_time=datetime.datetime.now(datetime.timezone.utc),
)

# Print the validation results and publish/open Data Docs for them
print(results)
context.build_data_docs()
context.open_data_docs(resource_identifier=results.list_validation_result_identifiers()[0])