CRUD support for list of Dicts

431 Views Asked by At

My goal is to add Weaviate support to the pyLoDStorage project. Specifically, I'd like to use the sample data from:

https://github.com/WolfgangFahl/pyLoDStorage/blob/master/lodstorage/sample.py

Which has

  • a few records of Persons from the Royal family
  • a city list with a few thousand entries
  • an artificial list of records with as many records as you wish

as examples.

All data is tabular. Some basic python types like:

  • str
  • bool
  • int
  • float
  • date
  • datetime

need to be supported.

I created the project http://wiki.bitplan.com/index.php/DgraphAndWeaviateTest and a script to run Weaviate via docker compose. There is a python unit test which used to work with the Weaviate Python client 0.4.1

I am trying to use the information from https://www.semi.technology/documentation/weaviate/current/how-tos/how-to-create-a-schema.html to refactor this unit test but don't know how to do it.

What needs to be done to get the CRUD tests running as e.g. in the other three tests: https://github.com/WolfgangFahl/pyLoDStorage/tree/master/tests for

  • JSON
  • SPARQL
  • SQL

I am especially interested in the "round-trip" handling of a list of dicts (aka "Table") with the standard data types above. So I'd like to create a list of dicts and then:

  • derive the schema automatically by looking at some sample records
  • check if the schema already exists and, if so, delete it
  • create the schema
  • check if the data already exists and, if so, delete it
  • add the data and store it
  • optionally store the schema for further reference
  • restore the data with or without using the schema information

check that the restored data (list of Dicts) is the same as the original data

    '''
    Created on 2020-07-24
    
    @author: wf
    '''
    import unittest
    import weaviate
    import time
    #import getpass
    
    class TestWeaviate(unittest.TestCase):
        '''
        Tests that talk to a Weaviate instance through the flat client API
        (create_schema, create, create_thing, add_reference).

        Only testRunning and testWeaviateSchema do any work; testPersons and
        testEventSchema return on their first statement.
        '''
        # https://www.semi.technology/documentation/weaviate/current/client-libs/python.html
    
        def setUp(self):
            '''
            set the host and port that getClient uses to build the Weaviate URL
            '''
            self.port=8153
            self.host="localhost"
            # commented-out override of host and port for the user "wf"
            #if getpass.getuser()=="wf":
            #    self.host="zeus"
            #    self.port=8080
            pass
        
        def getClient(self):
            '''
            create a weaviate.Client for http://<host>:<port>, keep it in
            self.client and return it
            '''
            self.client=weaviate.Client("http://%s:%d" % (self.host,self.port))
            return self.client
    
        def tearDown(self):
            '''
            no cleanup is performed
            '''
            pass
            
        def testRunning(self):
            '''
            make sure weaviate is running:
            both is_live() and is_ready() of the client must be true
            '''
            w=self.getClient()
            self.assertTrue(w.is_live())
            self.assertTrue(w.is_ready())
                
    
        def testWeaviateSchema(self):
            '''
            try to create the schema from the people_schema.json URL, then
            create two "Person" entries and one "Group" entry with fixed uids

            see https://www.semi.technology/documentation/weaviate/current/client-libs/python.html
            '''
            w = self.getClient()
            #contains_schema = w.schema.contains()
            try:
                w.create_schema("https://raw.githubusercontent.com/semi-technologies/weaviate-python-client/master/documentation/getting_started/people_schema.json")
            except:
                # NOTE: the bare except hides every error raised by create_schema -
                # presumably meant to tolerate an already existing schema (TODO confirm)
                pass
            # each entry is unpacked below as [dict, type, uid]
            entries=[
               [ {"name": "John von Neumann"}, "Person", "b36268d4-a6b5-5274-985f-45f13ce0c642"],
               [ {"name": "Alan Turing"}, "Person", "1c9cd584-88fe-5010-83d0-017cb3fcb446"],
               [ {"name": "Legends"}, "Group", "2db436b5-0557-5016-9c5f-531412adf9c6" ]
            ]
            for entry in entries:
                # NOTE: these names shadow the builtins dict and type inside this method
                dict,type,uid=entry
                try:
                    w.create(dict,type,uid)
                except weaviate.exceptions.ThingAlreadyExistsException as taee:
                    # the entry exists already: report it and carry on (taee itself is unused)
                    print ("%s already created" % dict['name'])
                
            pass
        
        def testPersons(self):
            '''
            disabled test: the return on the first line makes everything below
            unreachable

            the unreachable part would create a schema with a single "Person"
            class and then four Person things
            '''
            return
            w = self.getClient()
    
            # schema in the things/actions layout with one text property "name"
            schema = {
            "actions": {"classes": [],"type": "action"},
            "things": {"classes": [{
                "class": "Person",
                "description": "A person such as humans or personality known through culture",
                "properties": [
                    {
                        "cardinality": "atMostOne",
                        "dataType": ["text"],
                        "description": "The name of this person",
                        "name": "name"
                    }
                ]}],
                "type": "thing"
            }
            }
            w.create_schema(schema)
            
            w.create_thing({"name": "Andrew S. Tanenbaum"}, "Person")
            w.create_thing({"name": "Alan Turing"}, "Person")
            w.create_thing({"name": "John von Neumann"}, "Person")
            w.create_thing({"name": "Tim Berners-Lee"}, "Person")
            
        def testEventSchema(self):    
            '''
            disabled test: the return on the first line makes everything below
            unreachable

            the unreachable part would create the classes "Event" and "City",
            which reference each other, one entry of each, and then add a
            "hasEvent" reference from the city to the event

            https://stackoverflow.com/a/63077495/1497139
            '''
            return
            schema = {
              "things": {
                "type": "thing",
                "classes": [
                  {
                    "class": "Event",
                    "description": "event",
                    "properties": [
                      {
                        "name": "acronym",
                        "description": "acronym",
                        "dataType": [
                          "text"
                        ]
                      },
                      {
                        "name": "inCity",
                        "description": "city reference",
                        "dataType": [
                          "City"
                        ],
                        "cardinality": "many"
                      }
                    ]
                  },
                  {
                    "class": "City",
                    "description": "city",
                    "properties": [
                      {
                        "name": "name",
                        "description": "name",
                        "dataType": [
                          "text"
                        ]
                      },
                      {
                        "name": "hasEvent",
                        "description": "event references",
                        "dataType": [
                          "Event"
                        ],
                        "cardinality": "many"
                      }
                    ]
                  }
                ]
              }
            }
    
    
            client = self.getClient()
    
            # only create the schema when the client reports that none exists yet
            if not client.contains_schema():
                client.create_schema(schema)
    
            event = {"acronym": "example"}
            client.create(event, "Event", "2a8d56b7-2dd5-4e68-aa40-53c9196aecde")
            city = {"name": "Amsterdam"}
            client.create(city, "City", "c60505f9-8271-4eec-b998-81d016648d85")
    
            # presumably gives Weaviate time to store both entries before they are linked (TODO confirm)
            time.sleep(2.0)
            # arguments: uid of the city, reference property, uid of the event
            client.add_reference("c60505f9-8271-4eec-b998-81d016648d85", "hasEvent", "2a8d56b7-2dd5-4e68-aa40-53c9196aecde")
    
    if __name__ == "__main__":
        # run the tests of this module when the file is executed as a script;
        # the commented-out line below would preset the command line arguments
        #import sys;sys.argv = ['', 'Test.testName']
        unittest.main()
2

There are 2 best solutions below

3
On

The unit test for the connection, schema and data objects you show above works like this with the Python client v1.x (see the inline comments for what's changed):

import unittest
import weaviate
import time
#import getpass

class TestWeaviate(unittest.TestCase):
    '''
    Tests against a running Weaviate instance, using the namespaced API of the
    Python client v1.x (client.schema.*, client.data_object.*).

    The inline "instead of" comments name the call of the older flat client
    API that each statement replaces.

    see https://www.semi.technology/documentation/weaviate/current/client-libs/python.html
    '''

    def setUp(self):
        '''
        set the host and port of the Weaviate instance under test
        '''
        # change these two values when Weaviate runs elsewhere,
        # e.g. host "zeus" and port 8080
        self.host = "localhost"
        self.port = 8153

    def getClient(self):
        '''
        create a client for the configured host and port

        Returns:
            the weaviate.Client, which is also kept in self.client
        '''
        url = "http://%s:%d" % (self.host, self.port)
        self.client = weaviate.Client(url)
        return self.client

    def tearDown(self):
        '''
        no cleanup is performed
        '''
        pass

    def testRunning(self):
        '''
        make sure weaviate is running
        '''
        w = self.getClient()
        self.assertTrue(w.is_live())
        self.assertTrue(w.is_ready())

    def testWeaviateSchema(self):
        '''
        create the people schema (best effort) and three sample objects

        see https://www.semi.technology/documentation/weaviate/current/client-libs/python.html
        '''
        w = self.getClient()
        schema_url = "https://raw.githubusercontent.com/semi-technologies/weaviate-python-client/master/documentation/getting_started/people_schema.json"
        try:
            # instead of w.create_schema, see https://www.semi.technology/documentation/weaviate/current/how-tos/how-to-create-a-schema.html#creating-your-first-schema-with-the-python-client
            w.schema.create(schema_url)
        except Exception as ex:
            # stay best effort (the schema may be left over from an earlier
            # run), but report the problem instead of hiding it; unlike a bare
            # except this no longer swallows KeyboardInterrupt/SystemExit
            print("schema not created: %s" % ex)

        # (properties, class name, uuid) of the objects to create
        entries = [
            ({"name": "John von Neumann"}, "Person", "b36268d4-a6b5-5274-985f-45f13ce0c642"),
            ({"name": "Alan Turing"}, "Person", "1c9cd584-88fe-5010-83d0-017cb3fcb446"),
            ({"name": "Legends"}, "Group", "2db436b5-0557-5016-9c5f-531412adf9c6"),
        ]
        for properties, class_name, uid in entries:
            try:
                # instead of w.create(dict,type,uid), see https://www.semi.technology/documentation/weaviate/current/restful-api-references/semantic-kind.html#example-request-1
                w.data_object.create(properties, class_name, uid)
            except weaviate.exceptions.ThingAlreadyExistsException:
                print("%s already created" % properties['name'])

    @unittest.skip("disabled: this test used to return before doing any work")
    def testPersons(self):
        '''
        create a schema with a single Person class and four Person objects

        Reported as skipped rather than silently passing; remove the skip
        decorator to run it.
        '''
        w = self.getClient()

        schema = {
            "actions": {"classes": [], "type": "action"},
            "things": {
                "classes": [
                    {
                        "class": "Person",
                        "description": "A person such as humans or personality known through culture",
                        "properties": [
                            {
                                "cardinality": "atMostOne",
                                "dataType": ["text"],
                                "description": "The name of this person",
                                "name": "name",
                            }
                        ],
                    }
                ],
                "type": "thing",
            },
        }
        w.schema.create(schema)  # instead of w.create_schema(schema)

        person_names = [
            "Andrew S. Tanenbaum",
            "Alan Turing",
            "John von Neumann",
            "Tim Berners-Lee",
        ]
        for person_name in person_names:
            # instead of w.create_thing({"name": ...}, "Person")
            w.data_object.create({"name": person_name}, "Person")

    @unittest.skip("disabled: this test used to return before doing any work")
    def testEventSchema(self):
        '''
        create the Event and City classes, one object of each, and a
        "hasEvent" reference from the city to the event

        Reported as skipped rather than silently passing; remove the skip
        decorator to run it.

        https://stackoverflow.com/a/63077495/1497139
        '''
        schema = {
            "things": {
                "type": "thing",
                "classes": [
                    {
                        "class": "Event",
                        "description": "event",
                        "properties": [
                            {
                                "name": "acronym",
                                "description": "acronym",
                                "dataType": ["text"],
                            },
                            {
                                "name": "inCity",
                                "description": "city reference",
                                "dataType": ["City"],
                                "cardinality": "many",
                            },
                        ],
                    },
                    {
                        "class": "City",
                        "description": "city",
                        "properties": [
                            {
                                "name": "name",
                                "description": "name",
                                "dataType": ["text"],
                            },
                            {
                                "name": "hasEvent",
                                "description": "event references",
                                "dataType": ["Event"],
                                "cardinality": "many",
                            },
                        ],
                    },
                ],
            }
        }

        client = self.getClient()

        # instead of client.contains_schema(): the check lives in the schema
        # namespace like every other schema call of the v1.x client
        if not client.schema.contains():
            client.schema.create(schema)  # instead of client.create_schema(schema)

        event_uuid = "2a8d56b7-2dd5-4e68-aa40-53c9196aecde"
        city_uuid = "c60505f9-8271-4eec-b998-81d016648d85"

        # instead of client.create(event, "Event", "2a8d56b7-2dd5-4e68-aa40-53c9196aecde")
        client.data_object.create({"acronym": "example"}, "Event", event_uuid)
        client.data_object.create({"name": "Amsterdam"}, "City", city_uuid)

        # presumably gives Weaviate time to store both objects before they are linked
        time.sleep(2.0)
        # instead of client.add_reference(...), see https://www.semi.technology/documentation/weaviate/current/restful-api-references/semantic-kind.html#add-a-cross-reference
        client.data_object.reference.add(city_uuid, "hasEvent", event_uuid)


if __name__ == "__main__":
    # run the tests of this module when the file is executed as a script;
    # the commented-out line below would preset the command line arguments
    #import sys;sys.argv = ['', 'Test.testName']
    unittest.main()

There's no support for automatically deriving a schema from a list of dicts (or other formats) yet. This could, as you mention, be a good convenience feature, so we will add it to Weaviate's feature suggestions!

0
On

The new version of Weaviate is now available (v1.2.1 is the latest release at the time of writing). In this version a lot of things were removed and even more were added. One of the major breaking changes is that actions and things were removed and objects were introduced instead. All the changes and features of Weaviate v1.2 can be used with the weaviate-client Python library v2.3.

Most of the current weaviate-client functionality is explained and demonstrated in this article.

Here are the same unit tests, but for Weaviate v1.2.1 and written using weaviate-client v2.3.1:

import unittest
import weaviate
import time
#import getpass

# Class definition for persons: a single text property "name".
_person_class = {
    "class": "Person",
    "description": "A person such as humans or personality known through culture",
    "properties": [
        {
            "name": "name",
            "description": "The name of this person",
            "dataType": ["text"],
        },
    ],
}

# Class definition for groups: a text "name" plus a "members" property whose
# data type is the Person class.
_group_class = {
    "class": "Group",
    "description": "A set of persons who are associated with each other over some common properties",
    "properties": [
        {
            "name": "name",
            "description": "The name under which this group is known",
            "dataType": ["text"],
        },
        {
            "name": "members",
            "description": "The persons that are part of this group",
            "dataType": ["Person"],
        },
    ],
}

# Schema in the flat "classes" layout (no things/actions sections).
person_schema = {"classes": [_person_class, _group_class]}



class TestWeaviate(unittest.TestCase):
    '''
    Tests against a running Weaviate v1.2.1 instance, written for
    weaviate-client v2.3.1 (client.schema.*, client.data_object.*).

    The inline "instead of" comments name the call of the older client API
    that each statement replaces.

    NEW link to the page:
    https://www.semi.technology/developers/weaviate/current/client-libraries/python.html
    '''

    def setUp(self):
        '''
        set the host and port of the Weaviate instance under test
        '''
        # change these two values when Weaviate runs elsewhere, e.g. host "zeus"
        self.host = "localhost"
        self.port = 8080

    def getClient(self):
        '''
        create a client for the configured host and port

        Returns:
            the weaviate.Client, which is also kept in self.client
        '''
        url = "http://%s:%d" % (self.host, self.port)
        self.client = weaviate.Client(url)
        return self.client

    def tearDown(self):
        '''
        no cleanup is performed
        '''
        pass

    def testRunning(self):
        '''
        make sure weaviate is running
        '''
        w = self.getClient()
        self.assertTrue(w.is_live())
        self.assertTrue(w.is_ready())

    def testWeaviateSchema(self):
        '''
        replace any existing schema with person_schema and create three
        sample objects

        NEW link to the page:
        https://www.semi.technology/developers/weaviate/current/client-libraries/python.html
        '''
        w = self.getClient()

        # When testing it is a good idea to check whether Weaviate already has
        # a schema; creating it a second time would result in an error. After
        # this step the schema known to Weaviate is exactly person_schema.
        if w.schema.contains():
            # delete the existing schema (removes all the data objects too)
            w.schema.delete_all()
        # instead of w.create_schema(person_schema)
        w.schema.create(person_schema)

        # (properties, class name, uuid) of the objects to create
        entries = [
            ({"name": "John von Neumann"}, "Person", "b36268d4-a6b5-5274-985f-45f13ce0c642"),
            ({"name": "Alan Turing"}, "Person", "1c9cd584-88fe-5010-83d0-017cb3fcb446"),
            ({"name": "Legends"}, "Group", "2db436b5-0557-5016-9c5f-531412adf9c6"),
        ]
        for properties, class_name, uid in entries:
            try:
                # instead of w.create(dict,type,uid), see https://www.semi.technology/developers/weaviate/current/restful-api-references/objects.html#create-a-data-object
                w.data_object.create(properties, class_name, uid)
            # ObjectAlreadyExistsException is the correct exception starting with weaviate-client 2.0.0
            except weaviate.exceptions.ObjectAlreadyExistsException:
                print("%s already created" % properties['name'])

    @unittest.skip("disabled: this test used to return before doing any work")
    def testPersons(self):
        '''
        create a schema with a single Person class and four Person objects

        Reported as skipped rather than silently passing; remove the skip
        decorator to run it.
        '''
        w = self.getClient()

        # `actions` and `things` were removed in weaviate v1.0 and
        # weaviate-client v2.0 - now there are only `objects`, so the classes
        # are listed directly. "cardinality" was removed as well.
        schema = {
            "classes": [
                {
                    "class": "Person",
                    "description": "A person such as humans or personality known through culture",
                    "properties": [
                        {
                            "dataType": ["text"],
                            "description": "The name of this person",
                            "name": "name",
                        }
                    ],
                }
            ]
        }
        # instead of w.create_schema(schema)
        w.schema.create(schema)

        person_names = [
            "Andrew S. Tanenbaum",
            "Alan Turing",
            "John von Neumann",
            "Tim Berners-Lee",
        ]
        for person_name in person_names:
            # instead of w.create_thing({"name": ...}, "Person")
            w.data_object.create({"name": person_name}, "Person")

    @unittest.skip("disabled: this test used to return before doing any work")
    def testEventSchema(self):
        '''
        replace any existing schema with the Event and City classes, create
        one object of each and add a "hasEvent" reference from the city to
        the event

        Reported as skipped rather than silently passing; remove the skip
        decorator to run it.

        https://stackoverflow.com/a/63077495/1497139
        '''
        # The "things" wrapper, "type": "thing" and "cardinality" were removed
        # in weaviate v1.0 and weaviate-client v2.0.
        schema = {
            "classes": [
                {
                    "class": "Event",
                    "description": "event",
                    "properties": [
                        {
                            "name": "acronym",
                            "description": "acronym",
                            "dataType": ["text"],
                        },
                        {
                            "name": "inCity",
                            "description": "city reference",
                            "dataType": ["City"],
                        },
                    ],
                },
                {
                    "class": "City",
                    "description": "city",
                    "properties": [
                        {
                            "name": "name",
                            "description": "name",
                            "dataType": ["text"],
                        },
                        {
                            "name": "hasEvent",
                            "description": "event references",
                            "dataType": ["Event"],
                        },
                    ],
                },
            ]
        }

        client = self.getClient()

        # This test would fail on a Weaviate instance that already holds a
        # schema from another test: the new schema would not be created and
        # an error would result. So delete the old schema and create a new one.

        # instead of client.contains_schema()
        if client.schema.contains():
            # delete the existing schema (removes all the data objects too)
            client.schema.delete_all()
        # instead of client.create_schema(schema)
        client.schema.create(schema)

        event_uuid = "2a8d56b7-2dd5-4e68-aa40-53c9196aecde"
        city_uuid = "c60505f9-8271-4eec-b998-81d016648d85"

        # instead of client.create(...)
        client.data_object.create({"acronym": "example"}, "Event", event_uuid)
        client.data_object.create({"name": "Amsterdam"}, "City", city_uuid)

        # presumably gives Weaviate time to store both objects before they are linked
        time.sleep(2.0)
        # instead of client.add_reference(...), see https://www.semi.technology/developers/weaviate/current/restful-api-references/objects.html#cross-references
        client.data_object.reference.add(city_uuid, "hasEvent", event_uuid)


if __name__ == "__main__":
    # run the tests of this module when the file is executed as a script;
    # the commented-out line below would preset the command line arguments
    #import sys;sys.argv = ['', 'Test.testName']
    unittest.main()