When I want to add a new point to the collection, should I create it from scratch?

111 Views Asked by At

I started using Qdrant today, and I am very new to this subject.

I am working on image similarity search and have added my images to Qdrant. However, when I later try to add more images to my database, it deletes the existing collection and creates a new one, so I lose my old images.

Can I add as many images as I want to my existing collection at any time? Can you help me with this? I am sharing my code below. Thank you for your interest.

# Question code: embeds every file in the "Images" directory with ResNet-50
# and uploads the vectors plus payload to a Qdrant collection.
class ImageEmbedding:

# NOTE(review): this method is not indented under the class as pasted.
def image_to_database(self):

    # Build "<base_directory>/<file name>" paths for every directory entry.
    base_directory = "Images"
    all_image_urls = os.listdir(base_directory)

    sample_image_urls = all_image_urls
    sample_image_urls = list(map(lambda item: f"{base_directory}/{item}",sample_image_urls))

    # One payload row per image; every row gets the constant model_id 2.
    payloads = DataFrame.from_records({"image_url": sample_image_urls})
    payloads["model_id"] = 2

    # Open each image once here for the model (resize_image opens them again).
    images = list(map(lambda el:Image.open(el),payloads["image_url"]))

    target_width = 256

    def resize_image(image_url):
        # Resize the image at image_url to a width of target_width.
        pil_image = Image.open(image_url)
        image_aspect_ratio = pil_image.width / pil_image.height
        # NOTE(review): the ratio is width/height, so multiplying gives a
        # height of target_width * w / h; keeping the proportions would need
        # target_width / ratio -- confirm which was intended.
        resized_pil_image = pil_image.resize([target_width,math.floor(target_width * image_aspect_ratio)])
        return resized_pil_image

    def convert_image_to_base64(pil_image):
        # Encode the image as JPEG in memory and return it as a base64 string.
        image_data = BytesIO()
        pil_image.save(image_data,format="JPEG")
        base64_string = base64.b64encode(image_data.getvalue()).decode("utf-8")
        return base64_string

    # Store a base64 JPEG of each resized image in the payload.
    resized_images = list(map(lambda el: resize_image(el), sample_image_urls))
    base64_strings = list(map(lambda el: convert_image_to_base64(el), resized_images))
    payloads["base64"] = base64_strings

    processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
    model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

    # The classifier logits are used as the embedding vectors.
    inputs = processor(images,return_tensors="pt",)
    outputs = model(**inputs)
    embeddings = outputs.logits

    # Vector size for the collection, taken from the first embedding.
    embedding_length = len(embeddings[0])

    load_dotenv()

    # Connection settings come from the environment.
    qclient = QdrantClient(
        url = os.getenv('QDRANT_DB_URL'),
        api_key = os.getenv('QDRANT_API_KEY'),
    )

    collection_name = "die_models_images"

    # NOTE: recreate_collection deletes any existing collection with this name
    # before creating it, so points uploaded by earlier runs are lost on
    # every call.
    collection = qclient.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=embedding_length,
            distance=Distance.COSINE
        )
    )

    payload_dicts = payloads.to_dict(orient="records")

    # NOTE(review): ids are the list positions 0..n-1 and restart at 0 on each
    # run; presumably a re-used id replaces the earlier point -- confirm.
    records = [
        models.Record(
            id=idx,
            payload=payload_dicts[idx],
            vector=embeddings[idx]
        )
        for idx, _ in  enumerate(payload_dicts)
    ]

    qclient.upload_records(
        collection_name=collection_name,
        records=records
    )  
2

There are 2 best solutions below

2
On

You're using recreate_collection, which will delete and create the specified collection every time it is invoked.

Instead, you probably want to use create_collection and only call it once:

# Create the collection once; unlike recreate_collection this does not
# delete an existing collection first.
vector_params = VectorParams(size=embedding_length, distance=Distance.COSINE)
collection = qclient.create_collection(
    collection_name=collection_name,
    vectors_config=vector_params,
)

You must also make sure to use unique IDs for each record you insert.

0
On

You are using recreate_collection instead of create_collection. Do not call create_collection more than once for the same collection (or, as a workaround, wrap the call in a try/except statement), because it will throw an exception if a collection with that name already exists. I would suggest you do it this way:

import datetime

class ImageEmbedding:
    """Embed local images with ResNet-50 and store them in a Qdrant collection."""

    def image_to_database(self):
        """Embed every file in ``Images/`` and append the results to Qdrant.

        The collection is created only when it does not exist yet, and every
        point gets a fresh UUID, so repeated calls add to the collection
        instead of replacing it or overwriting earlier points.

        The connection is configured from the ``QDRANT_DB_URL`` and
        ``QDRANT_API_KEY`` environment variables. Returns ``None``; does
        nothing when the directory is empty.
        """
        # Function-scope import so the snippet stays self-contained.
        import uuid

        base_directory = "Images"
        sample_image_urls = [
            f"{base_directory}/{item}" for item in os.listdir(base_directory)
        ]
        if not sample_image_urls:
            # Nothing to embed; indexing embeddings[0] below would fail.
            return

        payloads = DataFrame.from_records({"image_url": sample_image_urls})
        payloads["model_id"] = 2

        images = [Image.open(image_url) for image_url in payloads["image_url"]]

        target_width = 256

        def resize_image(image_url):
            """Return the image scaled to ``target_width``, keeping proportions."""
            pil_image = Image.open(image_url)
            image_aspect_ratio = pil_image.width / pil_image.height
            # ratio is width/height, so the matching height is width / ratio;
            # clamp to 1 because resize() rejects a zero-sized dimension.
            target_height = max(1, math.floor(target_width / image_aspect_ratio))
            return pil_image.resize([target_width, target_height])

        def convert_image_to_base64(pil_image):
            """Return ``pil_image`` encoded as a base64 JPEG string."""
            image_data = BytesIO()
            # JPEG cannot store alpha or palette modes, so normalise to RGB.
            pil_image.convert("RGB").save(image_data, format="JPEG")
            return base64.b64encode(image_data.getvalue()).decode("utf-8")

        payloads["base64"] = [
            convert_image_to_base64(resize_image(image_url))
            for image_url in sample_image_urls
        ]

        processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
        model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

        # The classifier logits are used as the embedding vectors.
        inputs = processor(images, return_tensors="pt")
        outputs = model(**inputs)
        embeddings = outputs.logits

        embedding_length = len(embeddings[0])

        qclient = QdrantClient(
            url=os.getenv('QDRANT_DB_URL'),
            api_key=os.getenv('QDRANT_API_KEY'),
        )

        collection_name = "die_models_images"

        # create_collection raises when the collection already exists, so only
        # create it on the first run and reuse it afterwards.
        existing_collections = {
            description.name
            for description in qclient.get_collections().collections
        }
        if collection_name not in existing_collections:
            qclient.create_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(
                    size=embedding_length,
                    distance=Distance.COSINE
                )
            )

        payload_dicts = payloads.to_dict(orient="records")

        # A random UUID per point keeps ids unique across runs, so new uploads
        # never collide with points that are already stored.
        records = [
            models.Record(
                id=str(uuid.uuid4()),
                payload=payload,
                vector=embedding.tolist(),
            )
            for payload, embedding in zip(payload_dicts, embeddings)
        ]

        qclient.upload_records(
            collection_name=collection_name,
            records=records
        )