Getting unexpected results in matching event names using BERT embeddings


I'm trying to match event names using BERT embeddings from the transformers library.

from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from scipy.spatial.distance import cosine

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Encode a piece of text into a single vector by mean-pooling BERT's
# last hidden state over all tokens
def get_word_embedding(word):
    tokens = tokenizer(word, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**tokens)
    # Average over the sequence dimension, then flatten to a 1-D vector
    # so scipy's cosine() accepts it
    word_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return word_embedding

# Calculate the cosine similarity between two word embeddings
def calculate_similarity(word1, word2):
    embedding1 = get_word_embedding(word1)
    embedding2 = get_word_embedding(word2)
    similarity = 1 - cosine(embedding1, embedding2)
    return similarity

word1 = "17 May Constitution Day Off"
word2 = "Constitution Day Off"
similarity_score = calculate_similarity(word1, word2)
print(f"Similarity between '{word1}' and '{word2}': {similarity_score}")

Similarity between '17 May Constitution Day Off' and 'Constitution Day Off': 0.8122310638427734

word1 = "Ascension Day Off"
word2 = "Constitution Day Off"
similarity_score = calculate_similarity(word1, word2)
print(f"Similarity between '{word1}' and '{word2}': {similarity_score}")

Similarity between 'Ascension Day Off' and 'Constitution Day Off': 0.8288278579711914

From a human perspective, the two constitution-related holiday names should have higher contextual similarity than the Ascension/Constitution pair, but the scores come out the other way around. Any idea how I can transform the input to get more realistic results?
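
To make the question concrete, this is roughly the kind of input transformation I have in mind: dropping boilerplate words that nearly every event name shares, so the distinguishing words drive the similarity. This is only a sketch; the BOILERPLATE set and the strip_boilerplate helper are illustrative names of mine, and I haven't verified that this actually fixes the ranking.

# Drop words that appear in almost every event name before embedding,
# so the comparison focuses on the distinguishing tokens
BOILERPLATE = {"day", "off", "holiday"}

def strip_boilerplate(name):
    kept = [w for w in name.split() if w.lower() not in BOILERPLATE]
    # Fall back to the original name if everything was stripped
    return " ".join(kept) if kept else name

similarity_score = calculate_similarity(
    strip_boilerplate("17 May Constitution Day Off"),
    strip_boilerplate("Constitution Day Off"),
)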
