FIle-1 FILE-2 Name Age Name Age Hiites 21 Hitesh 21 Hardick 11 Hardik 11 Rajes 48 Rajesh 48 Snha 47 Sneha 47
Here i want to match the names and get the best match. below is the code which i have used and i am getting the following error:-
import pandas as pd
from pandas import DataFrame
from fuzzywuzzy import process
import csv
save_file = open('fuzzy_match_results.csv', 'w')
writer = csv.writer(save_file, lineterminator = '\n')
def parse_csv(path):
with open(path,'r') as f:
reader = csv.reader(f, delimiter=',')
for row in reader:
yield row
if __name__ == "__main__":
## Create lookup dictionary by parsing the products csv
data = {}
for row in parse_csv('file1.csv'):
data[row[0]] = row[0]
## For each row in the lookup compute the partial ratio
for row in parse_csv("file2.csv"):
for found, score, matchrow in process.extractOne(row, data, score_cutoff = 60):
if score >= 60:
print('%d%% partial match: "%s" with "%s" ' % (score, row, found))
Digi_Results = [row, score, found]
writer.writerow(Digi_Results)
save_file.close()
Below is the error:-
File "script.py", line 26, in <module>
for found, score, matchrow in process.extractOne(row, data, score_cutoff = 60):
File "/usr/local/lib/python2.7/dist-packages/fuzzywuzzy/process.py", line 245, in extractOne
return max(best_list, key=lambda i: i[1])
File "/usr/local/lib/python2.7/dist-packages/fuzzywuzzy/process.py", line 103, in extractWithoutOrder
processed_query = processor(query)
File "/usr/local/lib/python2.7/dist-packages/fuzzywuzzy/utils.py", line 89, in full_process
string_out = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s)
File "/usr/local/lib/python2.7/dist-packages/fuzzywuzzy/string_processing.py", line 26, in replace_non_letters_non_numbers_with_whitespace
return cls.regex.sub(" ", a_string)
TypeError: expected string or buffer
hitesh@hitesh-VGN-CS25GN-B:~/Desktop$ python script.py
/usr/local/lib/python2.7/dist-packages/fuzzywuzzy/fuzz.py:35: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
Traceback (most recent call last):
File "script.py", line 26, in <module>
for found, score, matchrow in process.extractOne(row, data, score_cutoff = 60):
File "/usr/local/lib/python2.7/dist-packages/fuzzywuzzy/process.py", line 245, in extractOne
return max(best_list, key=lambda i: i[1])
File "/usr/local/lib/python2.7/dist-packages/fuzzywuzzy/process.py", line 103, in extractWithoutOrder
processed_query = processor(query)
File "/usr/local/lib/python2.7/dist-packages/fuzzywuzzy/utils.py", line 89, in full_process
string_out = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s)
File "/usr/local/lib/python2.7/dist-packages/fuzzywuzzy/string_processing.py", line 26, in replace_non_letters_non_numbers_with_whitespace
return cls.regex.sub(" ", a_string)
TypeError: expected string or buffer