Create nested dictionary based on some custom rules

68 Views Asked by At

I have a python dictionary as follows:

# Input mapping: one entry per page image, keyed by "img_folder/<pdf name>-<page number>.png",
# with the text extracted from that page as the value (whitespace is kept as-is).
ip_dict = {
    "img_folder/144-64ee3d9bb7-3.png": "COMMERCIAL PROPERTY ",
    "img_folder/144-64ee3d9bb7-2.png": "CBIC COMMERCIAL ",
    "img_folder/144-64ee3d9bb7-4.png": "CBIC COMMERCIAL GENERAL",
    "img_folder/144-64ee3d9bb7-1.png": "Contractors Bonding",
    "img_folder/144-64ee3d9bb7-5.png": "CBIC",
    "img_folder/Excess-Liability-8.png": "  Energy laswance ",
    "img_folder/144-64ee3d9bb7-0.png": "CONTRACTORS BONDING AND INSURANCE ",
    "img_folder/Excess-Liability-10.png": "  FOLLOWING FORM",
    "img_folder/Excess-Liability-14.png": "  (2) property and",
    "img_folder/Excess-Liability-0.png": "  Energy ",
    "img_folder/Excess-Liability-5.png": "  The additional premium",
    "img_folder/Excess-Liability-3.png": "Ein Enos asurance Maral",
    "img_folder/Excess-Liability-4.png": "  IV. Conditions ",
    "img_folder/Excess-Liability-13.png": "  FOLLOWING FORM ",
    "img_folder/Excess-Liability-12.png": "  FOLLOWING FORM EXCESS",
    "img_folder/Excess-Liability-9.png": "  Surplus Lines",
    "img_folder/Excess-Liability-11.png": "  ALL OTHER TERMS",
    "img_folder/Excess-Liability-2.png": "  Il. Limit of",
    "img_folder/Excess-Liability-6.png": "  (G) Notice of",
    "img_folder/Excess-Liability-7.png": "Ss So Ss   The ",
    "img_folder/Excess-Liability-1.png": "eee ee ee"
}

It contains text extracted from pages of 2 different pdf files (144-64ee3d9bb7 and Excess-Liability). I want to convert the above dictionary into a nested dictionary where the global key is the pdf name and the nested dictionary is the same as above. So the output would look like the following:

# Expected result: outer keys are "<pdf name>.png"; each inner dictionary holds
# the original page entries for that pdf, with keys and values unchanged.
op_dict = {
    "144-64ee3d9bb7.png": {
        "img_folder/144-64ee3d9bb7-3.png": "COMMERCIAL PROPERTY ",
        "img_folder/144-64ee3d9bb7-2.png": "CBIC COMMERCIAL ",
        "img_folder/144-64ee3d9bb7-4.png": "CBIC COMMERCIAL GENERAL",
        "img_folder/144-64ee3d9bb7-1.png": "Contractors Bonding",
        "img_folder/144-64ee3d9bb7-5.png": "CBIC",
        "img_folder/144-64ee3d9bb7-0.png": "CONTRACTORS BONDING AND INSURANCE ",
    },
    # The pdf name is spelled with a hyphen, exactly as in the page keys.
    "Excess-Liability.png": {
        "img_folder/Excess-Liability-8.png": "  Energy laswance ",
        "img_folder/Excess-Liability-10.png": "  FOLLOWING FORM",
        "img_folder/Excess-Liability-14.png": "  (2) property and",
        "img_folder/Excess-Liability-0.png": "  Energy ",
        "img_folder/Excess-Liability-5.png": "  The additional premium",
        "img_folder/Excess-Liability-3.png": "Ein Enos asurance Maral",
        "img_folder/Excess-Liability-4.png": "  IV. Conditions ",
        "img_folder/Excess-Liability-13.png": "  FOLLOWING FORM ",
        "img_folder/Excess-Liability-12.png": "  FOLLOWING FORM EXCESS",
        "img_folder/Excess-Liability-9.png": "  Surplus Lines",
        "img_folder/Excess-Liability-11.png": "  ALL OTHER TERMS",
        "img_folder/Excess-Liability-2.png": "  Il. Limit of",
        "img_folder/Excess-Liability-6.png": "  (G) Notice of",
        "img_folder/Excess-Liability-7.png": "Ss So Ss   The ",
        "img_folder/Excess-Liability-1.png": "eee ee ee",
    },
}

I tried the below logic but it is not working as expected:

# Attempt from the question: the grouping key is the bare file name, which still
# carries the "-<page>.png" suffix, so every page ends up in a group of its own.
op_dict = {}
for key, value in ip_dict.items():
    doc_name = key.rsplit("/", 1)[-1]
    op_dict.setdefault(doc_name, {})[key] = value

Any help is appreciated!

2

There are 2 best solutions below

0
On

You also need to remove the number at the end and add the extension to the file name.

# Group pages under "<pdf name>.png": drop the folder, then cut everything from
# the last "-" onwards (the page number and extension) and re-append ".png".
op_dict = {}
for key, value in ip_dict.items():
    file_name = key.rsplit("/", 1)[-1]
    # rpartition yields the text before the last "-" (empty if there is none).
    pdf_name = file_name.rpartition("-")[0]
    op_dict.setdefault(f"{pdf_name}.png", {})[key] = value
0
On

As I understand it, you need to remove the unique number from the document name. You can do it as follows (if you need to keep the file extension):

import re

# Remove the trailing "-<number>" while keeping the extension: the captured
# group (the ".ext" part) is substituted back in place of the whole match.
op_dict = {}
for key, value in ip_dict.items():
    file_name = key.rsplit("/", 1)[-1]
    doc_name = re.sub(r"-\d+(\.\w+)$", r"\1", file_name)
    op_dict.setdefault(doc_name, {})[key] = value

In this case you will get the following names: 144-64ee3d9bb7.png, Excess-Liability.png

Or, if you need only the name (without the file extension):

import re

# Keep only the text in front of the trailing "-<number>.<ext>" suffix and use
# it as the group name; names without such a suffix are used unchanged.
op_dict = {}
for key, value in ip_dict.items():
    file_name = key.rsplit("/", 1)[-1]
    doc_name = re.split(r"-\d+\.\w+$", file_name, maxsplit=1)[0]
    op_dict.setdefault(doc_name, {})[key] = value

In this case you will get the following names: 144-64ee3d9bb7, Excess-Liability