Extracting replies to comments in a PDF file and sorting them

31 Views Asked by At

I'm working on a project in which I need to extract the comments from a PDF file and sort them by their creation date and their replies (if there are any). Currently I'm using PdfReader from the pypdf library, which works great for extracting the comments, but I'm identifying the replies based on their positions (using "/Rect" to compare rectangles and decide whether a comment is a reply to a specific comment). The method seems to work fine most of the time, but I was wondering if there's a smarter way of identifying the replies so that I can be more confident in the procedure. I tried to use "/P" from the properties returned by PdfReader, but it seems to be identical for all comments on the same page.

Thanks a lot!

This is how I make my main df to process the properties:

    # Collect every annotation dictionary of the PDF into one DataFrame.
    # Each output row is one annotation: column 0 holds the 1-based page
    # number, the remaining columns are the annotation's own keys.
    src = 'test.pdf'
    input1 = PdfReader(src)
    nPages = len(input1.pages)
    page_frames = []
    for i in range(nPages):
        page0 = input1.pages[i]
        # Pages without annotations simply have no '/Annots' entry.  Test for
        # it explicitly instead of wrapping everything in a bare ``except``,
        # which would also hide genuine parsing or pandas errors.
        if '/Annots' not in page0:
            continue
        annotation = [annot.get_object() for annot in page0['/Annots']]
        if not annotation:
            continue
        page = pd.DataFrame([i + 1] * len(annotation))
        annotation = pd.DataFrame(annotation)
        df_temp = pd.concat([page, annotation], axis=1)
        page_frames.append(df_temp)
    # Concatenate once at the end rather than growing the frame page by page;
    # fall back to an empty frame when the document has no annotations at all.
    if page_frames:
        df_comments = pd.concat(page_frames, ignore_index=True)
    else:
        df_comments = pd.DataFrame()

and this is how I extract the replies and sort them (there should be a smarter way!):

    # Group the annotations of ``cmnt_df`` into comment threads, page by page.
    #
    # Result: ``cmnt_reply`` lists, for every page, each head comment followed
    # directly by the annotations recognised as its replies.  A reply row has
    # its text moved from the 'comment' column to the 'reply' column.
    #
    # Heuristic: an annotation counts as a reply to the current head comment
    # when at least two of its four rectangle coordinates agree with the
    # head's within a small relative tolerance.
    # NOTE(review): the PDF specification defines an /IRT ("in reply to")
    # entry on markup annotations; if the extracted annotation dictionaries
    # carry it, it is presumably a more reliable signal than position.

    # creating lists for comments and replies
    cmnt_list = []
    reply_list = []
    page_final = []
    comment_final = []
    Author_final = []
    Creation_date_final = []
    data2 = {'page': [],
            'loc_x0': [],
            'loc_y0': [],
            'loc_x1': [],
            'loc_y1': [],
            'comment': [],
            'reply': [],
            'Author': [],
            'Creation_date': []}
    cmnt_df_dummy = pd.DataFrame(data2)
    cmnt_reply = pd.DataFrame(data2)

    rect_columns = ['loc_x0', 'loc_y0', 'loc_x1', 'loc_y1']
    tolerance_percent = 0.002   # max relative deviation (in %) to call two coordinates equal
    min_matching_coords = 2     # coordinates that must agree for a row to be a reply

    unique, counts = np.unique(cmnt_df["page"], return_counts=True)
    for page_no, n_on_page in zip(unique, counts):
        page_rows = cmnt_df[cmnt_df['page'] == page_no]
        if n_on_page == 1:
            # A page with a single comment cannot contain a reply.
            cmnt_reply = pd.concat([cmnt_reply, page_rows])
            continue

        # Working copy of this page's annotations; rows are removed from it
        # as soon as they have been assigned to a thread.
        cmnt_df_dummy = pd.concat([cmnt_df_dummy, page_rows])
        # Look the positions up by name instead of hard-coding 5 and 6.
        comment_col = cmnt_df_dummy.columns.get_loc('comment')
        reply_col = cmnt_df_dummy.columns.get_loc('reply')

        # Run until the working copy is empty.  Stopping at one remaining row
        # would silently lose the last comment of a page whenever the round
        # before it found no replies.
        while len(cmnt_df_dummy) > 0:
            # The first remaining row is the head comment of the next thread.
            cmnt_reply = pd.concat([cmnt_reply, cmnt_df_dummy.iloc[0:1]])
            remove_list = [0]
            for k in range(1, len(cmnt_df_dummy)):
                matching = 0
                for col in rect_columns:
                    head_val = cmnt_df_dummy[col].iloc[0]
                    other_val = cmnt_df_dummy[col].iloc[k]
                    if head_val == 0:
                        # Avoid dividing by zero: a coordinate of exactly 0
                        # only matches another coordinate of exactly 0.
                        is_close = other_val == 0
                    else:
                        is_close = abs((head_val - other_val) / head_val * 100) < tolerance_percent
                    if is_close:
                        matching += 1
                if matching >= min_matching_coords:
                    # This is a reply: move its text into the 'reply' column
                    # and emit it right after its head comment.
                    cmnt_df_dummy.iat[k, reply_col] = cmnt_df_dummy.iat[k, comment_col]
                    cmnt_df_dummy.iat[k, comment_col] = None
                    cmnt_reply = pd.concat([cmnt_reply, cmnt_df_dummy.iloc[k:k + 1]])
                    remove_list.append(k)
            # Remove the head and its replies by position, so duplicate index
            # labels cannot cause extra rows to be dropped.
            keep = np.ones(len(cmnt_df_dummy), dtype=bool)
            keep[remove_list] = False
            cmnt_df_dummy = cmnt_df_dummy.iloc[keep].copy()

        # Start the next page with an empty working copy.
        cmnt_df_dummy = pd.DataFrame(data2)
0

There are 0 best solutions below