highlight text using python-docx

15.2k Views Asked by At

I want to highlight text in a .docx file and save the result to another file. Here is my code:

from docx import Document

def highlight_text(filename):
    """Try to highlight every 'vehicle' in the document and save it as 't2.docx'.

    Opens *filename* with ``Document``, rewrites each run whose text contains
    'vehicle', saves the result to 't2.docx' and returns 1.

    NOTE(review): as posted, this does not produce any highlighting (that is
    the question being asked). Per the accepted answer, ``Run.add_text()``
    returns a ``_Text`` object rather than a run, and highlighting is a font
    attribute (``run.font.highlight_color``), not something set on the text.
    """

    doc = Document(filename)
    for p in doc.paragraphs:
        # Cheap paragraph-level filter before inspecting individual runs.
        if 'vehicle' in p.text:
            inline = p.runs
            # print(inline)
            # Loop added to work with runs (strings with same style)
            for i in range(len(inline)):
                # print((inline[i].text).encode('ascii'))
                if 'vehicle' in inline[i].text:
                    # Pieces of the run text around each 'vehicle' occurrence.
                    x=inline[i].text.split('vehicle')
                    inline[i].clear()
                    # NOTE(review): only x[0] .. x[-2] are added back; x[-1]
                    # (the text after the last 'vehicle') is never re-added,
                    # so it is dropped from the run.
                    for j in range(len(x)-1):
                        inline[i].add_text(x[j])
                        y=inline[i].add_text('vehicle')
                        # NOTE(review): y is the object returned by add_text(),
                        # not a run, and 'YELLOW' is a plain string; per the
                        # question, no highlight appears in the output.
                        y.highlight_color='YELLOW'
            # print (p.text)

    # The output path is hard-coded; the input file is not overwritten.
    doc.save('t2.docx')
    return 1
if __name__ == '__main__':
    # Script entry point: run the highlighter on the sample input file.
    highlight_text('t1.docx')

The word is not getting highlighted. What am I doing wrong?

2

There are 2 best solutions below

4
scanny On BEST ANSWER

Highlighting is an attribute of a font, not a run directly. Also, Run.add_text() returns a _Text object, not a run.

from docx.enum.text import WD_COLOR_INDEX

# Highlight every run that contains 'vehicle'. `document` is expected to be an
# already-opened python-docx Document. The highlight is a font property and
# therefore covers the whole run, not just the matched word.
for paragraph in document.paragraphs:
    if 'vehicle' not in paragraph.text:
        continue
    for run in paragraph.runs:
        if 'vehicle' not in run.text:
            continue
        parts = run.text.split('vehicle')
        run.clear()
        # Re-add each piece followed by the word that was split out.
        for part in parts[:-1]:
            run.add_text(part)
            run.add_text('vehicle')
        # Keep the text that follows the last occurrence; without this the
        # tail of the run would be silently deleted from the document.
        run.add_text(parts[-1])
        run.font.highlight_color = WD_COLOR_INDEX.YELLOW

Also, a highlight is applied to the entire run, so you need to create separate runs for the text before "vehicle", the word "vehicle" itself, and the text after "vehicle".

Also, there's no guarantee that a given word appears completely within a single run; runs often split within a word. So you'll need to be more sophisticated in your approach to handle the general case.

So there's quite a bit more work to do here, but this should get you seeing at least some yellow highlighting :)

1
Daniel Eriks Chacon Baquerizo On

This is my solution to this problem. It works with multiple tokens or sequences of characters. First we need to split each run into multiple runs; in my case I split it into 4 runs using a regex so that a token can be highlighted and commented. Then a second function highlights the token and adds the comment.

import docx
import re
from docx.enum.text import WD_COLOR_INDEX
# Open the document to process.
# NOTE(review): `yourwordPath` is not defined anywhere in this snippet;
# presumably a placeholder for the path to the .docx file. Confirm before use.
doc=docx.Document(yourwordPath)
def split_text(text, word):
    """Split *text* around an occurrence of *word*.

    The match must start at a regex word boundary (``\\b``). Because the
    leading group is greedy, the LAST such occurrence in *text* is the one
    that is split out.

    Args:
        text: The string to search.
        word: The literal token to look for. It is escaped before being put
            into the pattern, so regex metacharacters such as ``.`` or ``+``
            are matched literally.

    Returns:
        A ``(before, word, after)`` tuple of strings, or ``None`` when *word*
        does not occur at a word boundary in *text*.
    """
    # re.escape: the token is data, not a pattern. Unescaped, "1.5" would
    # also match "125" and "a+b" would never match itself.
    pattern = re.compile(r'([\S\s]*)(\b{})([\S\s]*)'.format(re.escape(word)))
    match = pattern.search(text)
    return match.groups() if match else None
def split_Runs(doc,word):
    """Rebuild paragraphs containing *word* so the token sits in its own run.

    For every paragraph whose text contains *word*, the paragraph text is
    emptied and its runs are re-added from the text of the original runs.
    A run containing *word* is replaced by four runs: the text before the
    token, an empty run, the token itself, and the text after it. Only the
    text of each original run is carried over to the new runs.

    Args:
        doc: Document-like object exposing ``paragraphs``.
        word: Token to isolate.

    Returns:
        The same *doc* object, modified in place.
    """
    for p in doc.paragraphs:
        if word not in p.text:
            continue
        # Snapshot the runs before emptying the paragraph; their text is
        # still read below to rebuild the content.
        original_runs = p.runs
        p.text = ""
        for r in original_runs:
            # split_text returns None when the token is present but does not
            # start at a word boundary (e.g. "A10231244"); previously that
            # None was unpacked and raised TypeError.
            parts = split_text(r.text, word) if word in r.text else None
            if parts is None:
                p.add_run(r.text)
                continue
            # Use a separate name so the `word` parameter is not rebound.
            before, token, after = parts
            p.add_run(before)
            # Empty run directly before the token; presumably the anchor that
            # style_Token comments on via runs[i-1]. Confirm.
            p.add_run()
            p.add_run(token)
            p.add_run(after)
    return doc
    
def style_Token(doc,word,comment=True):
    """Highlight every run containing *word* in yellow and optionally comment it.

    Args:
        doc: Document-like object exposing ``paragraphs``.
        word: Token to look for in each run's text.
        comment: When true, also attach a comment (author 'BOT CONFRONT') to
            the run immediately before the matching run. The comment text is
            Spanish for "<word> is not found in the document".

    Returns:
        The same *doc* object, modified in place.
    """
    for p in doc.paragraphs:
        # Build the run list once per paragraph instead of on every access.
        runs = p.runs
        for i, r in enumerate(runs):
            if word not in r.text:
                continue
            r.font.highlight_color = WD_COLOR_INDEX.YELLOW
            if comment:
                # Anchor the comment on the preceding run. When the token is
                # in the first run, i - 1 would be -1 and wrap around to the
                # paragraph's LAST run, so fall back to the token run itself.
                anchor = runs[i - 1] if i > 0 else r
                anchor.add_comment(f'{word} No se encuentra en el documento',author='BOT CONFRONT')
    return doc
# Tokens that will be highlighted and commented in the document.
nums = ['10231244', '48023851', '20104802385']

# Pass 1: isolate every token in its own run. This runs to completion for
# all tokens before any styling, because split_Runs rebuilds paragraphs.
for num in nums:
    doc = split_Runs(doc, num)

# Pass 2: highlight each token run and attach the comment.
for num in nums:
    doc = style_Token(doc, num, comment=True)

# Write the result back to the same path the document was loaded from.
doc.save(yourwordPath)