python-docx: merging/deleting multiple paragraphs and replacing them with one

19 Views Asked by At

I have working code that splits/merges multiple paragraphs into one; it works correctly:

from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT


class DocxRefactor:
    """Collect the bold text of centre-aligned paragraphs into merged blocks.

    Each finished block is stored in ``block_paragraphs`` and printed.
    None of the methods here modify the loaded document; ``save`` writes
    it out as loaded.
    """

    def __init__(self, filename):
        """Open *filename* with ``Document`` and reset the accumulation state."""
        self.document = Document(filename)
        self.block_paragraphs = []  # finished blocks, in document order
        self.current_block = ""  # text of the block being accumulated
        self.last_alignment = None  # alignment of the previous paragraph

    def append_block(self):
        """Store and print the pending block if it has text, then clear it."""
        merged = self.current_block.strip()
        if merged:
            self.block_paragraphs.append(merged)
            print(merged)
        self.current_block = ""

    def process_paragraphs(self):
        """Feed every paragraph through ``process_paragraph`` and flush the tail."""
        for paragraph in self.document.paragraphs:
            self.process_paragraph(paragraph)
        # Whatever is still pending after the last paragraph is a block too.
        self.append_block()

    def process_paragraph(self, paragraph):
        """Accumulate a centred paragraph, or close the block when centring ends."""
        is_empty = not any(run.text.strip() for run in paragraph.runs)
        alignment = paragraph.alignment
        centered = WD_PARAGRAPH_ALIGNMENT.CENTER

        if alignment == centered:
            self.process_center_paragraph(paragraph, is_empty)
        elif self.last_alignment == centered and self.current_block:
            # First non-centred paragraph after a centred one ends the block.
            self.append_block()

        self.last_alignment = alignment

    def process_center_paragraph(self, paragraph, is_empty):
        """Treat a blank centred paragraph as a separator; otherwise add its runs."""
        if is_empty:
            self.append_block()
            return
        for run in paragraph.runs:
            self.process_run(run)

    def process_run(self, run):
        """Append a bold, non-blank run's text to the pending block.

        A line break directly followed by ``(`` is folded into a space.
        Any other run whose text ends with a line break closes the block.
        """
        if not run.bold:
            return
        raw = run.text
        trimmed = raw.strip()
        if not trimmed:
            return

        if "\n(" in trimmed:
            self.current_block += trimmed.replace('\n(', " (") + " "
            return

        self.current_block += raw + " "
        if raw.endswith("\n"):
            self.append_block()

    def save(self, filename):
        """Write the loaded document to *filename*."""
        self.document.save(filename)


if __name__ == "__main__":
    # Load the source file, print its merged blocks, then write the loaded
    # document back out under a new name.
    source_path = 'original.docx'
    target_path = 'with_soft_return.docx'

    refactored = DocxRefactor(source_path)
    refactored.process_paragraphs()
    refactored.save(target_path)

Now I am trying to update the code so that, rather than printing, it modifies the document by removing those paragraphs and adding a single merged one. Based on some documentation, this does not seem to be trivial - https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907

So I am planning to create a copy of the document, but with this approach I lose all the formatting of the existing document -

from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT


class DocxRefactor:
    """Rebuild a document, merging runs of centre-aligned paragraphs into one.

    Non-centred paragraphs are copied run by run into a fresh document.
    The text of consecutive centred paragraphs is joined with spaces and
    written as a single centred paragraph.
    """

    def __init__(self, filename):
        """Open *filename* as the source and create an empty target document."""
        self.document = Document(filename)
        self.new_document = Document()
        self.current_block = ""  # text of the centred block being accumulated
        self.last_alignment = None  # alignment of the previous paragraph

    def add_run(self, paragraph, run):
        """Add a copy of *run* to *paragraph*, carrying over basic formatting."""
        copy = paragraph.add_run(run.text)
        for attr in ("bold", "italic", "underline"):
            setattr(copy, attr, getattr(run, attr))

        source_font = run.font
        target_font = copy.font
        if source_font.size:
            target_font.size = Pt(source_font.size.pt)
        target_font.name = source_font.name
        target_font.color.rgb = source_font.color.rgb
        # TODO: copy over any other formatting details you care about

    def append_block(self):
        """Write the pending centred text as one paragraph, then clear it."""
        merged = self.current_block.strip()
        if merged:
            paragraph = self.new_document.add_paragraph()
            paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
            paragraph.add_run(merged)
        self.current_block = ""

    def process_paragraphs(self):
        """Process every source paragraph and flush any trailing centred block."""
        for paragraph in self.document.paragraphs:
            self.process_paragraph(paragraph)
        self.append_block()

    def process_paragraph(self, paragraph):
        """Accumulate a centred paragraph; otherwise flush and copy it over."""
        alignment = paragraph.alignment
        if alignment == WD_PARAGRAPH_ALIGNMENT.CENTER:
            self.process_center_paragraph(paragraph)
        else:
            # A non-centred paragraph ends the current centred block.
            self.append_block()
            self.append_other_paragraph(paragraph)

        self.last_alignment = alignment

    def process_center_paragraph(self, paragraph):
        """Append the paragraph's text and a trailing space to the pending block."""
        self.current_block += paragraph.text + " "

    def append_other_paragraph(self, paragraph):
        """Copy *paragraph* into the target document, one run at a time."""
        target = self.new_document.add_paragraph()
        target.alignment = paragraph.alignment
        for run in paragraph.runs:
            self.add_run(target, run)

    def save(self, filename):
        """Write the rebuilt document to *filename*."""
        self.new_document.save(filename)


if __name__ == "__main__":
    # Read the source file, build the merged copy, then write that copy out.
    source_path = 'original.docx'
    target_path = 'with_soft_return.docx'

    refactored = DocxRefactor(source_path)
    refactored.process_paragraphs()
    refactored.save(target_path)

Any suggestions on how to merge multiple paragraphs into one?

0

There are 0 best solutions below