I have working code that splits/merges multiple paragraphs into one block, and it works correctly:
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
class DocxRefactor:
    """Collect bold text from centered paragraphs into merged text blocks.

    Text of consecutive centered paragraphs is accumulated in
    ``current_block``.  A block is closed by an empty centered paragraph, by
    the first non-centered paragraph after a centered one, by a run whose
    text ends with a newline, or at the end of the document.  Closed blocks
    are appended to ``block_paragraphs`` and printed.

    The loaded document itself is never modified; ``save`` writes it back
    out unchanged.
    """

    def __init__(self, filename):
        """Load *filename* with python-docx and reset the accumulator state."""
        self.document = Document(filename)
        self.block_paragraphs = []
        self.current_block = ""
        self.last_alignment = None

    def append_block(self):
        """Close the current block: store and print it if it has any text."""
        block = self.current_block.strip()
        if block:
            self.block_paragraphs.append(block)
            print(block)
        # Whitespace-only leftovers are discarded as well.
        self.current_block = ""

    def process_paragraphs(self):
        """Feed every body paragraph through ``process_paragraph``."""
        for paragraph in self.document.paragraphs:
            self.process_paragraph(paragraph)
        # Flush whatever is still pending after the last paragraph.
        self.append_block()

    def process_paragraph(self, paragraph):
        """Accumulate a centered paragraph, or close the block on any other."""
        is_empty = not any(run.text.strip() for run in paragraph.runs)
        if paragraph.alignment == WD_PARAGRAPH_ALIGNMENT.CENTER:
            self.process_center_paragraph(paragraph, is_empty)
        elif (
            self.last_alignment == WD_PARAGRAPH_ALIGNMENT.CENTER
            and self.current_block
        ):
            # First non-centered paragraph after a centered run ends the block.
            self.append_block()
        self.last_alignment = paragraph.alignment

    def process_center_paragraph(self, paragraph, is_empty):
        """Handle one centered paragraph.

        An empty paragraph acts as a separator and closes the current block;
        otherwise each run is passed to ``process_run``.
        """
        if is_empty:
            self.append_block()
            return
        for run in paragraph.runs:
            self.process_run(run)

    def process_run(self, run):
        """Append the text of a bold, non-blank run to the current block.

        A line break directly followed by "(" is replaced by " (" so that a
        parenthesised continuation stays on the same line.  Non-bold and
        blank runs are ignored.
        """
        stripped = run.text.strip()
        if not (run.bold and stripped):
            return
        if "\n(" in stripped:
            self.current_block += stripped.replace("\n(", " (") + " "
        else:
            self.current_block += run.text + " "
        # NOTE(review): the nesting of this check was not recoverable from the
        # flattened source; it is applied to bold, non-blank runs only --
        # confirm that non-bold runs ending in a newline should not flush.
        if run.text.endswith("\n"):
            self.append_block()

    def save(self, filename):
        """Write the loaded document to *filename*."""
        self.document.save(filename)
if __name__ == "__main__":
    # Load the source file, collect (and print) the merged blocks, then write
    # the document back out under a new name.
    refactored = DocxRefactor('original.docx')
    refactored.process_paragraphs()
    refactored.save('with_soft_return.docx')
Now I am trying to update the code so that, rather than printing, it modifies the document by removing those paragraphs and adding a single merged one. Based on the following discussion, this does not seem to be trivial: https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907
So I am planning to create a copy of the document instead, but with this approach I lose all the formatting of the existing document:
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
class DocxRefactor:
    """Copy a document, merging consecutive centered paragraphs into one.

    Paragraphs of the source document are replayed into ``new_document``.
    Non-centered paragraphs are copied one-to-one; each group of consecutive
    centered paragraphs becomes a single centered paragraph whose runs keep
    the run-level formatting copied by ``add_run``.
    """

    def __init__(self, filename):
        """Load *filename* and prepare an empty target document."""
        self.document = Document(filename)
        # NOTE: Document() without an argument starts from python-docx's
        # built-in default template, so anything defined at document level in
        # the source (styles, section/page setup, ...) is not carried over;
        # only what add_run copies explicitly survives.
        self.new_document = Document()
        self.current_block = ""
        # Source paragraphs (with text) that make up the pending centered block.
        self.pending_paragraphs = []
        self.last_alignment = None

    def add_run(self, paragraph, run):
        """Append a copy of *run* to *paragraph* and return the new run.

        Copies text, bold/italic/underline, font size, font name and RGB
        colour.
        """
        new_run = paragraph.add_run(run.text)
        new_run.bold = run.bold
        new_run.italic = run.italic
        new_run.underline = run.underline
        # The size is already a length value (or None); no Pt() round trip
        # is needed.
        new_run.font.size = run.font.size
        new_run.font.name = run.font.name
        new_run.font.color.rgb = run.font.color.rgb
        # TODO: copy over any other formatting details you care about
        return new_run

    def copy_runs(self, source, target):
        """Copy the content of paragraph *source* into paragraph *target*.

        Runs are copied one by one so their formatting is kept.  If the runs
        do not account for the complete paragraph text, the plain text is
        added instead so that no text is dropped.
        """
        runs = source.runs
        if "".join(run.text for run in runs) == source.text:
            for run in runs:
                self.add_run(target, run)
        else:
            target.add_run(source.text)

    def append_block(self):
        """Write the pending centered block as one centered paragraph."""
        text = self.current_block.strip()
        if text:
            merged = self.new_document.add_paragraph()
            merged.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
            if self.pending_paragraphs:
                for index, source in enumerate(self.pending_paragraphs):
                    if index:
                        # Single space between the merged source paragraphs.
                        merged.add_run(" ")
                    self.copy_runs(source, merged)
            else:
                # current_block was filled without going through
                # process_center_paragraph: fall back to plain text.
                merged.add_run(text)
        self.current_block = ""
        self.pending_paragraphs = []

    def process_paragraphs(self):
        """Replay every body paragraph of the source into the new document."""
        for paragraph in self.document.paragraphs:
            self.process_paragraph(paragraph)
        # Flush a centered block that runs up to the end of the document.
        self.append_block()

    def process_paragraph(self, paragraph):
        """Queue a centered paragraph, or flush and copy any other paragraph."""
        if paragraph.alignment == WD_PARAGRAPH_ALIGNMENT.CENTER:
            self.process_center_paragraph(paragraph)
        else:
            self.append_block()
            self.append_other_paragraph(paragraph)
        self.last_alignment = paragraph.alignment

    def process_center_paragraph(self, paragraph):
        """Add a centered paragraph to the pending block."""
        self.current_block += paragraph.text + " "
        # Blank paragraphs contribute nothing to the merged paragraph.
        if paragraph.text.strip():
            self.pending_paragraphs.append(paragraph)

    def append_other_paragraph(self, paragraph):
        """Copy a non-centered paragraph, keeping its alignment and runs."""
        new_paragraph = self.new_document.add_paragraph()
        new_paragraph.alignment = paragraph.alignment
        self.copy_runs(paragraph, new_paragraph)

    def save(self, filename):
        """Write the new (merged) document to *filename*."""
        self.new_document.save(filename)
if __name__ == "__main__":
    # Build the merged copy of the source file and write it under a new name.
    refactored = DocxRefactor('original.docx')
    refactored.process_paragraphs()
    refactored.save('with_soft_return.docx')
Any suggestions on how to merge multiple paragraphs into one?