I have .doc files that contain embedded Excel, PowerPoint, PDF, and other files, and I want to extract those embedded files using Python. Extracting them from a .docx is straightforward with the zipfile library. With a .doc, I am able to get the .ole files (see the code below, which is based on something I found online), but how can I extract the actual files after that?
# extract embedded using OleFileIO_PL alone
import os
import OleFileIO_PL
# Names of the streams whose contents are dumped. Only the final component of
# a stream path is compared against this set.
_EMBEDDED_STREAM_NAMES = frozenset([
    'Workbook',
    'WordDocument',
    'Package',
    'VisioDocument',
    'PowerPoint Document',
    'Book',
    'CONTENTS',
])

# Value returned by OleFileIO.get_type() for a stream entry (as opposed to a
# storage); see the olefile documentation for STGTY_STREAM.
_OLE_TYPE_STREAM = 2


def extract_embedded_ole(fname):
    """Dump the raw payload streams of objects nested inside an OLE2 file.

    Every stream path returned by ``ole.listdir()`` that is nested at least
    one storage deep and whose final component is one of
    ``_EMBEDDED_STREAM_NAMES`` is written, byte for byte, to::

        res--<fname>.embeddings/<parent storages>/<basename>-emb-<name>-<n>.ole

    The written bytes are the unmodified stream contents; no attempt is made
    to unwrap them into the original embedded file.

    Args:
        fname: Path of the OLE2 file to read (for example a ``.doc``). It is
            also used verbatim to build the output directory name.

    Returns:
        None. The extracted streams are written to disk as a side effect.

    Raises:
        OSError: If an output directory cannot be created or an output file
            cannot be written.
        Exception: Whatever ``OleFileIO_PL.OleFileIO`` raises when *fname*
            cannot be opened or parsed.
    """
    ole = OleFileIO_PL.OleFileIO(fname)
    try:
        i = 0
        for stream in ole.listdir():
            # Paths with a single component are not nested inside any
            # storage, so they are skipped.
            if len(stream) < 2:
                continue

            # The counter advances once per path component of every nested
            # entry. This keeps the numeric suffix of the output files
            # identical to what earlier versions of this function produced.
            i += len(stream)

            # Only the last component names the stream itself; the preceding
            # ones are the storages that contain it.
            leaf = stream[-1]
            if leaf not in _EMBEDDED_STREAM_NAMES:
                continue
            if ole.get_type(stream) != _OLE_TYPE_STREAM:
                continue

            out_dir = "res--" + fname + ".embeddings/" + "/".join(stream[:-1])
            try:
                os.makedirs(out_dir)
            except OSError:
                # An already existing directory is fine; anything else
                # (permissions, a file in the way, ...) is a real error.
                if not os.path.isdir(out_dir):
                    raise

            # Write out the stream.
            out_name = (out_dir + "/" + os.path.basename(fname)
                        + "-emb-" + leaf + "-" + str(i) + ".ole")
            ole_stream = ole.openstream(stream)
            with open(out_name, 'w+b') as out_file:
                out_file.write(ole_stream.read())
    finally:
        # Release the underlying file handle even if extraction fails.
        ole.close()
# Example usage: runs only when this file is executed as a script, so that
# importing it as a module does not trigger an extraction of "f1.doc".
if __name__ == "__main__":
    extract_embedded_ole("f1.doc")