How to extract headings from Word files to Excel?

2.5k Views Asked by At

I have hundreds of Word files (docx) which each have various headings, defined as Heading 1, Heading 2, Heading 3, etc. Each of these files has a table of contents which correspond to the headings.
I want to extract each heading from each of these files into an Excel workbook to build a database.

My first attempt was to extract the headings from a single Word document into an Excel workbook. I found code online to extract headings from Word to Outlook, and also separate code to extract headings from Word to a new Word file.
I haven't been able to adapt either of these.

How do I extract headings from a single Word file to Excel? I will then try to work out further steps.

Word to Outlook

Sub CopyHeadingsIntoOutlookMail()
    Dim objOutlookApp, objMail As Object
    Dim objMailDocument As Word.Document
    Dim objMailRange As Word.Range
    Dim varHeadings As Variant
    Dim i As Long
    Dim strText As String
    Dim nLongDiff As Integer

    'Create a new Outlook email
    Set objOutlookApp = CreateObject("Outlook.Application")
    Set objMail = objOutlookApp.CreateItem(olMailItem)
    objMail.Display
    Set objMailDocument = objMail.GetInspector.WordEditor
    Set objMailRange = objMailDocument.Range(0, 0)
 
    'Get the headings of the current Word document
    varHeadings = ActiveDocument.GetCrossReferenceItems(wdRefTypeHeading)

    For i = LBound(varHeadings) To UBound(varHeadings)
        strText = Trim(varHeadings(i))
 
        'Get the heading level
        nLongDiff = Len(RTrim$(CStr(varHeadings(i)))) - Len(Trim(CStr(varHeadings(i))))
        nHeadingLevel = (nLongDiff / 2) + 1
 
        'Insert the heading into the Outlook mail
        With objMailRange
            .InsertAfter strText & vbNewLine
            .Style = "Heading " & nHeadingLevel
            .Collapse wdCollapseEnd
        End With
    Next i
End Sub

Word to Word

Public Sub CreateOutline()
    Dim docOutline As Word.Document
    Dim docSource As Word.Document
    Dim rng As Word.Range
    
    Dim astrHeadings As Variant
    Dim strText As String
    Dim intLevel As Integer
    Dim intItem As Integer
        
    Set docSource = ActiveDocument
    Set docOutline = Documents.Add
    
    ' Content returns only the
    ' main body of the document, not
    ' the headers and footer.
    Set rng = docOutline.Content
    astrHeadings = _
     docSource.GetCrossReferenceItems(wdRefTypeHeading)
    
    For intItem = LBound(astrHeadings) To UBound(astrHeadings)
        ' Get the text and the level.
        strText = Trim$(astrHeadings(intItem))
        intLevel = GetLevel(CStr(astrHeadings(intItem)))
        
        ' Add the text to the document.
        rng.InsertAfter strText & vbNewLine
        
        ' Set the style of the selected range and
        ' then collapse the range for the next entry.
        rng.Style = "Heading " & intLevel
        rng.Collapse wdCollapseEnd
    Next intItem
End Sub

Private Function GetLevel(strItem As String) As Integer
    ' Return the heading level of a header from the
    ' array returned by Word.
    
    ' The number of leading spaces indicates the
    ' outline level (2 spaces per level: H1 has
    ' 0 spaces, H2 has 2 spaces, H3 has 4 spaces.
        
    Dim strTemp As String
    Dim strOriginal As String
    Dim intDiff As Integer
    
    ' Get rid of all trailing spaces.
    strOriginal = RTrim$(strItem)
    
    ' Trim leading spaces, and then compare with
    ' the original.
    strTemp = LTrim$(strOriginal)
    
    ' Subtract to find the number of
    ' leading spaces in the original string.
    intDiff = Len(strOriginal) - Len(strTemp)
    GetLevel = (intDiff / 2) + 1
End Function
1

There are 1 best solutions below

5
On

Try the following Excel macro. When you run it, simply select the folder to process.

Sub GetTOCHeadings()
'Note: this code requires a reference to the Word object model.
'See under the VBE's Tools|References.
Application.ScreenUpdating = False
Dim wdDoc As Word.Document, wdRng As Word.Range, wdPara As Word.Paragraph
Dim strFolder As String, strFile As String
Dim WkSht As Worksheet, i As Long, j As Long
strFolder = GetFolder
If strFolder = "" Then Exit Sub
Dim wdApp As New Word.Application
Set WkSht = ActiveSheet
i = WkSht.Cells(WkSht.Rows.Count, 1).End(xlUp).Row
wdApp.WordBasic.DisableAutoMacros
wdApp.DisplayAlerts = wdAlertsNone
strFile = Dir(strFolder & "\*.doc", vbNormal)
While strFile <> ""
  i = i + 1
  Set wdDoc = wdApp.Documents.Open(Filename:=strFolder & "\" & strFile, AddToRecentFiles:=False, Visible:=False)
  With wdDoc
    j = 1: WkSht.Cells(i, j) = strFile
    If .TablesOfContents.Count > 0 Then
      With .TablesOfContents(1)
        .IncludePageNumbers = False
        .Update
        Set wdRng = .Range
      End With
      With wdRng
        .Fields(1).Unlink
        For Each wdPara In .Paragraphs
          j = j + 1
          WkSht.Cells(i, j).Value = Replace(wdPara.Range.Text, vbTab, " ")
        Next
      End With
    End If
    .Close SaveChanges:=False
  End With
  strFile = Dir()
Wend
wdApp.DisplayAlerts = wdAlertsAll
wdApp.Quit
Set wdDoc = Nothing: Set wdApp = Nothing: Set WkSht = Nothing
Application.ScreenUpdating = True
End Sub
 
Function GetFolder() As String
    Dim oFolder As Object
    GetFolder = ""
    Set oFolder = CreateObject("Shell.Application").BrowseForFolder(0, "Choose a folder", 0)
    If (Not oFolder Is Nothing) Then GetFolder = oFolder.Items.Item.Path
    Set oFolder = Nothing
End Function