How to fix a FileNotFoundError while accessing files in a corpus

143 Views Asked by At

I'm trying to write code that accesses files from a corpus called Mini-CORE. I had no problems printing the output of listdir and extracting the genre codes from the file names. However, when I try to open the files themselves to extract the text, it gives me FileNotFoundError: [Errno 2] No such file or directory: '1+IN+EN+IN-IN-IN-IN+EN-EN-EN-EN+WIKI+9990014.txt', which is the first file name in the folder. So I'm confused: why is it telling me the file name if it claims the file doesn't exist? Have I just made a syntax error somewhere?

import os
import re
import spacy
from spacy import displacy
from collections import Counter

nlp = spacy.load('en')

# os.listdir returns bare entry names, without the 'Mini-CORE' directory prefix.
entries = os.listdir('Mini-CORE')
# Filled by genre_code() below with one re.findall result per entry.
entry_list = []

# Collects the genre code of every corpus file name into the module-level
# entry_list (nothing is returned).
def genre_code(filename):
    """Append one list of genre-code matches per name in ``entries``.

    The ``filename`` argument is accepted but not used: the names are read
    from the module-level ``entries`` and the results are appended to the
    module-level ``entry_list``.
    """
    # The genre code is the run of word characters right after "1+".
    genre_pattern = re.compile(r'((?<=1\+)\w*)')
    entry_list.extend(genre_pattern.findall(name) for name in entries)


genre_code(entries)
print(entry_list)


# FileNotFoundError is raised by the open() call inside this function.
# This prints the text that follows an <h> or <p> tag in each corpus file.
def relevant_text(filename):
        """Print, for each name in the module-level ``entries``, the tagged text.

        The ``filename`` argument is rebound by the loop variable straight away,
        so the value passed in is never read. Nothing is returned.
        """
        for filename in entries:
            # `entries` comes from os.listdir('Mini-CORE'), which yields bare
            # names without the directory, so open() resolves each name against
            # the current working directory rather than against 'Mini-CORE'.
            with open(filename) as current_file:
                text = current_file.read()
                regex2 = r'((?<=<h>|<p>).*)'  # rest of the line after <h> or <p>
                text2 = re.findall(regex2, text)
                print(text2)

# relevant_text has no return statement, so this print would also show "None".
print(relevant_text(entries))

1

There is 1 best solution below

0
On

os.listdir returns each filename without its path, so you need to prepend the file's parent directory when opening the file. pathlib is an object-oriented path library that makes it easier to pass paths around without worrying about joining directory and file names yourself.

Use Path.glob to list the directory; the returned paths carry both the file name and its parent directory, so your program can use either. With some cleanup, your code could be:

from pathlib import Path
import re
import spacy
from spacy import displacy
from collections import Counter

nlp = spacy.load('en')

# Path.glob returns a one-shot generator. Materialize it (sorted, so runs are
# reproducible) because the listing is iterated more than once further down;
# a bare generator would be empty on the second pass.
entries = sorted(Path('Mini-CORE').glob("*"))

# Returns the genre codes found in each entry's file name.
def genre_code(entries):
    """Return one list of genre-code matches per entry, in iteration order.

    Each entry must expose a ``name`` attribute (as ``pathlib.Path`` does).
    The genre code is the run of word characters right after a ``1+`` in
    that name; an entry without one contributes an empty list.
    """
    genre_pattern = re.compile(r'((?<=1\+)\w*)')
    return [genre_pattern.findall(entry.name) for entry in entries]
    
# Collect the genre-code matches for every listed corpus entry and show them.
entry_list = genre_code(entries)
print(entry_list)

# Prints the text that follows an <h> or <p> tag in each file.
def relevant_text(entries):
    """Print, for every path in ``entries``, the list of tagged text runs.

    A run is the remainder of a line after an ``<h>`` or ``<p>`` tag. One
    list is printed per file; the function itself returns nothing.
    """
    tag_pattern = re.compile(r'((?<=<h>|<p>).*)')
    for path in entries:
        with open(path) as handle:
            contents = handle.read()
        print(tag_pattern.findall(contents))

# relevant_text prints its matches itself and returns None, so call it
# directly; wrapping the call in print() would only add a stray "None".
relevant_text(entries)