Python: Deque Unable to load specific number of rows from bottom of csv while including the 1st row header column names

17 Views Asked by At

CSV has 8000 rows. The code needs to pick up "n" rows (example: 1500) from the bottom of the csv, along with the header column names.

The data from csv looks like this:

Ticker Name Date Open High Low Close Volume
HINDOILEXP HINDUSTAN OIL EXPLORATION 14-12-94 31.30 31.30 31.30 31.30 920
HINDOILEXP HINDUSTAN OIL EXPLORATION 15-12-94 31.30 31.52 31.30 31.52 805
HINDOILEXP HINDUSTAN OIL EXPLORATION 16-12-94 32.61 34.78 32.61 34.78 460

The script below is using deque. And is giving this message on console:

R:\PyRAM>python myres3.py

Loading data from R:/PyRAM/db as mentioned in inputs.json

Missing columns in file R:/PyRAM/db\A459.csv: Ticker, Name, Date, Open, High, Low, Close, Volume

Below is the code that is unable to do the job (which is: loading last "n" lines with the 1st header column names row)

Code:

import pandas as pd
import json
from datetime import datetime
import glob
import os
import concurrent.futures
import sys
import time
import msvcrt
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import Future
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process
import math
from tqdm import tqdm 
from collections import deque
import io

# Define a function to load CSV files and return a list of DataFrames
def load_csv_files(folder_path, required_columns, max_rows):
    csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
    if not csv_files:
        print("No CSV files found in the folder.")
        print("Press ENTER to wait or ESC to exit...")
        while True:
            if msvcrt.kbhit() and msvcrt.getch() == b'\x1b':
                exit(0)
            if os.path.exists(os.path.join(folder_path, '*.csv')):
                break
            continue

    all_data = []
    for file in csv_files:
        try:
            # Use deque to store the last 'max_rows' rows
            rows_to_lift = deque(maxlen=max_rows)
            
            with open(file, 'r') as csv_file:
                for line in csv_file:
                    rows_to_lift.append(line)
            
            # Reconstruct the CSV data with the header and lifted rows
            lifted_data = list(rows_to_lift)
            
            # Create a DataFrame from the lifted data
            data = pd.read_csv(io.StringIO(''.join(lifted_data)), header=0)  # Specify that the first row is the header
            
            # Check for missing columns
            missing_columns = [col for col in required_columns if col not in data.columns]
            if missing_columns:
                print(f"Missing columns in file {file}: {', '.join(missing_columns)}")
                continue

            data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%y')
            data.sort_values('Date', inplace=True)
            all_data.append(data)
            excel_file_path = os.path.join(folder_path, f"loaded_data_{len(all_data)}.xlsx")
            data.to_excel(excel_file_path, index=False)
            print(f"DataFrame {len(all_data)} saved to {excel_file_path}")

        except FileNotFoundError:
            print(f"Error: CSV file {file} not found.")
            continue

    return all_data
0

There are 0 best solutions below