CSV has 8000 rows. The code needs to pick up "n" rows (example: 1500) from the bottom of the csv, along with the header column names.
The data from csv looks like this:
| Ticker | Name | Date | Open | High | Low | Close | Volume |
|---|---|---|---|---|---|---|---|
| HINDOILEXP | HINDUSTAN OIL EXPLORATION | 14-12-94 | 31.30 | 31.30 | 31.30 | 31.30 | 920 |
| HINDOILEXP | HINDUSTAN OIL EXPLORATION | 15-12-94 | 31.30 | 31.52 | 31.30 | 31.52 | 805 |
| HINDOILEXP | HINDUSTAN OIL EXPLORATION | 16-12-94 | 32.61 | 34.78 | 32.61 | 34.78 | 460 |
The script below is using deque. And is giving this message on console:
R:\PyRAM>python myres3.py
Loading data from R:/PyRAM/db as mentioned in inputs.json
Missing columns in file R:/PyRAM/db\A459.csv: Ticker, Name, Date, Open, High, Low, Close, Volume
Below is the code that is unable to do the job (which is: loading last "n" lines with the 1st header column names row)
Code:
import pandas as pd
import json
from datetime import datetime
import glob
import os
import concurrent.futures
import sys
import time
import msvcrt
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import Future
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process
import math
from tqdm import tqdm
from collections import deque
import io
# Define a function to load CSV files and return a list of DataFrames
def load_csv_files(folder_path, required_columns, max_rows):
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
if not csv_files:
print("No CSV files found in the folder.")
print("Press ENTER to wait or ESC to exit...")
while True:
if msvcrt.kbhit() and msvcrt.getch() == b'\x1b':
exit(0)
if os.path.exists(os.path.join(folder_path, '*.csv')):
break
continue
all_data = []
for file in csv_files:
try:
# Use deque to store the last 'max_rows' rows
rows_to_lift = deque(maxlen=max_rows)
with open(file, 'r') as csv_file:
for line in csv_file:
rows_to_lift.append(line)
# Reconstruct the CSV data with the header and lifted rows
lifted_data = list(rows_to_lift)
# Create a DataFrame from the lifted data
data = pd.read_csv(io.StringIO(''.join(lifted_data)), header=0) # Specify that the first row is the header
# Check for missing columns
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
print(f"Missing columns in file {file}: {', '.join(missing_columns)}")
continue
data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%y')
data.sort_values('Date', inplace=True)
all_data.append(data)
excel_file_path = os.path.join(folder_path, f"loaded_data_{len(all_data)}.xlsx")
data.to_excel(excel_file_path, index=False)
print(f"DataFrame {len(all_data)} saved to {excel_file_path}")
except FileNotFoundError:
print(f"Error: CSV file {file} not found.")
continue
return all_data