How can I read a CSV file from ADLS Gen2 using Python?

254 Views Asked by At

I’d like to read a file from ADLS (Azure) using a Python cron job from my desktop. For now I just have the HTTPS URL to get into the container — is there any way to do this?

1

There is 1 solution below

0
Bhavani On

You can read a CSV file from ADLS using the code below:

import io
from azure.storage.blob import BlobServiceClient
import pandas as pd

def read_csv_from_adls(account_url, container_name, file_path, sas_token):
    """Download a CSV blob from ADLS Gen2 and return it as a pandas DataFrame.

    account_url    -- storage account URL, e.g. "https://<account>.blob.core.windows.net"
    container_name -- name of the container holding the blob
    file_path      -- path of the CSV blob inside the container
    sas_token      -- SAS token used as the credential
    """
    service = BlobServiceClient(account_url=account_url, credential=sas_token)
    blob = service.get_container_client(container_name).get_blob_client(file_path)
    csv_text = blob.download_blob().content_as_text()
    return pd.read_csv(io.StringIO(csv_text))

# Fill in your storage-account details before running.
account_url = "<URL>"  # e.g. "https://<account>.blob.core.windows.net"
container_name = "<containerName>"
file_path = "<csv_file>"  # blob path inside the container
sas_token = "<SAStoken>"
df = read_csv_from_adls(account_url, container_name, file_path, sas_token)
print(df)

It will read the CSV data from ADLS:

(screenshot showing the DataFrame printed from the CSV file)

To schedule it as a recurring job you can use the code below:

def read_csv_job():
    """Scheduled job: read the CSV from ADLS and print it.

    Prints the DataFrame on success; prints a fallback message if
    read_csv_from_adls returned None (i.e. the download/parse failed).
    """
    # Provide the required information.
    account_url = "<URL>"
    container_name = "<containerName>"
    file_path = "<csv_file>"
    # Fixed: the original line ended with a stray extra quote
    # ("<SAStoken>""), which is a syntax error.
    sas_token = "<SAStoken>"
    df = read_csv_from_adls(account_url, container_name, file_path, sas_token)
    if df is not None:
        print(df)
    else:
        print("No output generated.")
        
# Register the job to run once per day at the given "HH:MM" time
# (requires the third-party `schedule` module plus `time`).
schedule.every().day.at("<time>").do(read_csv_job)

# Polling loop: check for due jobs once per second, forever.
while True:
    schedule.run_pending()
    time.sleep(1)

The job will run at the given time without any error.

Complete code:

import io
from azure.storage.blob import BlobServiceClient
import pandas as pd
import schedule
import time

def read_csv_from_adls(account_url, container_name, file_path, sas_token):
    """Fetch a CSV blob from ADLS Gen2 and parse it into a DataFrame.

    Returns the DataFrame on success, or None on any failure — the error
    is printed instead of raised so the scheduling loop keeps running.
    """
    try:
        service = BlobServiceClient(account_url=account_url, credential=sas_token)
        container = service.get_container_client(container_name)
        blob = container.get_blob_client(file_path)
        text = blob.download_blob().content_as_text()
        return pd.read_csv(io.StringIO(text))
    except Exception as e:
        print("An error occurred while reading the CSV file:")
        print(str(e))
        return None
def read_csv_job():
    """Daily job: download the students CSV from ADLS and print it."""
    account_url = "https://adlss.dfs.core.windows.net"
    container_name = "files"
    # Fixed: blob paths use forward slashes. The original "inputs\students"
    # used a backslash, which is both the wrong separator for an ADLS blob
    # path and an invalid "\s" escape sequence in a Python string literal.
    file_path = "inputs/students"
    # WARNING: never hard-code a SAS token in source code — load it from an
    # environment variable or a secret store. (This example token is expired.)
    sas_token = "?sv=2022-11-02&ss=bfqt&srt=sco&sp=rwdlacupyx&se=2023-07-06T15:01:08Z&st=2023-07-06T07:01:08Z&spr=https&sig=xzAQoE0hMQJxbRCIvr38Ya6eRy76QKfVdDEIVAFs1VE%3D"
    df = read_csv_from_adls(account_url, container_name, file_path, sas_token)
    if df is not None:
        print(df)
    else:
        print("No output generated.")
# Register the job to run once per day at 12:53 local time.
schedule.every().day.at("12:53").do(read_csv_job)
# Polling loop: check for due jobs once per second, forever.
while True:
    schedule.run_pending()
    time.sleep(1)