Manipulate Excel Data Before Adding to Database with Django - Order of Operations?

295 Views Asked by At

I receive 6 weekly excel reports that I've been manually compiling into a very large monthly report. Each report has between 5-30 columns, and 4000 to 130,000 rows.

I'm putting together a simple Django app that allows you to upload each report, and the data ends up in the database.

Here's my models.py:

#UPEXCEL models
from django.db import models


############## LISTS ###############

class TransactionTypeList(models.Model):
    name = models.CharField(max_length=100)

    def __str__(self):
        return self.name

class TransactionAppTypeList(models.Model):
    name = models.CharField(max_length=100)

    def __str__(self):
        return self.name

class CrmCaseOriginList(models.Model):
    name = models.CharField(max_length=100)

    def __str__(self):
        return self.name



############## CLIENTS AND STAFF ###############

class Staff(models.Model):
    name = models.CharField(max_length=40)
    employee_id = models.CharField(max_length=40)
    start_date = models.TimeField(blank=True, null=True)
    end_date = models.DateField(blank=True, null=True)
    first_name = models.CharField(blank=True, null=True, max_length=40)
    last_name = models.CharField(blank=True, null=True, max_length=40)
    email = models.EmailField(blank=True, null=True)
    phone = models.CharField(blank=True, null=True, max_length=20)
    street = models.CharField(blank=True, null=True, max_length=100)
    city = models.CharField(blank=True, null=True, max_length=100)
    state = models.CharField(blank=True, null=True, max_length=2)
    zipcode = models.CharField(blank=True, null=True, max_length=10)
    is_team_lead = models.BooleanField(default=False)
    boss = models.ForeignKey('Staff', related_name='Boss', null=True, blank=True)

    def __str__(self):
        return self.name

    class Meta:
        app_label="upexcel"


class Client(models.Model):
    name = models.CharField(max_length=40)
    short_name = models.CharField(max_length=20, blank=True, null=True)
    start_date = models.DateField(default=timezone.now, blank=True, null=True)
    end_date = models.DateField(blank=True, null=True)
    team_lead = models.ForeignKey(Staff, related_name='client_team_lead')

    def __str__(self):
        return self.name

class ClientNameChart(models.Model):
    client_name = models.ForeignKey(Client, related_name='client_corrected_name')
    name_variation = models.CharField(max_length=100)
    date_added = models.DateTimeField(auto_now_add=True)

    def __str__(self):
        return '%s becomes %s' % (self.name_variation, self.client_name)

class StaffNameChart(models.Model):
    staff_name = models.ForeignKey(Staff, related_name='staff_corrected_name')
    name_variation = models.CharField(max_length=100)
    date_added = models.DateTimeField(auto_now_add=True)

    def __str__(self):
        return '%s becomes %s' % (self.name_variation, self.staff_name)



############## DATA FROM REPORTS ###############

class CrmNotes(models.Model):
    created_by = models.ForeignKey(Staff, related_name='note_creator')
    case_origin = models.CharField(max_length=20)
    client_regarding = models.ForeignKey(Client, related_name='note_client_regarding')
    created_on = models.DateTimeField()
    case_number = models.CharField(max_length=40)

class Transactions(models.Model):
    client_regarding = models.ForeignKey(Client, related_name='transaction_client')
    created_by = models.ForeignKey(Staff, related_name='transaction_creator')
    type = models.ForeignKey(TransactionTypeList, related_name='transaction_type')
    app_type = models.ForeignKey(TransactionAppTypeList, related_name='transaction_app_type')

    class Meta:
        app_label="upexcel"

class Timesheets(models.Model):
    staff = models.ForeignKey(Staff, related_name='staff_clocked_in')
    workdate = models.DateField()
    start_time = models.DateTimeField()
    end_time = models.DateTimeField()
    total_hours = models.DecimalField(decimal_places=2, max_digits=8)

class Provider(models.Model):
    name = models.CharField(max_length=40)
    street = models.CharField(max_length=100)
    city = models.CharField(max_length=40)
    state = models.CharField(max_length=11)
    zip = models.CharField(max_length=10)

class StudentsApplication(models.Model):
    app_number = models.CharField(max_length=40)
    program = models.CharField(max_length=40)
    benefit_period = models.CharField(max_length=40)
    student_name = models.CharField(max_length=40)
    student_empl_id = models.CharField(max_length=40)
    requested_amount = models.DecimalField(max_digits=8, decimal_places=2)
    provider = models.ForeignKey(Provider, related_name='app_provider')
    provider_code = models.CharField(max_length=40)

class AuditReport(models.Model):
    was_audited = models.BooleanField(default=False)
    auditor = models.ForeignKey('upexcel.Staff', related_name='auditor')
    payment_defect = models.BooleanField(default=False)
    grant_discount_error = models.BooleanField(default=False)
    math_error = models.BooleanField(default=False)
    fees_book_error = models.BooleanField(default=False)
    other_error = models.BooleanField(default=False)
    overpayment_amount = models.DecimalField(max_digits=8, decimal_places=2)
    underpayment_amount = models.DecimalField(max_digits=8, decimal_places=2)
    doc_defect = models.BooleanField(default=False)
    status_change = models.BooleanField(default=False)
    admin_savings_defect = models.BooleanField(default=False)
    network_savings_defect = models.BooleanField(default=False)
    admin_adjustments = models.DecimalField(max_digits=8, decimal_places=2)
    network_adjustments = models.DecimalField(max_digits=8, decimal_places=2)
    error_corrected = models.BooleanField(default=False)
    comments = models.TextField(max_length=500)
    client = models.ForeignKey(Client, related_name='audited_client')
    staff = models.ForeignKey(Staff, related_name='processor_audited')
    application = models.ForeignKey(StudentsApplication, related_name='app_audited')

    class Meta:
        app_label="upexcel"

However the excel reports I'm taking in need some work done to them, and I'm trying figure out exactly how I should go about processing them and routing them.

The first challenge is that each report references the associated Staff and Client with different data. For example, if the Staff.name is "Bob Dole", one report has it as "Dole, Bob". Another has it as "Dole, Robert". Still another has "Robert Dole" then "103948210", which is his employee ID number.

Also, these change and new ones sprout up, which is why I made ClientNameChart and StaffNameChart, to where a user can input the string as it shows up in a report, and attach it to a Client or Staff. Then when processing, we can lookup StaffNameChart.name_variation, and return the associated StaffNameChart.Staff.employee_id, which should work great as a foreign key within the respective report's table (ie. AuditReport.staff)

The second challenge is to take a report, and route some of the columns to one database table, and others to another. For example, the big one is the Audit Report sheet. Many of the columns just transpose directly into the AuditReport(models.Model). However, it also has data for each StudentsApplication and Provider, where I need to take several columns, store them as a new record in their destination table, and replace the columns with one column containing a foreign key for that item within that destination table.

So that is my quest.

Here's the order of operations I have in my head - I will use the most complex Audit_Report_Wk_1.xlsx report to address all challenges in one upload:

  1. Upload File
  2. Using openpyxl, load read-only data:

    from openpyxl.worksheet.read_only import ReadOnlyWorksheet
    myexcelfile = request.FILES['file']
    myworkbook = load_workbook(myexcelfile, read_only=True)
    mysheet = myworkbook['Sheet1']
    
  3. Write a script that matches the names strings of the staff, auditor, and client columns with StaffNameChart.name_variation, and replace it with StaffNameChart.Staff.name.

  4. Part B: If the client or staff columns are blank, or contain strings not found in the name charts, all of those rows get saved in a new excel document. Edit: I suppose I could also create a new model class called IncompleteAuditReport that just have fields that match up with each column and store it there, then if someone adds a new NameChart variation, it could trigger a quick look-up to see if that could allow this process to complete and the record to be properly added?)
  5. Check the columns in mysheet that will be replaced by foreign keys for the Provider and StudentsApplication tabes. If their respective data doesn't yet exist in their respective tables, add the new record. Either way, then replace their columns with the foreign key that points to the resulting record.

Is this the correct order of operations? Any advice on what specific tools to use from openpyxl etc. to manipulate the data in the most efficient ways, so I can use the fewest resources possible to look-up and then change several hundred thousand fields?

Thank you so much if you've read this far. I'm currently a bit intimidated by the more complex data types, so it's not crystal clear to me the best way to store the data in memory and to manipulate it while it's there.

0

There are 0 best solutions below