I am a newbie at programming and molecular dynamic simulations. I am using LAMMPS to simulate a physical vapor deposition (PVD) process and to determine the interactions between atoms in different timesteps.

After I perform a molecular dynamics simulation, LAMMPS provides me an output bonds file that contains records of every individual atom( as atom ID), their types (numbers reciprocating to particular elements), and information of other atoms that are bonded with those particular atoms. A typical bonds file looks like this.

I aim to sort atoms according to their types (like Group1: Oxygen-Hydrogen-Hydrogen) in three groups by considering their bonding information from bond output file and count the number of groups for each timestep. I used pandas and created a dataframe for each timestep.

df = pd.read_table(directory, comment="#", delim_whitespace= True, header=None, usecols=[0,1,2,3,4,5,6] )
headers= ["ID","Type","NofB","bondID_1","bondID_2","bondID_3","bondID_4"]
df.columns = headers
df.fillna(0,inplace=True)
df = df.astype(int)

timestep = int(input("Number of Timesteps: ")) #To display desired number of timesteps.
total_atom_number = 53924 #Total number of atoms in the simulation.

t= 0 #code starts from 0th timestep.
firstTime = []
while(t <= timestep):
    open('file.txt', 'w').close() #In while loop = displays every timestep individually, Out of the while loop = displays results cumulatively.
    i = 0
    df_tablo =(df[total_atom_number*t:total_atom_number*(t+1)]) #Creates a new dataframe that inlucdes only t'th timestep.
    df_tablo.reset_index(inplace=True)


    print(df_tablo)

Please see this example that illustrates my algorithm to group 3 atoms. Bond columns display different atoms (by atom IDs) that are bonded together with the atoms in their row. For instance, by utilizing the algorithm, we can group [1,2,5] and [1,2,6] but not [1,2,1] since an atom can not create a bond with itself. Furthermore, we can convert these atom IDs(first column) to their atom types (second column) after grouping, such as [1,3,7] to [1,1,3].

By following bonds as mentioned above, 1) I can successfully group atoms with respect to their IDs, 2) convert them to their atom types and 3) count how many groups exist in each timestep, respectively. The first while loop (above) counts groups for each timestep whereas, the second while loop (below) groups the atoms from each row (which is equal to each atom ID existed) with their corresponding bonded atoms from different rows in the dataframe.

 while i < total_atom_number:
    atom1_ID = df_tablo["ID"][i]  # atom ID of i'th row was defined.
    atom1_NB = df_tablo["NofB"][i]  # number of bonds of the above atom ID was defined, but not used.
    atom1_bond1 = df_tablo["bondID_1"][i]  #bond ID1 of above atom was defined.
        # bondIDs and atom types  of 1,2,3 and 4 for atom1_bond1 were defined respectively.
    if atom1_bond1 != 0:
        atom2_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond1))
        atom2_ID = df_tablo["ID"][atom2_index]
        atom2_bond1 = df_tablo["bondID_1"][atom2_index]
        atom2_bond2 = df_tablo["bondID_2"][atom2_index]
        atom2_bond3 = df_tablo["bondID_3"][atom2_index]
        atom2_bond4 = df_tablo["bondID_4"][atom2_index]


        type_atom1 = df_tablo["Type"][i]
        type_atom2 = df_tablo["Type"][atom2_index]
        #If the desired conditions are satisfied, atom types  are combined as [atom at i'th row, bondID1 at'ith row, 4 bondIDs respectively at the row which is equal to bondID1's row ]
        if atom1_ID != atom2_bond1 and atom2_bond1 != 0:
            set = [atom1_ID, atom2_ID, atom2_bond1]
            atom2_bond1_index = (df_tablo.set_index('ID').index.get_loc(atom2_bond1))
            type_atom2_bond1 = df_tablo["Type"][atom2_bond1_index]
            print("{}{}{}".format(type_atom1, type_atom2, type_atom2_bond1), file=open("file.txt", "a"))
            # print(set)

        if atom1_ID != atom2_bond2 and atom2_bond2 != 0:
            set = [atom1_ID, atom2_ID, atom2_bond2]
            atom2_bond2_index = (df_tablo.set_index('ID').index.get_loc(atom2_bond2))
            type_atom2_bond2 = df_tablo["Type"][atom2_bond2_index]
            print("{}{}{}".format(type_atom1, type_atom2, type_atom2_bond2), file=open("file.txt", "a"))
            # print(set)

        if atom1_ID != atom2_bond3 and atom2_bond3 != 0:
            set = [atom1_ID, atom2_ID, atom2_bond3]
            atom2_bond3_index = (df_tablo.set_index('ID').index.get_loc(atom2_bond3))
            type_atom2_bond3 = df_tablo["Type"][atom2_bond3_index]
            print("{}{}{}".format(type_atom1, type_atom2, type_atom2_bond3), file=open("file.txt", "a"))
            # print(set)
        if atom1_ID != atom2_bond4 and atom2_bond4 != 0:
            set = [atom1_ID, atom2_ID, atom2_bond4]
            atom2_bond4_index = (df_tablo.set_index('ID').index.get_loc(atom2_bond4))
            type_atom2_bond4 = df_tablo["Type"][atom2_bond4_index]
            print("{}{}{}".format(type_atom1, type_atom2, type_atom2_bond4), file=open("file.txt", "a"))
            # print(set)
    # bondIDs and atom types  of 1,2,3 and 4 for atom1_bond2 were defined respectively.
    atom1_bond2 = df_tablo["bondID_2"][i]
    if atom1_bond2 != 0:
        atom1_bond2_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond2) + 6)
        atom1_bond2_ID = df_tablo["ID"][atom1_bond2_index]
        atom1_bond2_bond1 = df_tablo["bondID_1"][atom1_bond2_index]
        atom1_bond2_bond2 = df_tablo["bondID_2"][atom1_bond2_index]
        atom1_bond2_bond3 = df_tablo["bondID_3"][atom1_bond2_index]
        atom1_bond2_bond4 = df_tablo["bondID_4"][atom1_bond2_index]

        type_atom1_bond2 = df_tablo["Type"][atom1_bond2_index] # If the desired conditions are satisfied, atom types  are combined as [atom at i'th row, bondID2 at'ith row, and 4 bondIDs respectively at the row which is equal to bondID2's row ]

        if atom1_ID != atom1_bond2_bond1 and atom1_bond2_bond1 != 0:
            set = [atom1_ID, atom1_bond2, atom1_bond2_bond1]
            atom1_bond2_bond1_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond2_bond1))
            type_atom1_bond2_bond1 = df_tablo["Type"][atom1_bond2_bond1_index]
            print("{}{}{}".format(type_atom1, type_atom1_bond2, type_atom1_bond2_bond1), file=open("file.txt", "a"))
            # print(set)

        if atom1_ID != atom1_bond2_bond2 and atom1_bond2_bond2 != 0:
            set = [atom1_ID, atom1_bond2, atom1_bond2_bond2]
            atom1_bond2_bond2_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond2_bond2))
            type_atom1_bond2_bond2 = df_tablo["Type"][atom1_bond2_bond2_index]
            print("{}{}{}".format(type_atom1, type_atom1_bond2, type_atom1_bond2_bond2), file=open("file.txt", "a"))
            # print(set)

        if atom1_ID != atom1_bond2_bond3 and atom1_bond2_bond3 != 0:
            set = [atom1_ID, atom1_bond2, atom1_bond2_bond3]
            atom1_bond2_bond3_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond2_bond3))
            type_atom1_bond2_bond3 = df_tablo["Type"][atom1_bond2_bond3_index]
            print("{}{}{}".format(type_atom1, type_atom1_bond2, type_atom1_bond2_bond3), file=open("file.txt", "a"))
            # print(set)
        if atom1_ID != atom1_bond2_bond4 and atom1_bond2_bond4 != 0:
            set = [atom1_ID, atom1_bond2, atom1_bond2_bond4]
            atom1_bond2_bond4_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond2_bond4))
            type_atom1_bond2_bond4 = df_tablo["Type"][atom1_bond2_bond4_index]
            print("{}{}{}".format(type_atom1, type_atom1_bond2, type_atom1_bond2_bond4), file=open("file.txt", "a"))
            # print(set)

    # bondIDs and atom types  of 1,2,3 and 4 for atom1_bond3 were defined respectively.
    atom1_bond3 = df_tablo["bondID_3"][i]
    if atom1_bond3 != 0:
        atom1_bond3_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond3))
        atom1_bond3_ID = df_tablo["ID"][atom1_bond3_index]
        atom1_bond3_bond1 = df_tablo["bondID_1"][atom1_bond3_index]
        atom1_bond3_bond2 = df_tablo["bondID_2"][atom1_bond3_index]
        atom1_bond3_bond3 = df_tablo["bondID_3"][atom1_bond3_index]
        atom1_bond3_bond4 = df_tablo["bondID_4"][atom1_bond3_index]

        type_atom1_bond3 = df_tablo["Type"][atom1_bond3_index]
        # If the desired conditions are satisfied, atom types  are combined as [atom at i'th row, bondID3 at'ith row, and 4 bondIDs respectively at the row which is equal to bondID3's row ]
        if atom1_ID != atom1_bond3_bond1 and atom1_bond3_bond1 != 0:
            atom1_bond3_bond1_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond3_bond1))
            type_atom1_bond3_bond1 = df_tablo["Type"][atom1_bond3_bond1_index]
            print("{}{}{}".format(type_atom1, type_atom1_bond3, type_atom1_bond3_bond1), file=open("file.txt", "a"))
            set = [atom1_ID, atom1_bond3, atom1_bond3_bond1]
            # print(set)

        if atom1_ID != atom1_bond3_bond2 and atom1_bond3_bond2 != 0:
            set = [atom1_ID, atom1_bond3, atom1_bond3_bond2]
            atom1_bond3_bond2_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond3_bond2))
            type_atom1_bond3_bond2 = df_tablo["Type"][atom1_bond3_bond2_index]
            print("{}{}{}".format(type_atom1, type_atom1_bond3, type_atom1_bond3_bond2), file=open("file.txt", "a"))
            # print(set)

        if atom1_ID != atom1_bond3_bond3 and atom1_bond3_bond3 != 0:
            set = [atom1_ID, atom1_bond3, atom1_bond3_bond3]
            atom1_bond3_bond3_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond3_bond3))
            type_atom1_bond3_bond3 = df_tablo["Type"][atom1_bond3_bond3_index]
            print("{}{}{}".format(type_atom1, type_atom1_bond3, type_atom1_bond3_bond3), file=open("file.txt", "a"))
            # print(set)
        if atom1_ID != atom1_bond3_bond4 and atom1_bond3_bond4 != 0:
            set = [atom1_ID, atom1_bond3, atom1_bond3_bond4]
            atom1_bond3_bond4_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond3_bond4))
            type_atom1_bond3_bond4 = df_tablo["Type"][atom1_bond3_bond4_index]
            print("{}{}{}".format(type_atom1, type_atom1_bond3, type_atom1_bond3_bond4), file=open("file.txt", "a"))
            # print(set)

    atom1_bond4 = df_tablo["bondID_4"][i]
    # bondIDs and atom types of 1,2,3 and 4 for atom1_bond4 were defined respectively.
    if atom1_bond4 != 0:

        atom1_bond4_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond4))
        atom1_bond4_ID = df_tablo["ID"][atom1_bond4_index]
        atom1_bond4_bond1 = df_tablo["bondID_1"][atom1_bond4_index]
        atom1_bond4_bond2 = df_tablo["bondID_2"][atom1_bond4_index]
        atom1_bond4_bond3 = df_tablo["bondID_3"][atom1_bond4_index]
        atom1_bond4_bond4 = df_tablo["bondID_4"][atom1_bond4_index]

        type_atom1_bond4 = df_tablo["Type"][atom1_bond4_index]
        # If the desired conditions are satisfied, atom types are combined as [atom at i'th row, bondID4 at'ith row, and 4 bondIDs respectively at the row which is equal to bondID4's row ]
        if atom1_ID != atom1_bond4_bond1 and atom1_bond4_bond1 != 0:
            set = [atom1_ID, atom1_bond4, atom1_bond4_bond1]
            atom1_bond4_bond1_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond4_bond1))
            type_atom1_bond4_bond1 = df_tablo["Type"][atom1_bond4_bond1_index]
            print("{}{}{}".format(type_atom1, type_atom1_bond4, type_atom1_bond4_bond1), file=open("file.txt", "a"))
            # print(set)

        if atom1_ID != atom1_bond4_bond2 and atom1_bond4_bond2 != 0:
            set = [atom1_ID, atom1_bond4, atom1_bond4_bond2]
            atom1_bond4_bond2_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond4_bond2))
            type_atom1_bond4_bond2 = df_tablo["Type"][atom1_bond4_bond2_index]
            print("{}{}{}".format(type_atom1, type_atom1_bond4, type_atom1_bond4_bond2), file=open("file.txt", "a"))
            # print(set)

        if atom1_ID != atom1_bond4_bond3 and atom1_bond4_bond3 != 0:
            set = [atom1_ID, atom1_bond4, atom1_bond4_bond3]
            atom1_bond4_bond3_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond4_bond3))
            type_atom1_bond4_bond3 = df_tablo["Type"][atom1_bond4_bond3_index]
            print("{}{}{}".format(type_atom1, type_atom1_bond4, type_atom1_bond4_bond3), file=open("file.txt", "a"))
            # print(set)
        if atom1_ID != atom1_bond4_bond4 and atom1_bond4_bond4 != 0:
            set = [atom1_ID, atom1_bond4, atom1_bond4_bond4]
            atom1_bond4_bond4_index = (df_tablo.set_index('ID').index.get_loc(atom1_bond4_bond4))
            type_atom1_bond4_bond4 = df_tablo["Type"][atom1_bond4_bond4_index]
            print("{}{}{}".format(type_atom1, type_atom1_bond4, type_atom1_bond4_bond4), file=open("file.txt", "a"))
            # print(set)


    print(i,".step" )
    print(time.time() - start_time, "seconds")


    i = i + 1

print("#timestep", t, file=open("file.txt", "a"))
print("#timestep", t)

df_veri = pd.read_table('file.txt', comment="#", header=None)
df_veri.columns = ["timestep %d" % (t)]
#Created a dictionary that corresponds to type of bonds
df_veri["timestep %d" % (t)] = df_veri["timestep %d" % (t)].astype(str).replace(
    {'314': 'NCO', '312': 'NCH', '412': 'OCH', '214': 'HCO', '431': 'ONC', '414': 'OCO', '212': 'HCH',
     '344': 'NOO', '343': 'NON', '441': 'OOC', '144': 'COO', '421': 'OHC', '434': 'ONO', '444': 'OOO', '121': 'CHC',
     '141': 'COC'
     })
# To calculate the number of  3-atom combinations
ndf = df_veri.apply(pd.Series.value_counts).fillna(0)
ndfy = pd.DataFrame(ndf)
ndfy1 = ndfy.transpose()

# To write the number of 3-atom combinations in first timestep with headers and else without headers.
if firstTime == []:
    ndfy1.to_csv('filename8.csv', mode='a', header=True)
    firstTime.append('Not Empty')
else:

    ndfy1.to_csv('filename8.csv', mode='a', header=False)

t = t + 1

This is a typical output file of my code in csv format

Although the code works, it is not efficient since;

  • It can only iterate over 4 bond atoms for each atom ID (However, simulation results can reach up to 12 bond atoms that should have been counted.)

  • The program works slow. (I work with more than 50000 atoms which can take up to 88 minutes for calculation of each timestep.)

Could you please recommend me a more efficient way? Since I am a newbie in programming, I don't know whether there are any other iteration tools of python or packages that could work for my case. I believe it will be more productive if I could perform those operations with less line of code (Particularly if I could get rid of repeating if statements).

Thanks for your time.

1

There are 1 best solutions below

0
On BEST ANSWER

I'm not sure I understood the logic, see if this helps.

For 100000 trios it took 41 seconds.

loc, get_loc are very expansive actions, so instead put your table in a dictionary and instead of validation that everything is unique, put it in a set

import pandas as pd
import random
from collections import defaultdict as dd
from collections import Counter
import time

# create 100000 unique trios of numbers
ids = list(range(50000))
trios_set = set()
while len(trios_set)<100000:
    trio = random.sample(ids,3)
    trios_set.add(frozenset(trio))

ids_dict = dd(list)  # a dictionery where id is the key and value is all the id who are partner with it in a list

for s in trios_set:
    for id in s:
        for other_id in s:
            if id!= other_id:
                ids_dict[id].append(other_id)

ids_dict = dict(ids_dict)
for_df = []
type_list = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n"] 
for id in ids_dict:
    massage = {}
    massage["id"] = id
    other_id_index = 1
    for other_id in ids_dict[id]:
        massage["id_"+str(other_id_index)] =  other_id
        other_id_index+=1
    massage["type"] = random.choice(type_list)
    for_df.append(massage)

df = pd.DataFrame(for_df) # a table with id column and all ids who are with it in trios in id_1 id_2.. and type column with a letter

#------------------------------------------------------------------
#till here we built the input table
start_time = time.time() #till here we build the input table, now check the time for 100000 atoms
type_dict = {}
from_df = dd(set)

for i,r in df.iterrows(): #move the dataframe to a dict of id as key and value as list of ids who connected to it
    for col in df:
        if "id_"in col and str(r[col])!="nan":
            from_df[r["id"]].add(r[col])
    type_dict[r["id"]] = r["type"] #save the type of id in a dictionery

from_df = dict(from_df)
out_trio_set = set() 

for id in from_df:
    for other_id in from_df[id]:
        if other_id!= id and str(other_id)!="nan":
            for third_id in from_df[other_id]:
                current_trio = frozenset([id, other_id,third_id])
                if len(current_trio)==3:
                    out_trio_set.add(current_trio)

type_conter = Counter()

for trio in out_trio_set:
    type_list = []
    for id in trio:
        type_list.append(type_dict[id])
    type_list = sorted(type_list)
    atom_type = "".join(type_list)
    type_conter[atom_type] +=1

out_df = pd.DataFrame(type_conter, index = [1]) # in here put index as timestamp

out_df.to_excel(r"D:\atom.xlsx")
print("--- %s seconds ---" % (time.time() - start_time))