How to print the best matching hit in the BLAST search? / BioPython

673 Views Asked by At

I'm trying to run a BLAST search with a nucleotide sequence and print the best-matching hit, but I'm not sure which option I should use. There are options like max_hsps and best_hit_overhang. I don't understand the differences between them, and I want to print just one hit (the best-matching one). Should I use max_hsps 1?

I wrote this code, but it still doesn't work. If you could tell me where I am mistaken and what I should do, I would really appreciate it :) Thank you!

from Bio.Blast import NCBIWWW
# Query sequence (elided in the post).
# NOTE(review): `Seq` is not imported in this snippet as shown.
seq = Seq("GTTGA......CT")
def best_matching_hit(seq):
    """Run a blastn search against "nt" for seq and print matching hits.

    Prints 'BLAST run failed!' and returns None if the qblast call raises.
    Otherwise parses the result with NCBIXML.read and walks every HSP of
    every alignment, printing the hit title and subject sequence for HSPs
    that pass the filter below.
    """
    try:
        result_handle = NCBIWWW.qblast("blastn", "nt", seq)
    except:
        print('BLAST run failed!')
        return None
    # NOTE(review): `NCBIXML` is not imported in this snippet as shown.
    blast_record = NCBIXML.read(result_handle)
    for hit in blast_record.alignments:
        for hsp in hit.hsps:
            # NOTE(review): `max_hsps 1` is not valid Python, and `max_hsps`
            # is not defined anywhere in this snippet; this is the line the
            # question is asking about.
            if hsp.expect == max_hsps 1:
                print(hit.title)
                print(hsp.sbjct)

    
best_matching_hit(seq)
2

There are 2 best solutions below

0
On BEST ANSWER

This returns just one hit (the first one, I suppose), as per

Limiting the number of hits in a Biopython NCBIWWW Search on Biostars:


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun  7 15:28:11 2021

@author: Pietro


https://stackoverflow.com/questions/67872118/how-to-print-the-best-matching-hit-in-the-blast-search-biopython

"""


from Bio.Blast import NCBIWWW

from Bio.Seq import Seq

# Query nucleotide sequence.
# Adjacent string literals are concatenated by the parser, so none of the
# source indentation ends up inside the sequence. (A backslash continuation
# inside a single literal keeps the next line's leading spaces in the string.)
seq = Seq(
    "ATGGCGTGGAATGAGCCTGGAAATAACAACGGCAACAATGGCCGCGATAATGACCCTTGGGGTAATAA"
    "TAATCGTGGTGGCCAGCGTCCTGGTGGCCGAGATCAAGGTCCGCCAGATTTAGATGAAGTGTTCAACAA"
    "ACTGAGTCAAAAGCTGGGTGGCAAGTTTGGTAAAAAAGGCGGCGGTGGTTCCTCTATCGGCGGTGGCGG"
    "TGGTGCAATTGGCTTTGGTGTCATTGCGATCATTGCAATTGCGGTGTGGATTTTCGCTGGTTTTTACAC"
    "CATCGGTGAAGCAGAGCGTGGTGTTGTACTGCGTTTAGGTAAATACGATCGTATCGTAGACCCAGGCCT"
    "TAACTGGCGTCCTCGTTTTATTGATGAATACGAAGCGGTTAACGTACAAGCGATTCGCTCACTACGTGC"
    "ATCTGGTCTAATGCTGACGAAAGATGAAAACGTAGTAACGGTTGCAATGGACGTTCAATACCGAGTTGC"
    "TGACCCATACAAATACCTATACCGCGTGACCAATGCAGATGATAGCTTGCGTCAAGCAACAGACTCTGC"
    "GCTACGTGCGGTAATTGGTGATTCACTAATGGATAGCATTCTAACCAGTGGTCGTCAGCAAATTCGTCA"
    "AAGCACTCAAGAAACACTAAACCAAATCATCGATAGCTATGATATGGGTCTGGTGATTGTTGACGTGAA"
    "CTTCCAGTCTGCACGTCCGCCAGAGCAAGTAAAAGATGCGTTTGATGACGCGATTGCTGCGCGTGAGGA"
    "TGAAGAGCGTTTCATCCGTGAAGCAGAAGCTTACAAGAACGAAATCTTGCCGAAGGCAACGGGTCGTGC"
    "TGAACGTTTGAAGAAGGAAGCTCAAGGTTACAACGAGCGTGTAACTAACGAAGCATTAGGTCAAGTAGC"
    "ACAGTTTGAAAAACTACTACCTGAATACCAAGCGGCTCCTGGCGTAACACGTGACCGTCTGTACATTGA"
    "CGCGATGGAAGAGGTTTACACCAACACATCTAAAGTGTTGATTGACTCTGAATCAAGCGGCAACCTTTT"
    "GTACCTACCAATCGATAAATTGGCAGGTCAAGAAGGCCAAACAGACACTAAACGTAAATCGAAATCTTC"
    "TTCAACCTACGATCACATTCAACTAGAGTCTGAGCGTACACAAGAAGAAACATCGAACACGCAGTCTCG"
    "TTCAACAGGTACACGTCAAGGGAGATACTAA"
)

def best_matching_hit(seq):
    """Run a blastn search against "nt" and print the raw result for one hit.

    The search is submitted with ``hitlist_size=1`` so that at most one hit
    is requested. The response is printed unparsed, exactly as read from the
    result handle (XML is presumably the format, since no ``format_type`` is
    passed -- confirm against the qblast documentation).

    Args:
        seq: The query sequence handed to ``NCBIWWW.qblast``.

    Returns:
        None. The result is written to standard output; if the search cannot
        be run, an error message is printed instead.
    """
    try:
        result_handle = NCBIWWW.qblast("blastn", "nt", seq, hitlist_size=1)
    except Exception as e:
        # Catch Exception rather than using a bare except so that Ctrl-C and
        # interpreter shutdown are not misreported as a BLAST failure, and
        # include the cause so the failure can be diagnosed.
        print('BLAST run failed!', e)
        return None

    # Always release the handle, even if reading the response fails.
    try:
        blast_record = result_handle.read()
    finally:
        result_handle.close()

    print(blast_record)


# Run the search for the query sequence defined above (calls NCBIWWW.qblast).
best_matching_hit(seq)

0
On

You're trying to compare the E-value (hsp.expect) with an undefined variable, max_hsps, which isn't necessary for retrieving the best hit. Also, to limit the number of hits, you should use the hitlist_size parameter of NCBIWWW.qblast instead of max_hsps. I've provided a corrected version of your function that properly retrieves and prints the information for the best hit, or indicates if no hits are found.

from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio.Seq import Seq

def best_matching_hit(seq):
    """Run a blastn search against "nt" and print the best hit's details.

    The search is submitted with ``hitlist_size=1``, the response is parsed
    with ``NCBIXML.read``, and the first alignment together with its first
    HSP is reported: the hit title, the subject sequence, and then the
    query / match / subject lines of the alignment.

    Args:
        seq: The query sequence handed to ``NCBIWWW.qblast``.

    Returns:
        None. All output goes to standard output; "No hits found." is printed
        when the parsed record has no alignments, and an error message is
        printed when the search itself fails.
    """
    try:
        result_handle = NCBIWWW.qblast("blastn", "nt", seq, hitlist_size=1)
    except Exception as e:
        print('BLAST run failed:', e)
        return None

    # Always release the handle, even if parsing the response fails.
    try:
        blast_record = NCBIXML.read(result_handle)
    finally:
        result_handle.close()

    if not blast_record.alignments:
        print("No hits found.")
        return None

    best_hit = blast_record.alignments[0]  # first alignment in the record
    best_hsp = best_hit.hsps[0]  # first high-scoring segment pair of that hit
    print("Best Hit Title:", best_hit.title)
    print("Best Hit Subject Sequence:", best_hsp.sbjct)
    # Optional: also show the pairwise alignment. Drop these four prints if
    # only the title and subject sequence are wanted.
    print("Alignment:")
    print(best_hsp.query)
    print(best_hsp.match)
    print(best_hsp.sbjct)
    return None

# Query nucleotide sequence.
# Adjacent string literals are concatenated by the parser, so none of the
# source indentation ends up inside the sequence. (A backslash continuation
# inside a single literal keeps the next line's leading spaces in the string.)
seq = Seq(
    "ATGGCGTGGAATGAGCCTGGAAATAACAACGGCAACAATGGCCGCGATAATGACCCTTGGGGTAATAA"
    "TAATCGTGGTGGCCAGCGTCCTGGTGGCCGAGATCAAGGTCCGCCAGATTTAGATGAAGTGTTCAACAA"
    "ACTGAGTCAAAAGCTGGGTGGCAAGTTTGGTAAAAAAGGCGGCGGTGGTTCCTCTATCGGCGGTGGCGG"
    "TGGTGCAATTGGCTTTGGTGTCATTGCGATCATTGCAATTGCGGTGTGGATTTTCGCTGGTTTTTACAC"
    "CATCGGTGAAGCAGAGCGTGGTGTTGTACTGCGTTTAGGTAAATACGATCGTATCGTAGACCCAGGCCT"
    "TAACTGGCGTCCTCGTTTTATTGATGAATACGAAGCGGTTAACGTACAAGCGATTCGCTCACTACGTGC"
    "ATCTGGTCTAATGCTGACGAAAGATGAAAACGTAGTAACGGTTGCAATGGACGTTCAATACCGAGTTGC"
    "TGACCCATACAAATACCTATACCGCGTGACCAATGCAGATGATAGCTTGCGTCAAGCAACAGACTCTGC"
    "GCTACGTGCGGTAATTGGTGATTCACTAATGGATAGCATTCTAACCAGTGGTCGTCAGCAAATTCGTCA"
    "AAGCACTCAAGAAACACTAAACCAAATCATCGATAGCTATGATATGGGTCTGGTGATTGTTGACGTGAA"
    "CTTCCAGTCTGCACGTCCGCCAGAGCAAGTAAAAGATGCGTTTGATGACGCGATTGCTGCGCGTGAGGA"
    "TGAAGAGCGTTTCATCCGTGAAGCAGAAGCTTACAAGAACGAAATCTTGCCGAAGGCAACGGGTCGTGC"
    "TGAACGTTTGAAGAAGGAAGCTCAAGGTTACAACGAGCGTGTAACTAACGAAGCATTAGGTCAAGTAGC"
    "ACAGTTTGAAAAACTACTACCTGAATACCAAGCGGCTCCTGGCGTAACACGTGACCGTCTGTACATTGA"
    "CGCGATGGAAGAGGTTTACACCAACACATCTAAAGTGTTGATTGACTCTGAATCAAGCGGCAACCTTTT"
    "GTACCTACCAATCGATAAATTGGCAGGTCAAGAAGGCCAAACAGACACTAAACGTAAATCGAAATCTTC"
    "TTCAACCTACGATCACATTCAACTAGAGTCTGAGCGTACACAAGAAGAAACATCGAACACGCAGTCTCG"
    "TTCAACAGGTACACGTCAAGGGAGATACTAA"
)
# Run the search for the query sequence defined above.
best_matching_hit(seq)