Answer To: Microsoft Word - BLAST Algorithm Steps - Part 2 .docx 09-April-2018 Project (Step 1 out 2) BLAST...
Abr Writing answered on May 12 2020
/*
* BLAST - Search two DNA sequences for locally maximal segment pairs. The basic
* command syntax is
*
* BLAST sequence1 sequence2
*
* where sequence1 and sequence2 name files containing DNA sequences. Lines
* at the beginnings of the files that don't start with 'A', 'C', 'T' or 'G'
* are discarded. Thus a typical sequence file might begin:
*
* >BOVHBPP3I Bovine beta-globin psi-3 pseudogene, 5' end.
* GGAGAATAAAGTTTCTGAGTCTAGACACACTGGATCAGCCAATCACAGATGAAGGGCACT
* GAGGAACAGGAGTGCATCTTACATTCCCCCAAACCAATGAACTTGTATTATGCCCTGGGC
*
* If sequence1 and sequence2 are identical file names, then matches are
* computed without "mirror image" pairs and without the trivial long match.
*
* The BLAST command permits optional additional arguments, such as X=50,
* that reset certain internal parameters. The available parameters are:
*
* M or m gives the score for a match.
* I or i gives the score for a transition (A <--> G, C <--> T).
* V or v gives the score for a tranversion (non-transition change).
* W or w gives the word size.
* X or x gives the value for terminating word extensions.
* K or k give the cutoff.
* (defaults: M = 1, I = -1, V = -1, W = 8, X = 10*M,
* and K = 5% significance level)
*
* If W is set to 0, then all locally maximum segment pairs (LMSPs) are
* computed; in this case the value of X is irrelevant.
*
* In addition, the word "noalign" requests that no alignments be printed
* (summary statistics for each locally maximal segment pair are reported)
* and the word "stats" requests that statistics concerning the performance
* of BLAST be printed. Thus a typical command is
*
* BLAST SEQ1 SEQ2 M=1 I=0 V=-1 noalign
*/
#include
#include
#include
#define max(x,y) ((x > y) ? (x) : (y))
#define min(x,y) ((x < y) ? (x) : (y))
#define SUBSTITUTION_SCORE(x,y) S[x][y]
#define DEFAULT_M 1 /* default score for match */
#define DEFAULT_I -1 /* default score for transition */
#define DEFAULT_V -1 /* default score for transversion */
#define DEFAULT_W 8 /* default word size */
#define DEFAULT_SIG 0.05 /* default significance level for setting K */
char
S[128][128],
*s1, *seq1, *s2, *seq2;
int
alignments,
print_stats,
K, /* cutoff score */
M, /* score for a match */
I, /* score for a transition */
V, /* score for a transversion */
W, /* word length */
X, /* cutoff for hit extensions */
sig99, sig90, sig50, /* number of MPSs above given significance */
numMSPs, /* total number of MSPs recorded */
numhits, /* number of word hits */
missW[15][3],
missX[25][3];
double
param_K, param_lambda;
long
*diag_lev, /* extent of discovered matches on a given diagonal */
len1, len2; /* sequence lengths */
typedef struct msp {
long len, pos1, pos2;
int score;
struct msp *next_msp;
} Msp, *Msp_ptr;
Msp_ptr msp_list, *msp;
main(argc, argv)
int argc;
char *argv[];
{
int i, v;
double x, log(), sqrt(), significance();
char *ckalloc();
if (argc < 3)
fatalf("%s seq1 seq2 [M=] [I=] [V=] [K=] [W=] [X=] [noalign] [stats]",
argv[0]);
M = DEFAULT_M;
I = DEFAULT_I;
V = DEFAULT_V;
W = DEFAULT_W;
X = -1;
alignments = 1;
print_stats = 0;
while (--argc > 2) {
if (strcmp(argv[argc],"noalign") == 0) {
alignments = 0;
continue;
}
if (strcmp(argv[argc],"stats") == 0) {
print_stats = 1;
continue;
}
if (argv[argc][1] != '=')
fatalf("argument %d has improper form", argc);
v = atoi(argv[argc] + 2);
switch (argv[argc][0]) {
case 'M':
case 'm':
M = v;
break;
case 'I':
case 'i':
I = v;
break;
case 'V':
case 'v':
V = v;
break;
case 'K':
case 'k':
K = v;
break;
case 'W':
case 'w':
W = v;
break;
case 'X':
case 'x':
X = v;
break;
default:
fatal("Options are M, I, V, K, W, X.");
}
}
if (X == -1)
X = 10*M;
if (V > I)
fatal("transversions score higher than transitions?");
len1 = get_seq(argv[1], &seq1);
if (strcmp(argv[1],argv[2]) == 0) {
if (W == 0)
fatal("file1 = file2 not implemented when W=0");
seq2 = seq1;
len2 = len1;
} else
len2 = get_seq(argv[2], &seq2);
s1 = seq1 - 1; /* so subscripts start with 1 */
s2 = seq2 - 1;
diag_lev = (long *)ckalloc((len1+len2+1)*sizeof(long)) + len1;
/* set up scoring matrix */
S['A']['A'] = S['C']['C'] = S['G']['G'] = S['T']['T'] = M;
S['A']['G'] = S['G']['A'] = S['C']['T'] = S['T']['C'] = I;
S['A']['C'] = S['A']['T'] = S['C']['A'] = S['C']['G'] =
S['G']['C'] = S['G']['T'] = S['T']['A'] = S['T']['G'] = V;
get_params();
if (K == 0) {
x = log(DEFAULT_SIG)/(-param_K*len1*len2);
K = 2*log(sqrt(x))/(-param_lambda);
while (significance(K-1) >= DEFAULT_SIG)
--K;
while (significance(K) < DEFAULT_SIG)
++K;
}
if (alignments) {
printf("#:lav\n\n");
printf("d {\n\"BLAST output with parameters:\n");
printf("M = %d, I = %d, V = %d\n", M, I, V);
if (W > 0)
printf("W = %d, X = %d\"", W, X);
else
printf("W = 0 (computes all LMSPs)\"");
printf("\n}\n");
printf("s {\n \"%s\" 1 %d\n \"%s\" 1 %d\n}\n",
argv[1], len1, argv[2], len2);
printf("k {\n \"significance\"\n}\n");
}
if (W > 0) {
bld_table();
search();
} else
get_msps();
sort_msps();
if (!alignments)
printf("\n\n Seq. 1 Seq. 2 len score signif W X\n");
/* print the matches for each sequence */
for (i = 0; i < numMSPs; ++i)
print_msp(i);
if (W == 0 && print_stats) {
printf("\n%3d MSPs above significance level 0.99\n", sig99);
printf("%3d MSPs above significance level 0.90\n", sig90);
printf("%3d MSPs above significance level 0.50\n", sig50);
printf("\nPercent of MSPs missed by BLAST at various");
printf(" W and significance levels:\n");
printf(" W: 0.99 0.90 0.50\n");
for (i = 4; i <= 12; ++i)
printf("%2d: %4.1f%% %4.1f%% %4.1f%%\n",
i,
100*missW[i][0]/((float)sig99),
100*missW[i][1]/((float)sig90),
100*missW[i][2]/((float)sig50));
printf("\nPercent of MSPs missed by BLAST at various");
printf(" X and significance levels:\n");
printf(" X: 0.99 0.90 0.50\n");
for (i = 1; i <= 20; ++i)
printf("%2d: %4.1f%% %4.1f%% %4.1f%%\n",
i,
100*missX[i][0]/((float)sig99),
100*missX[i][1]/((float)sig90),
100*missX[i][2]/((float)sig50));
}
if (W > 0 && print_stats) {
printf("\n\nStatistics:\n");
printf(" %s: M = %d, I = %d, V = %d, K = %d, W = %d, X = %d\n",
argv[0], M, I, V, K, W, X);
printf(" # of word hits = %d\n", numhits);
printf(" # of matches found = %d\n", numMSPs);
}
if...