Darwin: Sequence Searching Facility Version 2.1, August 2000 (c) E.T.H. Zurich # Darwin Sample session (August, 2000) # # Load a small peptide file > DB := ReadDb('enolaseTEST'); Reading 8197 characters from file enolaseTEST Pre-processing input (peptides) 16 sequences within 16 entries considered DB := Peptide file(enolaseTEST(8197), 16 entries, 6637 aminoacids) > # After this command, the enolase database is loaded and # most computations will refer to this database. The name # DB can be used to refer to the database. # # Compute the counts of the amino acids in this database > Count := GetAaCount(DB); Count := [718, 262, 328, 430, 83, 183, 451, 592, 118, 403, 568, 519, 131, 234, 242, 379, 270, 54, 168, 504] > # Print the frequencies in a more readable format > print(Count); [718, 262, 328, 430, 83, 183, 451, 592, 118, 403, 568, 519, 131, 234, 242, 379, 270, 54, 168, 504] # # Now compute a Dayhoff matrix based on these frequencies. > DM := CreateOrigDayMatrix( Mutations1978, Count, 250 ); DM := DayMatrix(Peptide, pam=250, Sim: max=17.581, min=-6.486, max offdiag=7.414, del=-19.814-1.396*(k-1)) # # print it in a more readable format: > print(DM); DayMatrix(Peptide, pam=250, Sim: max=17.581, min=-6.486, max offdiag=7.414, del=-19.814-1.396*(k-1)) C 11.6 S 1.7 1.4 T 0.4 1.1 1.6 P 0.2 1.2 0.8 4.9 A 0.2 1.2 1.2 1.3 2.2 G -0.8 1.1 0.1 -0.1 1.2 4.7 N -1.1 0.7 0.4 -0.1 -0.1 0.2 2.5 D -2.9 -0.0 -0.4 -1.0 -0.3 0.1 2.0 4.4 E -3.0 -0.3 -0.7 -0.5 -0.2 -0.4 1.0 3.4 4.3 Q -2.1 -0.1 -0.4 0.6 -0.4 -1.0 0.8 1.4 2.3 2.9 H -0.4 0.0 -0.3 0.6 -0.7 -1.3 1.5 0.8 0.8 2.3 3.8 R -1.0 0.0 -0.4 0.4 -1.5 -2.3 0.1 -1.7 -1.3 1.5 1.9 6.1 K -2.4 0.0 0.1 -0.5 -1.3 -1.7 1.0 -0.3 -0.4 0.9 0.7 3.5 4.7 M -3.1 -1.8 -0.8 -2.0 -1.7 -3.4 -2.4 -3.9 -3.1 -1.1 -1.7 -0.6 0.3 7.9 I -0.7 -2.0 -0.4 -2.3 -1.4 -3.6 -2.7 -3.9 -3.4 -2.6 -2.4 -2.6 -2.7 1.4 5.6 L -3.4 -2.7 -1.7 -2.2 -2.3 -4.3 -3.1 -4.9 -4.0 -1.6 -1.5 -2.9 -2.9 3.5 1.8 5.9 V -0.1 -1.2 0.0 -1.1 -0.2 -1.8 -2.4 -3.2 -2.8 -2.0 -1.9 -2.7 -2.9 1.2 3.4 1.3 4.4 F -1.1 -2.7 -2.5 -3.6 -3.4 -4.5 -3.2 -6.0 -5.7 -3.7 -0.8 -3.8 -4.8 -0.0 0.7 1.9 -1.5 9.1 Y 2.7 -2.0 -2.0 -3.5 -3.0 -4.6 -1.6 -4.3 -4.2 -2.9 0.6 -3.2 -3.7 -2.4 -1.2 -0.4 -2.4 7.4 10.1 W -3.8 -1.5 -3.7 -3.8 -5.0 -5.9 -3.2 -6.3 -6.5 -3.2 -1.0 3.0 -2.4 -3.8 -5.2 -1.1 -5.7 1.4 1.1 17.6 > # Notice that the Dayhoff command takes two additional arguments: # a set of observed mutations (in this case the original # mutations matrix reported by Dayhoff et al.) and the PAM # distance, as it is usual, 250. # # Compute a subset of the all-against-all matching of the # enolases (first half against first half): > AllAll := GetSelfAlign( DB, DM, 100, 1,1,2 ); # cost/length ratio bound below by 0.800 # Pairs reaching the goal (set at 100): Match(1121,7553): Match(4775,7555): Match(3724,7555): AllAll := [Match(0,1121,7553,0,0,250), Match(0,4775,7555,0,0,250), Match(0,3724,7555,0,0,250)] > # The matchings are described, at this stage, by two offsets # into the amino acid database. # # The matchings are printed as they appear and are also returned # as an array, in this case assigned to the variable AllAll. # These matchings reached the goal of 100 (third parameter) # according to the Dayhoff matrix passed as second argument. # # Running the dynamic programming algorithm (Needleman & Wunsch) # up to a point where a maximum is found is called "GlobalAlign" > m := GlobalAlign(AllAll[1],DM); m := Match(293.6,1121,7553,121,120,250) > # Now print this refined match: > print(m); lengths=121,120 simil=293.6, PAM_dist=250, offsets=1121,7553, identity=62.3% DE=P1;C35948A23850 Enolase (EC 4.2.1.11), skeletal muscle - Chicken DE=P1;G35268G17551b E.coli pyrG (complete cds) and eno (partial cds) mRNAs encoding CTP synthetase and enolase, respectively. IQKIHAREILDSRGDPTVEVDLHTAKGHF_RAAVPSGASTGIHEALELRDGDKKRFLGKGVLKAVEHINKTIGPALIEKK |.||.:|||:||||:||||.::|...|.. .||.|||||||.:||||||||||:|||||||.|||..:|.:|::|||.|. IVKIIGREIIDSRGNPTVEAEVHLEGGFVGMAAAPSGASTGSREALELRDGDKSRFLGKGVTKAVAAVNGPIAQALIGKD ISVVEQEKIDKVVIEMDGTENKSKFGANAILGVSLAVSHAGA .: :|..|||::|::||||:||||||||||:||||.::|:| AK__DQAGIDKIMIDLDGTEKKSKFGANAILAVSLANAKAAA > # A matching is printed aligned, and the middle line contains # characters which make it easier to estimate the quality of the # similarity. A "|" indicates an exact match, an "!" # indicates a very likely mismatch, a ":" a likely mismatch, # a "." an unlikely mismatch, and a space a very unlikely # mismatch. # # Running the dynamic programming from left to right and then # right to left until it stabilizes usually gives a better # alignment. This is called " LocalAlign": > print( LocalAlign(m,DM)); lengths=122,121 simil=293.6, PAM_dist=250, offsets=1120,7552, identity=61.8% DE=P1;C35948A23850 Enolase (EC 4.2.1.11), skeletal muscle - Chicken DE=P1;G35268G17551b E.coli pyrG (complete cds) and eno (partial cds) mRNAs encoding CTP synthetase and enolase, respectively. SIQKIHAREILDSRGDPTVEVDLHTAKGHF_RAAVPSGASTGIHEALELRDGDKKRFLGKGVLKAVEHINKTIGPALIEK :|.||.:|||:||||:||||.::|...|.. .||.|||||||.:||||||||||:|||||||.|||..:|.:|::|||.| KIVKIIGREIIDSRGNPTVEAEVHLEGGFVGMAAAPSGASTGSREALELRDGDKSRFLGKGVTKAVAAVNGPIAQALIGK KISVVEQEKIDKVVIEMDGTENKSKFGANAILGVSLAVSHAGA ..: :|..|||::|::||||:||||||||||:||||.::|:| DAK__DQAGIDKIMIDLDGTEKKSKFGANAILAVSLANAKAAA > # The following statement creates 300 Dayhoff matrices, one # for every possible PAM distance between 1 and 300. > DMS := CreateOrigDayMatrix( Mutations1978, Count, 1..300 ): > # Compare the values printed for the first Dayhoff matrix with # the values for PAM=20. The smaller the PAM distance, the # higher the penalty for mismatches (negative off-diagonal # entries and deletion penalties) > print(DMS[20]); DayMatrix(Peptide, pam=20, Sim: max=20.628, min=-27.969, max offdiag=3.596, del=-27.968-1.396*(k-1)) C 18.4 S -0.9 10.8 T -8.2 1.3 12.3 P -8.4 -1.9 -5.3 13.5 A -8.1 -2.0 -1.7 -3.2 8.8 G -11.7 -3.4 -9.5 -9.8 -4.7 9.9 N -15.1 -1.3 -3.5 -9.6 -8.6 -6.8 11.8 D -21.2 -7.8 -8.7-14.0 -8.7 -7.8 -0.7 11.0 E -21.2 -8.7-10.8 -9.8 -6.9 -9.4 -7.3 -0.0 10.8 Q -17.7 -7.5 -7.4 -3.2 -7.5-11.0 -6.3 -5.7 0.5 14.1 H -6.2 -7.6 -8.8 -3.9-11.0-13.0 0.9 -6.3 -8.0 3.6 16.1 R -9.1 -5.0-10.1 -5.8-12.8-15.9-10.7-19.2-17.0 -2.4 -1.2 13.3 K -19.5 -6.6 -4.8-10.0-12.7-12.4 -3.5 -9.6 -9.2 -4.4 -8.7 -0.7 10.4 M -20.8 -9.9 -7.2-13.7-10.8-15.5-18.1-22.3-14.4 -7.0-16.9 -8.7 -4.9 16.2 I -8.8-13.2 -5.6-15.6-11.6-21.8-11.6-16.2-13.0-14.4-15.5-11.3-12.4 -5.6 11.5 L -21.8-13.5-10.4-10.8-11.9-17.7-12.7-23.4-16.9 -7.6 -8.3-14.5-14.0 -1.7 -5.4 10.2 V -7.0-11.1 -5.0 -9.6 -6.1-10.6-15.1-15.9-13.3-11.0 -9.2-13.6-16.1 -5.3 -1.1 -5.7 10.5 F -17.4 -9.7-13.1-14.5-14.5-14.8-14.9-25.9-25.1-20.0 -7.5-14.8-23.3 -8.3 -6.2 -4.9-14.7 14.0 Y -2.7-10.3 -9.1-20.5-13.3-22.7 -7.9-21.1-14.6-17.5 -3.3-16.0-14.8-20.3-12.3-11.4-12.6 2.1 15.4 W -20.6 -7.2-19.0-20.6-22.4-24.0-13.2-26.0-28.0-19.3 -9.1 -3.2-19.4-22.0-24.3 -9.5-25.6 -6.8 -7.5 20.6 > # Now find the most likely PAM number for the previous match: > FindBestPam(m,DMS); Match(542.2,1121,7553,121,120,53.6153,71.4636) > # The last argument of the Match, the PAM number, indicates # the time since divergence in PAM units. # # We can also search for an arbitrary string in the database: > String( CaseSearchString( 'LPVPAFNVINGGSHA', DB[string] )); String(211) > # or searching for all the occurrences of an amino acid sequence: > String( SearchDb( 'SFRDA' )); String(Entry(1,6,13,16)) #String( SearchSeqDb( 'AV' )); > # To see all the selectors for DB type "?database" # # Find the longest repetition within the database: > FindLongestRep(DB); The longest repetition is 167 chars long Match(0,8019,2939,167,167) > # Darwin also allows us to do arithmetic and general programming, # for example: > a := 1; a := 1 > x := 0.01; x := 0.01000000 > for b from 10 by 5 to 20 do printf( '(a+x)^%d = %g\n', b, (a+x)^b ) od; (a+x)^10 = 1.10462 (a+x)^15 = 1.16097 (a+x)^20 = 1.22019 > > A := [[1,0,0],[0,2,1],[1,0,3]]; A := [[1, 0, 0], [0, 2, 1], [1, 0, 3]] > print(A*A); 1 0 0 1 4 5 4 0 9 > GaussElim( A, [1,1,1] ); [1, 0.5000, 0] > A * "; [1, 1, 1] > SqrtA := exp(ln(A)/2); SqrtA := [[1, 0, 0], [-0.04818816, 1.4142, 0.3178], [0.3660, 0, 1.7321]] > print(SqrtA*SqrtA); 1.000000000 0.000000000 0.000000000 0.000000000 2.000000000 1.000000000 1.000000000 0.000000000 3.000000000 > # How much CPU time did all of this take? > time(); 1.2800 > #Now, to end the session you can type "done" yourself. #When you do, some data will be displayed. This describes #the internal workings of the alignment algorithms, and then #finally, after every darwin session, the total cpu time used. # #Happy Darwining! > done; 207409 calls to self_match function 47874 pairs eliminated due to 1 previous character matching 7199 pairs eliminated due to 2 previous characters matching 3422 pairs eliminated due to 3 previous characters matching 1651 pairs eliminated due to 4 previous characters matching 542 pairs eliminated by backwards dynamic programming 80556 pairs analyzed as single pairs 3 pairs reached the goal 1.28 secs of cpu used