############################################################## # constants to change ############################################################## # paths to programs rnalocmin = "~/coding/RNAlocmin/bin/RNAlocmin" rnasubopt = "RNAsubopt" gfold = "~/software/gfold/src/gfold" # needed only for -k/--pknots option bhgbuilder = "~/coding/DSUeval/bin/BHGbuilder" # needed only for --bhg option rnafold = "RNAfold" # params - parameters to pass to both RNA(subopt, locmin) programs # usually just "-d x" for dangling ends, or "-P path.par" for parameter file # params_gen , params_lm -- parameters for sructure generating program (RNAsubopt / gfold) params = "-d0 -P ~/software/ViennaRNA-2.1.1/misc/rna_turner1999.par" params_gen = "" params_lm = "" # maximal xi to precompute lowest_xi = 1.0 maximal_xi = 10.0 # shall the script leave all generated files? leave_intermeds = True ############################################################## # INIT -- changes from now on on your own risk! ############################################################## import collections as c import sys import argparse import time from subprocess import call, PIPE, Popen import math import os.path # time of beginning time_local = c.defaultdict(float) time_local[0] = time.time() #arguments: parser = argparse.ArgumentParser(description='Script to execute the adaptive LM searching in the RNA folding landscape.', usage='First change the paths to programs in the beginning of %(prog)s, then run "python %(prog)s -I [options]"') parser.add_argument("-i", "--iterations", action="store", type=int, default=10, help="Number of iterations in the adaptive searching step. (Equals number of changes of scaling temperature.)") parser.add_argument("-I", "--infile", action="store", type=str, required=True, help="Input file with a sequence in FASTA format.") parser.add_argument("-c", "--convergence", action="store", type=float, default=0.1, help="Convergence parameter. Use values between 0.1 and 0.8. Higher values shifts the searching into higher energies.") parser.add_argument("-s", "--samples", action="store", type=int, default=10000, help="Number of samples to generate in each iteration.") parser.add_argument("-b", "--bhg", action="store", type=int, default=0, help="Number of non-shallow minima to keep for BHG generation. 0 means BHG is not generated.") parser.add_argument("--shift", action="store_true", help="Use the shift move set when discarding shallow minima.") parser.add_argument("-e", "--erange", action="store", type=float, default=None, help="Energy range (in kcal/mol) from MFE in which we are interested in, do not compute further.") parser.add_argument("-k", "--pknots", action="store_true", help="Use pseudoknots? WARNING: MUCH slower! only up to 100nt recommended.") parser.add_argument("--use-lmns", action="store_true", help="Use old .lmns file if found") parser.add_argument("--skip-run", action="store_true", help="Skip the main program run? Just do the BHG generation.") parser.add_argument("--shallow-start", action="store", type=float, default=1.0, help="When discarding shallow minima, start at this value of shallowness. (in kcal/mol)") parser.add_argument("--not-mfe", action="store_true", help="Do not append MFE (without pseudoknots) structure to the BHG computation. Appending requires RNAfold to be installed.") #parser.add_argument("-outfile", nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="Output file -- ") #parser.add_argument("-I", '--infile', action="store", type=argparse.FileType('r'), default=sys.stdin, help="Input file -- sequence in FASTA format.") args = parser.parse_args() # pseudoknots if args.pknots: rnasubopt = gfold params_lm += " -k" #scaling: scaling_param = "--betaScale=" ############################################################## # PROGRAM ############################################################## def get_prefix(filename): return '.'.join(filename.split('.')[:-1]) def getLTime(timer = 0): """ Gets time from last call of getLTime(). @param timer - int - which timer @return float - time in secs """ global time_local if timer not in time_local: time_local[timer] = time_start res = time.time() - time_local[timer] time_local[timer] = time.time() return res def frange(start, stop, step): """ Iterable through floats. @param start - float - start of iteration @param stop - float - stop of iteration @param step - float - step of iteration @return float - next iteration """ assert step > 0.0 total = start compo = 0.0 while total < stop: yield total y = step - compo temp = total + y compo = (temp - total) - y total = temp def num_lines(fil): """ Finds out the number of lines in file fil. @param fil - string - filename @return int - number of lines in file """ if not os.path.isfile(fil): return 0 p1 = Popen("wc -l < "+fil, shell=True, stdout=PIPE) return int(p1.stdout.read()) def print_dict(d, name=None): """ Prints a directory to output in sorted, nice format @param d - dict(any) - directory to print @param name - string - name of directory """ if name: print >> sys.stderr, name for a in sorted(d): print >> sys.stderr, a, d[a] def read(lmfile, highest): """ Reads file in lmfile to info structure; @param lmfile - string - filename to read @param highest - float - highest energy to think of @return info structure with histogram information """ info = c.defaultdict(list) with open(lmfile, "r") as f: for i, line in enumerate(f): if i==0: seq = line else: (num, struct, energy, count) = line.strip().split() num = int(num) energy = float(energy) count = int(count) if not highest or energy <= highest: info[energy].append(count) return info def extract_bj(info, energy, cumulative = False): """ Convert info to beta_j^{ energy: continue #print info[en] for a in info[en]: arr[a]+=1 if cumulative: for i in range(len(arr)+1): if i==0: arr[i] = 0 else: arr[i] = arr[i-1] + arr[i] return arr def en_Sahoo(info, cov_param=0.1, summed=True): """ Returns first energy where coverage criterion is not met @param info - info structure - structure with histogram information @param cov_param - float - coverage parameter @param summed - bool - use sum version of convergence parameter? (leave it yes, non-summed is crap) @return float - energy of first non-covered energy """ if not cov_param: return None bjs = c.defaultdict(lambda: c.defaultdict(int)) for en in info: for j in info[en]: bjs[en][j] += 1 bjs[en][0] += 1 # sum # add energies last_en = None en = None for en in sorted(bjs): if last_en: for j in sorted(bjs[last_en]): bjs[en][j] += bjs[last_en][j] last_en = en for en in sorted(bjs): #check only the energies that are dividable by 0.1 without residue, due to their abundancy if int(en*10000.0)%1000!=0: continue #print en #finally check the energy if bjs[en][1]==0 or (summed and bjs[en][1] "+prefix+"tmp.tmp" print >> sys.stderr, to_call call(to_call, shell=True) etoxi = read_xi(seq_file, etoxi, prefix+"tmp.tmp", xi) call("rm "+prefix+"tmp.tmp", shell=True) return etoxi def read_xi(seq_file, etoxi, xi_file, xi): """ Reads the file of generated structures, extracts energy and writes a new value into etoxi table. @param seq_file - string - filename where sequence is stored @param etoxi - dict - energy to xi dictionary to expand @param xi_file - string - filename, where structures are stored @param xi - float - xi for which these structures were generated @return dict - updated etoxi table """ # crawl through the file summ = 0.0 count = 0 #print etoxi, xi_file, xi #first make sure the file has all the energies and it is LM file prefix = get_prefix(seq_file) to_call = rnalocmin+" "+params+" "+params_lm+" -s "+seq_file+" < "+xi_file+" > "+prefix+"tmp.lm" #print to_call call(to_call, shell=True) if xi in etoxi: return etoxi with open(prefix+"tmp.lm", "r") as f: for a in f: split = a.split() if len(split) < 2: continue #find the energy: #print split for a in split[1:]: try: en = float(a) except ValueError: continue summ += en count += 1 #print en break # fill the etoxi dict if count!=0: etoxi[summ/float(count)] = xi if not leave_intermeds: call("rm "+prefix+"tmp.lm", shell=True) # throw out the maximum value -- unreliable return etoxi def get_xi(etoxi, energy, rescale=True): """ Get best xi for asked energy. @param etoxi - output from get_xis @param energy - float - energy of interest @param rescale - bool - if true use two closest energy and rescale linearly if false use just xi for closest energy @return float - approximate best xi for desired energy range """ #for a in sorted(etoxi): #print a, etoxi[a] #print energy last_en = -1e10 last_xi = 0.0 for en in sorted(etoxi): if energy < en: if rescale: if last_en == -1e10: return etoxi[en] else: return round(((energy-last_en)/(en-last_en)*(etoxi[en]-etoxi[last_en])+etoxi[last_en])*1000)/1000.0 else: if abs(energy-last_en) "+mfe_file print >> sys.stderr, to_call call(to_call, shell=True) with open(mfe_file, "r") as f: split = f.readlines()[2].split() mfe = split[0] energy_mfe = "".join(split[1:]) #print mfe, energy_mfe return mfe, energy_mfe[1:-1] def run_RNAlocmin_once(seq_file, xi, samples, minima=1000000, erange=None, lm_file=None): """ Run one iteration of adaptive sampling and LM finding @param seq_file - string - file for sequence file @param xi - float - xi scale for sampling @param samples - int - number of samples @param erange - float - range of energy we are interested in @param lm_file - string - if not first run - lm_file is available, use it @return string, string - filename, where new lm_file is stored, filename, where the samples produced as stored """ prefix = get_prefix(seq_file) if erange!=None: er = " --eRange=="+erange else: er = "" global iteration_num to_call = rnasubopt+" "+params+" "+params_gen+" -p "+str(samples)+" "+scaling_param+str(xi)+" < "+seq_file+" > "+prefix+"."+str(xi) + " 2> "+prefix+"tmp.err" print >> sys.stderr, to_call call(to_call, shell=True) if lm_file == None: to_call = rnalocmin+" "+params+" "+params_lm+" -s "+seq_file+" -n "+str(minima)+er+" < "+prefix+"."+str(xi)+" > "+prefix+".lmtmp" + " 2> "+prefix+"tmp.err" else: to_call = rnalocmin+" "+params+" "+params_lm+" -p "+lm_file+" -n "+str(minima)+er+" < "+prefix+"."+str(xi)+" > "+prefix+".lmtmp" + " 2> "+prefix+"tmp.err" print >> sys.stderr, to_call call(to_call, shell=True) call("mv "+prefix+".lmtmp "+prefix+".lm", shell=True) if not leave_intermeds: call("rm "+prefix+"."+str(xi)+" "+prefix+"tmp.err", shell=True) return prefix+".lm", prefix+"."+str(xi) def run_adaptive(seq_file, x=0.1, iterations=10, samples_per_run=10000, erange=None): """ Runs LM searching with adaptive xi scaling @param seq_file - string - filename with sequence @param x - float - criterion parameter @param iterations - int - max. how many iterations @param erange - float - energy range we are interested in (if None - just do max iterations) @param samples_per_run - int - samples per iteration @return creates file with LM in erange from mfe to sequence seq_file """ xis = c.defaultdict(float) lms = c.defaultdict(float) belows = [] time = c.defaultdict(float) multi = x # run first time xi = lowest_xi getLTime() lm_file, xi_file = run_RNAlocmin_once(seq_file, xi, samples_per_run) time_other = getLTime() print >> sys.stderr, "iteration no.", str(0), "in", time_other, "secs." #get minimal energy mfe, mfe_en = get_mfe(seq_file) mfe_en = float(mfe_en) #if we have energy range, set it: if erange: highest = mfe_en + erange else: highest = None # construct xi-to-energy table etoxi = gen_etoxi(seq_file, maximal_xi) etoxi = read_xi(seq_file, etoxi, xi_file, xi) if not leave_intermeds: call("rm "+xi_file, shell=True) print_dict(etoxi) getLTime() i = 1 lms[i*samples_per_run] = num_lines(lm_file)-1 time[i*samples_per_run] = time_other xis[i*samples_per_run] = xi # and iterate until we have reached final enery or maximal number of iterations below = None while (below is None or highest is None or below<=mfe_en+erange) and (iterations is None or i> sys.stderr, "Empty info, corrupted", lm_file, "? Exitting..." exit() #get corresponding xi-scale for the energy xi = get_xi(etoxi, below, True) print >> sys.stderr, "criterion not met at energy:", below, "xi:", xi belows.append(below) # actual run if leave_intermeds: call("cp "+lm_file+" "+lm_file+str(i-1), shell=True) lm_file, xi_file = run_RNAlocmin_once(seq_file, xi, samples_per_run, lm_file=lm_file) tim_tmp = getLTime() print >> sys.stderr, "iteration no.", i-1, "in", tim_tmp, "secs." # update the xi table: etoxi = read_xi(seq_file, etoxi, xi_file, xi) if not leave_intermeds: call("rm "+xi_file, shell=True) print_dict(etoxi) #just keeping the scores time_other += tim_tmp lms[i*samples_per_run] = num_lines(lm_file)-1 time[i*samples_per_run] = time_other xis[i*samples_per_run] = xi return (lms, time, xis) ############################################################## # MAIN body of the program ############################################################## #arguments parsing: seq_file = args.infile cov_par = args.convergence iterations = args.iterations samples = args.samples erange = args.erange #flags: run = not args.skip_run bhg = args.bhg # print what is on: print >> sys.stderr, "#################################################" print >> sys.stderr, seq_file print >> sys.stderr, "#################################################" #countdown for i in range(3,0,-1): print >> sys.stderr, "start in", i time.sleep(1) # first fix the input: with open(seq_file, "r") as f: lines = f.readlines() if len(lines) == 0: print >> sys.stderr, "ERROR: empty file", seq_file exit() if len(lines) == 1: lines.insert(0, ">"+get_prefix(seq_file)+"\n") seq_file = get_prefix(seq_file)+".seq2" with open(seq_file, "w") as f: f.write(lines[0]) f.write(lines[1]) seq = lines[1].split()[0] if len(lines)>2: str_file = get_prefix(seq_file)+".struct" with open(str_file, "w") as f: f.write(lines[2].replace(':', '.')) else: post_proc = False # files: prefix = get_prefix(seq_file) lm_file = prefix+".lm" lmns_file = prefix+".lmns" lmns_tmp = prefix+".lmns_tmp" if (run): # run lms, time, xis = run_adaptive(seq_file, cov_par, iterations, samples, erange) # results printing print_dict(time, "Time:") print_dict(lms, "Local minima found:") print_dict(xis, "Xi-scale used:") # if we run bhg, then we have to filter shallow ones. filter shallow: if (bhg>0): args.shallow_start = int(args.shallow_start*100+0.5) minh = args.shallow_start lmns_file = lm_file+"ns" lmns_tmp = lmns_file+"_tmp" if (num_lines(lmns_file+str(minh)) == 0 and num_lines(lmns_file) == 0) or not args.use_lmns: call("cp "+lm_file+" "+lmns_file, shell=True) else: if num_lines(lmns_file+str(minh)) != 0: print >> sys.stderr, "WARNING: Found lmns_file:", lmns_file+str(minh), ", using it..." else: print >> sys.stderr, "WARNING: Found lmns_file:", lmns_file, ", using it..." if args.shift: shifts = " -m S" minh = args.shallow_start to_call = rnalocmin+" "+params+" "+params_lm+shifts+" -n 1000000 -v 1 -s "+seq_file+" < "+lmns_file+" > "+lmns_tmp+" 2> "+prefix+"tmp.err" print >> sys.stderr, to_call call(to_call, shell = True) call("mv "+lmns_tmp+" "+lmns_file, shell=True) if leave_intermeds: call("cp "+lmns_file+" "+lmns_file+"S", shell=True) print >> sys.stderr, "discarded non-shift LM", seq_file, print >> sys.stderr, ", remains ", num_lines(lmns_file)-1 else: shifts = " -m I" while num_lines(lmns_file)-1>bhg: to_call = rnalocmin+" "+params+" "+params_lm+shifts+" -n 1000000 -v 1 --minh="+str(minh/100.0)+" -s "+seq_file+" < "+lmns_file+" > "+lmns_tmp+" 2> "+prefix+"tmp.err" print >> sys.stderr, to_call print >> sys.stderr, "discarding shallow ones ("+str(minh/100.0)+")", seq_file call(to_call, shell = True) call("mv "+lmns_tmp+" "+lmns_file, shell=True) if leave_intermeds: call("cp "+lmns_file+" "+lmns_file+str(minh), shell=True) print >> sys.stderr, "discarding shallow ones ("+str(minh/100.0)+")", seq_file, print >> sys.stderr, ", remains ", num_lines(lmns_file)-1 minh += 50 if not leave_intermeds: call("rm "+prefix+"tmp.err", shell=True) #build a bhg: print >> sys.stderr, "building a BHGgraph" bhg_file = get_prefix(seq_file)+".bhg" call("cp "+lmns_file+" "+lmns_tmp, shell=True) # add open chain and old tructure and mfe_w/o_PK open_chain = "." * len(seq) if not args.not_mfe: mfe, mfe_en = get_mfe(seq_file) with open(lmns_tmp, "a") as f: print >> f, " ", open_chain, " 0.00" if not args.not_mfe: print >> f, " ", mfe, mfe_en to_call = bhgbuilder+" "+params+" "+params_lm+" < "+lmns_tmp+" > "+bhg_file print >> sys.stderr, to_call call(to_call, shell=True) call("rm "+lmns_tmp, shell=True) print >> sys.stderr, "#################################################" print >> sys.stderr, "QUITTING:", seq_file print >> sys.stderr, "#################################################"