############################################################## # Constants to change ############################################################## # paths to programs - if they are globally installed, just leave it as it is: rnalocmin = "RNAlocmin" rnasubopt = "RNAsubopt" # parameters to pass to both RNA(sobopt, locmin) programs # usually just "-d x" for dangling ends, or "-P path.par" for parameter file params = "-d 0" # maximal xi to pre-recompute maximal_xi = 2.5 ############################################################## # PROGRAM - do not change, unless you know what are you doing ############################################################## import collections as c import sys import time from subprocess import call, PIPE, Popen import math import os #time of beginning time_local = c.defaultdict(float) time_local[0] = time.time() def which(program): """ Simulate "which" command from unix -- tries to find out if the program from input is executable or installed. @param program - filename - program to check @return filename - filename of executable program or None if there does not exist such a program """ def is_exe(fpath): return os.path.isfile(fpath) and os.access(fpath, os.X_OK) fpath, fname = os.path.split(program) if fpath: if is_exe(program): return program else: for path in os.environ["PATH"].split(os.pathsep): path = path.strip('"') exe_file = os.path.join(path, program) if is_exe(exe_file): return exe_file return None def getLTime(timer = 0): """ Gets time from last call of getLTime(). @param timer - int - which timer @return float - time in secs """ global time_local if timer not in time_local: time_local[timer] = time_start res = time.time() - time_local[timer] time_local[timer] = time.time() return res def frange(start, stop, step): """ Iterable through floats. @param start - float - start of iteration @param stop - float - stop of iteration @param step - float - step of iteration @return float - next iteration """ assert step > 0.0 total = start compo = 0.0 while total < stop: yield total y = step - compo temp = total + y compo = (temp - total) - y total = temp def num_lines(fil): """ Finds out the number of lines in file fil. @param fil - string - filename @return int - number of lines in file """ if not os.path.isfile(fil): return 0 p1 = Popen("wc -l < "+fil, shell=True, stdout=PIPE) return int(p1.stdout.read()) def print_dict(d, name=None): """ Prints a directory to output in sorted, nice format @param d - dict(any) - directory to print @param name - string - name of directory """ if name: print name for a in sorted(d): print a, d[a] def read(lmfile, highest): """ Reads file in lmfile to info structure; @param lmfile - string - filename to read @param highest - float - highest energy to think of @return info structure with histogram information """ info = c.defaultdict(list) with open(lmfile, "r") as f: for i, line in enumerate(f): if i==0: seq = line else: (num, struct, energy, count) = line.strip().split() num = int(num) energy = float(energy) count = int(count) if not highest or energy <= highest: info[energy].append(count) return info def extract_bj(info, energy, cumulative = False): """ Convert info to beta_j^{ energy: continue #print info[en] for a in info[en]: arr[a]+=1 if cumulative: for i in range(len(arr)+1): if i==0: arr[i] = 0 else: arr[i] = arr[i-1] + arr[i] return arr def en_Sahoo(info, cov_param=0.1, summed=True): """ Returns first energy where coverage criterion is not met @param info - info structure - structure with histogram information @param cov_param - float - coverage parameter @param summed - bool - use sum version of convergence arameter? (leave it yes, non-summed is crap) @return float - energy of first non-covered energy """ if not cov_param: return None bjs = c.defaultdict(lambda: c.defaultdict(int)) for en in info: for j in info[en]: bjs[en][j] += 1 bjs[en][0] += 1 # sum # add energies last_en = None for en in sorted(bjs): if last_en: for j in sorted(bjs[last_en]): bjs[en][j] += bjs[last_en][j] last_en = en for en in sorted(bjs): if bjs[en][1]==0 or (summed and bjs[en][1] "+prefix+"tmp."+str(xi), shell=True) call("cat "+seq_file+" "+prefix+"tmp."+str(xi)+" > "+prefix+"tmpw."+str(xi), shell=True) call(rnalocmin+" "+params+" -s "+seq_file+" -n 1000000 < "+prefix+"tmpw."+str(xi)+" > "+prefix+"tmpe."+str(xi)+" 2> tmp.err", shell=True) with open(prefix+"tmpe."+str(xi), "r") as f: energies = [] numbers = [] for i, line in enumerate(f): if i==0: continue split = line.strip().split() if len(split) < 4: #if verbose: print >> sys.stderr, "wrong line:\n", line continue energies.append(float(split[2])) numbers.append(int(split[3])) #summ count = 0 summed = 0 for i, a in enumerate(energies): summed += a*numbers[i] count += numbers[i] if count>=samples/2: median = a count=-samples average = summed/float(samples) print >> sys.stderr, xi, median e_to_xi[median] = xi call("rm "+prefix+"tmpw."+str(xi)+" "+prefix+"tmpe."+str(xi)+" "+prefix+"tmp."+str(xi), shell=True) # end when we have all needed or it is not changing much if (highest is not None and median > highest) or (xi>1.5 and abs(last_med-median) < 0.05): break last_med = median return e_to_xi def get_xi(etoxi, energy, rescale=True): """ Get best xi for asked energy. @param etoxi - output from get_xis @param energy - float - energy of interest @param rescale - bool - if true use two closest energy and rescale linearly if false use just xi for closest energy @return float - approximate best xi for desired energy range """ #for a in sorted(etoxi): #print a, etoxi[a] #print energy last_en = -1e10 last_xi = 0.0 for en in sorted(etoxi): if energy < en: if rescale: if last_en == -1e10: return etoxi[en] else: return round(((energy-last_en)/(en-last_en)*(etoxi[en]-etoxi[last_en])+etoxi[last_en])*1000)/1000.0 else: if abs(energy-last_en) "+prefix+"."+str(xi) print >> sys.stderr, to_call call(to_call, shell=True) if lm_file == None: to_call = rnalocmin+" "+params+" -s "+seq_file+" -n "+str(minima)+er+" < "+prefix+"."+str(xi)+" > "+prefix+".lmtmp" else: to_call = rnalocmin+" "+params+" -p "+lm_file+" -n "+str(minima)+er+" < "+prefix+"."+str(xi)+" > "+prefix+".lmtmp" print >> sys.stderr, to_call call(to_call, shell=True) call("mv "+prefix+".lmtmp "+prefix+".lm", shell=True) call("rm "+prefix+"."+str(xi), shell=True) return prefix+".lm" def run_adaptive(seq_file, x=0.1, iterations=10, samples_per_run=10000, erange=None): """ Runs all - LM searching with adaptive xi scaling @param seq_file - string - filename with sequence @param x - float - criterion parameter @param iterations - int - max. how many iterations @param erange - float - energy range we are interested in (if None - just do max iterations) @param samples_per_run - int - samples per iteration @return creates file with LM in erange from mfe to sequence seq_file """ xis = c.defaultdict(float) lms = c.defaultdict(float) belows = [] time = c.defaultdict(float) multi = x # run first time xi = 1.0 getLTime() lm_file = run_RNAlocmin_once(seq_file, xi, samples_per_run) time_other = getLTime() print >> sys.stderr, "iteration no.", str(0), "in", time_other, "secs." #get minimal energy mfe = get_mfe(lm_file) #if we have energy range, set it: if erange: highest = mfe + erange else: highest = None # construct xi-to-energy table etoxi = get_xis(seq_file, highest, highest_xi=maximal_xi) getLTime() i = 1 lms[i*samples_per_run] = num_lines(lm_file)-1 time[i*samples_per_run] = time_other xis[i*samples_per_run] = xi # and iterate until we have reached final enery or maximal number of iterations below = None while (below is None or highest is None or below<=mfe+erange) and (iterations is None or i> sys.stderr, "criterion not met at energy:", below, "xi:", xi belows.append(below) # actual run lm_file = run_RNAlocmin_once(seq_file, xi, samples_per_run, lm_file=lm_file) tim_tmp = getLTime() print >> sys.stderr, "iteration no.", i-1, "in", tim_tmp, "secs." #just keeping the scores time_other += tim_tmp lms[i*samples_per_run] = num_lines(lm_file)-1 time[i*samples_per_run] = time_other xis[i*samples_per_run] = xi return (lms, time, xis) def usage(): print "Usage: python", sys.argv[0], "sequence_filename_in_FASTA [coverage parameter=0.1] [number of iterations=10] [samples per iteration=10000] [energy range=None]" print "\t make sure that the program paths in the top of this file are correct!!!" print "\tsequence file - should be in format accepted by RNAsubopt program (simple FASTA file)" print "\tcoverage parameter - recommended values <0.05 ; 0.8>; higher values yield more local minima; lower values increase the chance to get all the low energy minima" print "\tenergy range - if the coverage criterion is met in the desired energy range from MFE, the program stops before all iterations are finished (default = proceed with all iterations)" exit() #arguments parsing: arguments = len(sys.argv) if arguments < 2: usage() seq_file = sys.argv[1] # check the existence of all necessary programs and the input file if not which(rnalocmin): if which("./"+rnalocmin): rnalocmin = "./"+rnalocmin else: print "ERROR: \"" +rnalocmin+ "\" is not found or it is not an executable, please correct the location of the RNAlocmin program in the top of this script!\n" usage() if not which(rnasubopt): if which("./"+rnasubopt): rnasubopt = "./"+rnasubopt else: print "ERROR: \"" +rnasubopt+ "\" is not found or it is not an executable, please correct the location of the RNAsubopt program in the top of this script!" usage() if not os.path.isfile(seq_file): print "ERROR: input file \"" +seq_file+ "\" is not found!" usage() #default args cov_par = 0.1 iterations = 10 samples = 10000 erange = None if arguments>2: cov_par = float(sys.argv[2]) if arguments>3: iterations = int(sys.argv[3]) if arguments>4: samples = int(sys.argv[4]) if arguments>5: erange = float(sys.argv[5]) # run lms, time, xis = run_adaptive(seq_file, cov_par, iterations, samples, erange) # results printing print_dict(time, "Time (without generating xi-scale table):") print_dict(lms, "Local minima found:") print_dict(xis, "Xi-scale used:")