##############################################################
# Constants to change
##############################################################


# paths to programs - if they are globally installed, just leave it as it is:
rnalocmin = "RNAlocmin"
rnasubopt = "RNAsubopt"

# parameters to pass to both RNA(sobopt, locmin) programs
#  usually just "-d x" for dangling ends, or "-P path.par" for parameter file 
params = "-d 0"

# maximal xi to pre-recompute
maximal_xi = 2.5


##############################################################
# PROGRAM - do not change, unless you know what are you doing
##############################################################
import collections as c
import sys
import time
from subprocess import call, PIPE, Popen
import math
import os

#time of beginning
time_local = c.defaultdict(float)
time_local[0] = time.time()

def which(program):
	"""
	Simulate "which" command from unix -- tries to find out if the program from input is executable or installed.

	@param program - filename - program to check
	@return filename - filename of executable program or None if there does not exist such a program
	"""
  def is_exe(fpath):
    return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

  fpath, fname = os.path.split(program)
  if fpath:
    if is_exe(program):
      return program
  else:
    for path in os.environ["PATH"].split(os.pathsep):
      path = path.strip('"')
      exe_file = os.path.join(path, program)
      if is_exe(exe_file):
        return exe_file

  return None

def getLTime(timer = 0):
	"""
	Gets time from last call of getLTime().

	@param timer - int - which timer
	@return float - time in secs
	"""
	global time_local
	if timer not in time_local:
		time_local[timer] = time_start
	res = time.time() - time_local[timer] 
	time_local[timer] = time.time()
	return res
	
def frange(start, stop, step):
	"""
	Iterable through floats.
	
	@param start - float - start of iteration
	@param stop - float - stop of iteration
	@param step - float - step of iteration
	@return float - next iteration
	"""
	assert step > 0.0
	total = start
	compo = 0.0
	while total < stop:
		yield total
		y = step - compo
		temp = total + y
		compo = (temp - total) - y
		total = temp
		
def num_lines(fil):
	"""
	Finds out the number of lines in file fil.
	
	@param fil - string - filename
	@return int - number of lines in file
	"""
	if not os.path.isfile(fil): return 0
	p1 = Popen("wc -l < "+fil, shell=True, stdout=PIPE)
	return int(p1.stdout.read())

def print_dict(d, name=None):
	"""
	Prints a directory to output in sorted, nice format
	
	@param d - dict(any) - directory to print
	@param name - string - name of directory
	"""
	if name: print name
	for a in sorted(d):
		print a, d[a]

def read(lmfile, highest):
	"""
	Reads file in lmfile to info structure;

	@param lmfile - string - filename to read
	@param highest - float - highest energy to think of
	@return info structure with histogram information
	"""

	info = c.defaultdict(list)
	with open(lmfile, "r") as f:
		for i, line in enumerate(f):
			if i==0:
				seq = line
			else:
				(num, struct, energy, count) = line.strip().split()
				num = int(num)
				energy = float(energy)
				count = int(count)
				if not highest or energy <= highest: 
					info[energy].append(count)
	return info

def extract_bj(info, energy, cumulative = False):
	"""
	Convert info to beta_j^{<e} information 
	
	@param info - info struct - structure with histogram information about b_j
	@param energy - float - energy we are interested in 
	@param cumulative - bool - make the beta_j cumulative?
	@return dict(int) - dictionary of beta_j of energy "energy"
	"""
	arr = c.defaultdict(int)
	for en in info:
		if en > energy: continue
		#print info[en]
		for a in info[en]:
			arr[a]+=1
			
	if cumulative:
		for i in range(len(arr)+1):
			if i==0: arr[i] = 0
			else: arr[i] = arr[i-1] + arr[i]
	return arr
	
def en_Sahoo(info, cov_param=0.1, summed=True):
	"""
	Returns first energy where coverage criterion is not met
	
	@param info - info structure - structure with histogram information
	@param cov_param - float - coverage parameter
	@param summed - bool - use sum version of convergence arameter? (leave it yes, non-summed is crap)
	@return float - energy of first non-covered energy
	"""
	if not cov_param:
		return None

	bjs = c.defaultdict(lambda: c.defaultdict(int))
	for en in info:
		for j in info[en]:
			bjs[en][j] += 1
			bjs[en][0] += 1 # sum

	# add energies
	last_en = None
	for en in sorted(bjs):
		if last_en:
			for j in sorted(bjs[last_en]):		
				bjs[en][j] += bjs[last_en][j]
		last_en = en

	for en in sorted(bjs):
		if bjs[en][1]==0 or (summed and bjs[en][1]<cov_param*bjs[en][0]) or (not summed and bjs[en][1]<cov_param*bjs[en][2]):
			continue # criterion met
		return en

	return en

def get_xis(seq_file, highest, highest_xi=2.5, gap=0.1):
	"""
	Get energy to best_xi approximation by sampling
	
	@param seq_file - string - file with sequence of interest
	@param highest - float - highest energy to look xi
	@param highest_xi - float - highest xi to look for energy
	@param gap - float - xi step
	@return dict[energy] = best xi
	"""
	samples = 1000
	last_med = 500
	e_to_xi = c.defaultdict(float)
	prefix = seq_file.split('.')[0]
	for xi in frange(0.4, highest_xi+0.001, gap):
		call(rnasubopt+" "+params+" -p "+str(samples)+" --betaScale="+str(xi)+" < "+seq_file+" > "+prefix+"tmp."+str(xi), shell=True)
		call("cat "+seq_file+" "+prefix+"tmp."+str(xi)+" > "+prefix+"tmpw."+str(xi), shell=True)
		call(rnalocmin+" "+params+" -s "+seq_file+" -n 1000000 < "+prefix+"tmpw."+str(xi)+" > "+prefix+"tmpe."+str(xi)+" 2> tmp.err", shell=True)	
		with open(prefix+"tmpe."+str(xi), "r") as f:
			energies = []
			numbers = []
			for i, line in enumerate(f):
				if i==0: continue
				split = line.strip().split()
				if len(split) < 4:
					#if verbose: print >> sys.stderr, "wrong line:\n", line 
					continue
				energies.append(float(split[2]))
				numbers.append(int(split[3]))
		
		#summ
		count = 0
		summed = 0
		for i, a in enumerate(energies):
			summed += a*numbers[i]
			count += numbers[i]
			if count>=samples/2:
				median = a
				count=-samples

		average = summed/float(samples)
		print >> sys.stderr, xi, median
		e_to_xi[median] = xi 
		call("rm "+prefix+"tmpw."+str(xi)+" "+prefix+"tmpe."+str(xi)+" "+prefix+"tmp."+str(xi), shell=True)
		# end when we have all needed or it is not changing much 
		if (highest is not None and median > highest) or (xi>1.5 and abs(last_med-median) < 0.05): break
		last_med = median
	return e_to_xi

def get_xi(etoxi, energy, rescale=True):
	"""
	Get best xi for asked energy.
	
	@param etoxi - output from get_xis
	@param energy - float - energy of interest
	@param rescale - bool - if true use two closest energy and rescale linearly 
													if false use just xi for closest energy
	@return float - approximate best xi for desired energy range
	"""
	#for a in sorted(etoxi):
		#print a, etoxi[a]
	#print energy
	last_en = -1e10
	last_xi = 0.0
	for en in sorted(etoxi):
		if energy < en:
			if rescale: 
				if last_en == -1e10:
					return etoxi[en]
				else:
					return round(((energy-last_en)/(en-last_en)*(etoxi[en]-etoxi[last_en])+etoxi[last_en])*1000)/1000.0
			else:
				if abs(energy-last_en)<abs(en-energy):
					return etoxi[last_en]
				else:
					return etoxi[en]
		last_en = en
		last_xi = etoxi[en]	
	return last_xi

def get_mfe(lm_file):
	"""
	Gets energy of first(lowest) LM from lm_file
	
	@param lm_file - string - filename
	@return float - energy of first LM in file
	"""
	with open(lm_file, "r") as f:
		mfe = float(f.readlines()[1].strip().split()[2])
		return mfe

def run_RNAlocmin_once(seq_file, xi, samples, minima=1000000, erange=None, lm_file=None):
	"""
	Run one iteration of adaptive sampling and LM finding
	
	@param seq_file - string - file for sequence file
	@param xi - float - xi scale for sampling
	@param samples - int - number of samples
	@param erange - float - range of energy we are interested in
	@param lm_file - string - if not first run - lm_file is available, use it 
	@return string - filename, where new lm_file is stored
	"""
	prefix = seq_file.split('.')[0]
	if erange!=None:
		er = " --eRange=="+erange
	else: er = ""
	global iteration_num
	to_call = rnasubopt+" "+params+" -p "+str(samples)+" --betaScale="+str(xi)+" < "+seq_file+" > "+prefix+"."+str(xi)
	print >> sys.stderr, to_call
	call(to_call, shell=True)
	if lm_file == None:
		to_call = rnalocmin+" "+params+" -s "+seq_file+" -n "+str(minima)+er+" < "+prefix+"."+str(xi)+" > "+prefix+".lmtmp"
	else:
		to_call = rnalocmin+" "+params+" -p "+lm_file+" -n "+str(minima)+er+" < "+prefix+"."+str(xi)+" > "+prefix+".lmtmp"
	print >> sys.stderr, to_call
	call(to_call, shell=True)
	call("mv "+prefix+".lmtmp "+prefix+".lm", shell=True)
	call("rm "+prefix+"."+str(xi), shell=True)
	return prefix+".lm"

def run_adaptive(seq_file, x=0.1, iterations=10, samples_per_run=10000, erange=None):
	"""
	Runs all - LM searching with adaptive xi scaling
	@param seq_file - string - filename with sequence
	@param x - float - criterion parameter 
	@param iterations - int - max. how many iterations
	@param erange - float - energy range we are interested in (if None - just do max iterations)
	@param samples_per_run - int - samples per iteration
	
	@return creates file with LM in erange from mfe to sequence seq_file
	"""
	xis = c.defaultdict(float)
	lms = c.defaultdict(float)
	belows = []
	time = c.defaultdict(float)
	multi = x

	# run first time
	xi = 1.0
	getLTime()
	lm_file = run_RNAlocmin_once(seq_file, xi, samples_per_run)
	time_other = getLTime()
	print >> sys.stderr, "iteration no.", str(0), "in", time_other, "secs."
	
	#get minimal energy
	mfe = get_mfe(lm_file)
	
	#if we have energy range, set it:
	if erange:
		highest = mfe + erange
	else: highest = None
	
	# construct xi-to-energy table
	etoxi = get_xis(seq_file, highest, highest_xi=maximal_xi)
	getLTime()
	i = 1
	lms[i*samples_per_run] = num_lines(lm_file)-1
	time[i*samples_per_run] = time_other
	xis[i*samples_per_run] = xi

	# and iterate until we have reached final enery or maximal number of iterations
	below = None
	while (below is None or highest is None or below<=mfe+erange) and (iterations is None or i<iterations):
		i += 1
		
		#read the created lm-file
		info = read(lm_file, highest)
		#get energy of interest
		below = en_Sahoo(info, multi)
		#get corresponding xi-scale for the energy
		xi = get_xi(etoxi, below, True)
		print >> sys.stderr, "criterion not met at energy:", below, "xi:", xi
		belows.append(below)
		
		# actual run
		lm_file = run_RNAlocmin_once(seq_file, xi, samples_per_run, lm_file=lm_file)
		tim_tmp = getLTime()
		print >> sys.stderr, "iteration no.", i-1, "in", tim_tmp, "secs."
		
		#just keeping the scores
		time_other += tim_tmp
		lms[i*samples_per_run] = num_lines(lm_file)-1
		time[i*samples_per_run] = time_other
		xis[i*samples_per_run] = xi

	return (lms, time, xis)

def usage():
	print "Usage: python", sys.argv[0], "sequence_filename_in_FASTA [coverage parameter=0.1] [number of iterations=10] [samples per iteration=10000] [energy range=None]"
	print "\t make sure that the program paths in the top of this file are correct!!!"
	print "\tsequence file - should be in format accepted by RNAsubopt program (simple FASTA file)"
	print "\tcoverage parameter - recommended values <0.05 ; 0.8>; higher values yield more local minima; lower values increase the chance to get all the low energy minima"
	print "\tenergy range - if the coverage criterion is met in the desired energy range from MFE, the program stops before all iterations are finished (default = proceed with all iterations)"
	exit()

#arguments parsing:
arguments = len(sys.argv)
if arguments < 2:
	usage()

seq_file = sys.argv[1]

# check the existence of all necessary programs and the input file
if not which(rnalocmin): 
	if which("./"+rnalocmin): rnalocmin = "./"+rnalocmin
	else: 
		print "ERROR: \"" +rnalocmin+ "\" is not found or it is not an executable, please correct the location of the RNAlocmin program in the top of this script!\n"
		usage()
if not which(rnasubopt): 
	if which("./"+rnasubopt): rnasubopt = "./"+rnasubopt
	else: 
		print "ERROR: \"" +rnasubopt+ "\" is not found or it is not an executable, please correct the location of the RNAsubopt program in the top of this script!"
		usage()

if not os.path.isfile(seq_file): 
	print "ERROR: input file \"" +seq_file+ "\" is not found!"
	usage()

#default args
cov_par = 0.1
iterations = 10
samples = 10000
erange = None

if arguments>2:
	cov_par = float(sys.argv[2])
if arguments>3:
	iterations = int(sys.argv[3])
if arguments>4:
	samples = int(sys.argv[4])
if arguments>5:
	erange = float(sys.argv[5])

# run
lms, time, xis = run_adaptive(seq_file, cov_par, iterations, samples, erange)
# results printing
print_dict(time, "Time (without generating xi-scale table):")
print_dict(lms, "Local minima found:")
print_dict(xis, "Xi-scale used:")