##############################################################
# constants to change
##############################################################

# paths to programs
rnalocmin = "~/coding/RNAlocmin/bin/RNAlocmin"
rnasubopt = "RNAsubopt"   
gfold = "~/software/gfold/src/gfold" 						# needed only for -k/--pknots option
bhgbuilder = "~/coding/DSUeval/bin/BHGbuilder"  # needed only for --bhg option
rnafold = "RNAfold"

# params - parameters to pass to both RNA(subopt, locmin) programs
#  usually just "-d x" for dangling ends, or "-P path.par" for parameter file 
# params_gen , params_lm -- parameters for sructure generating program (RNAsubopt / gfold)
params = "-d0 -P ~/software/ViennaRNA-2.1.1/misc/rna_turner1999.par"
params_gen = ""
params_lm = ""

# maximal xi to precompute
lowest_xi = 1.0
maximal_xi = 10.0

# shall the script leave all generated files? 
leave_intermeds = True

##############################################################
# INIT -- changes from now on on your own risk!
##############################################################
import collections as c
import sys
import argparse
import time
from subprocess import call, PIPE, Popen
import math
import os.path

# time of beginning
time_local = c.defaultdict(float)
time_local[0] = time.time()

#arguments:
parser = argparse.ArgumentParser(description='Script to execute the adaptive LM searching in the RNA folding landscape.', usage='First change the paths to programs in the beginning of %(prog)s, then run "python %(prog)s -I <input_file> [options]"')
parser.add_argument("-i", "--iterations", 	action="store", type=int, 										default=10, 				help="Number of iterations in the adaptive searching step. (Equals number of changes of scaling temperature.)")
parser.add_argument("-I", "--infile",				action="store", type=str, 										required=True, 			help="Input file with a sequence in FASTA format.")
parser.add_argument("-c", "--convergence", 	action="store", type=float, 									default=0.1, 				help="Convergence parameter. Use values between 0.1 and 0.8. Higher values shifts the searching into higher energies.")
parser.add_argument("-s", "--samples", 			action="store", type=int, 										default=10000, 			help="Number of samples to generate in each iteration.")
parser.add_argument("-b", "--bhg", 					action="store", type=int, 										default=0, 					help="Number of non-shallow minima to keep for BHG generation. 0 means BHG is not generated.")
parser.add_argument("--shift", 							action="store_true", 																		 					help="Use the shift move set when discarding shallow minima.")
parser.add_argument("-e", "--erange", 			action="store", type=float, 									default=None, 			help="Energy range (in kcal/mol) from MFE in which we are interested in, do not compute further.")
parser.add_argument("-k", "--pknots", 			action="store_true", 																							help="Use pseudoknots? WARNING: MUCH slower! only up to 100nt recommended.") 
parser.add_argument("--use-lmns", 					action="store_true", 																							help="Use old .lmns file if found") 
parser.add_argument("--skip-run",						action="store_true", 																							help="Skip the main program run? Just do the BHG generation.")
parser.add_argument("--shallow-start",			action="store", type=float, 									default=1.0,				help="When discarding shallow minima, start at this value of shallowness. (in kcal/mol)")
parser.add_argument("--not-mfe",						action="store_true",																							help="Do not append MFE (without pseudoknots) structure to the BHG computation. Appending requires RNAfold to be installed.")
#parser.add_argument("-outfile", nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="Output file -- ")
#parser.add_argument("-I", '--infile',				action="store", type=argparse.FileType('r'), 	default=sys.stdin, 	help="Input file -- sequence in FASTA format.")

args = parser.parse_args()

# pseudoknots
if args.pknots:
	rnasubopt = gfold
	params_lm += " -k"

#scaling: 
scaling_param = "--betaScale="

##############################################################
# PROGRAM
##############################################################
def get_prefix(filename):
	return '.'.join(filename.split('.')[:-1])

def getLTime(timer = 0):
	"""
	Gets time from last call of getLTime().

	@param timer - int - which timer
	@return float - time in secs
	"""
	global time_local
	if timer not in time_local:
		time_local[timer] = time_start
	res = time.time() - time_local[timer] 
	time_local[timer] = time.time()
	return res
	
def frange(start, stop, step):
	"""
	Iterable through floats.
	
	@param start - float - start of iteration
	@param stop - float - stop of iteration
	@param step - float - step of iteration
	@return float - next iteration
	"""
	assert step > 0.0
	total = start
	compo = 0.0
	while total < stop:
		yield total
		y = step - compo
		temp = total + y
		compo = (temp - total) - y
		total = temp
		
def num_lines(fil):
	"""
	Finds out the number of lines in file fil.
	
	@param fil - string - filename
	@return int - number of lines in file
	"""
	if not os.path.isfile(fil): return 0
	p1 = Popen("wc -l < "+fil, shell=True, stdout=PIPE)
	return int(p1.stdout.read())

def print_dict(d, name=None):
	"""
	Prints a directory to output in sorted, nice format
	
	@param d - dict(any) - directory to print
	@param name - string - name of directory
	"""
	if name: print >> sys.stderr,  name
	for a in sorted(d):
		print >> sys.stderr,  a, d[a]

def read(lmfile, highest):
	"""
	Reads file in lmfile to info structure;

	@param lmfile - string - filename to read
	@param highest - float - highest energy to think of
	@return info structure with histogram information
	"""

	info = c.defaultdict(list)
	with open(lmfile, "r") as f:
		for i, line in enumerate(f):
			if i==0:
				seq = line
			else:
				(num, struct, energy, count) = line.strip().split()
				num = int(num)
				energy = float(energy)
				count = int(count)
				if not highest or energy <= highest: 
					info[energy].append(count)
	return info

def extract_bj(info, energy, cumulative = False):
	"""
	Convert info to beta_j^{<e} information 
	
	@param info - info struct - structure with histogram information about b_j
	@param energy - float - energy we are interested in 
	@param cumulative - bool - make the beta_j cumulative?
	@return dict(int) - dictionary of beta_j of energy "energy"
	"""
	arr = c.defaultdict(int)
	for en in info:
		if en > energy: continue
		#print info[en]
		for a in info[en]:
			arr[a]+=1
			
	if cumulative:
		for i in range(len(arr)+1):
			if i==0: arr[i] = 0
			else: arr[i] = arr[i-1] + arr[i]
	return arr
	
def en_Sahoo(info, cov_param=0.1, summed=True):
	"""
	Returns first energy where coverage criterion is not met
	
	@param info - info structure - structure with histogram information
	@param cov_param - float - coverage parameter
	@param summed - bool - use sum version of convergence parameter? (leave it yes, non-summed is crap)
	@return float - energy of first non-covered energy
	"""
	if not cov_param:
		return None

	bjs = c.defaultdict(lambda: c.defaultdict(int))
	for en in info:
		for j in info[en]:
			bjs[en][j] += 1
			bjs[en][0] += 1 # sum

	# add energies
	last_en = None
	en = None
	for en in sorted(bjs):
		if last_en:
			for j in sorted(bjs[last_en]):		
				bjs[en][j] += bjs[last_en][j]
		last_en = en

	for en in sorted(bjs):
		#check only the energies that are dividable by 0.1 without residue, due to their abundancy
		if int(en*10000.0)%1000!=0: continue
		#print en
		
		#finally check the energy
		if bjs[en][1]==0 or (summed and bjs[en][1]<cov_param*bjs[en][0]) or (not summed and bjs[en][1]<cov_param*bjs[en][2]):
			continue # criterion met
		return en

	return en

def gen_etoxi(seq_file, xi, samples=10000):
	"""
	Generates and reads the file of structures, extracts energy and writes a new value into etoxi table.
	
	@param seq_file - string - filename where sequence is stored
	@param xi - float - xi for which these structures are generated
	@param samples - int - number of samples to do
	@return dict - energy to xi table
	"""
	etoxi = c.defaultdict(float)
	prefix = get_prefix(seq_file)
	to_call = rnasubopt+" "+params+" "+params_gen+" -p "+str(samples)+" "+scaling_param+str(xi)+" < "+seq_file+" > "+prefix+"tmp.tmp"
	print >> sys.stderr,  to_call
	call(to_call, shell=True)

	etoxi = read_xi(seq_file, etoxi, prefix+"tmp.tmp", xi)
	call("rm "+prefix+"tmp.tmp", shell=True)
	return etoxi

def read_xi(seq_file, etoxi, xi_file, xi):
	"""
	Reads the file of generated structures, extracts energy and writes a new value into etoxi table.
	
	@param seq_file - string - filename where sequence is stored
	@param etoxi - dict - energy to xi dictionary to expand
	@param xi_file - string - filename, where structures are stored
	@param xi - float - xi for which these structures were generated
	@return dict - updated etoxi table
	"""
	# crawl through the file
	summ = 0.0
	count = 0

	#print etoxi, xi_file, xi
	#first make sure the file has all the energies and it is LM file
	prefix = get_prefix(seq_file)
	to_call = rnalocmin+" "+params+" "+params_lm+" -s "+seq_file+" < "+xi_file+" > "+prefix+"tmp.lm"
	#print to_call
	call(to_call, shell=True)

	if xi in etoxi: return etoxi	

	with open(prefix+"tmp.lm", "r") as f:
		for a in f: 
			split = a.split()
			if len(split) < 2: continue
			#find the energy:
			#print split
			for a in split[1:]:
				try: en = float(a)
				except ValueError: continue
				summ += en
				count += 1
				#print en
				break

	# fill the etoxi dict
	if count!=0: etoxi[summ/float(count)] = xi

	if not leave_intermeds: call("rm "+prefix+"tmp.lm", shell=True)

	# throw out the maximum value -- unreliable
		
	return etoxi

def get_xi(etoxi, energy, rescale=True):
	"""
	Get best xi for asked energy.
	
	@param etoxi - output from get_xis
	@param energy - float - energy of interest
	@param rescale - bool - if true use two closest energy and rescale linearly 
													if false use just xi for closest energy
	@return float - approximate best xi for desired energy range
	"""
	#for a in sorted(etoxi):
		#print a, etoxi[a]
	#print energy
	last_en = -1e10
	last_xi = 0.0
	for en in sorted(etoxi):
		if energy < en:
			if rescale: 
				if last_en == -1e10:
					return etoxi[en]
				else:
					return round(((energy-last_en)/(en-last_en)*(etoxi[en]-etoxi[last_en])+etoxi[last_en])*1000)/1000.0
			else:
				if abs(energy-last_en)<abs(en-energy):
					return etoxi[last_en]
				else:
					return etoxi[en]
		last_en = en
		last_xi = etoxi[en]	
	return last_xi

def get_mfe(seq_file):
	"""
	Gets the mfe (non-pk version) by running the RNAfold.

	@param seq_file - string - filename for sequence
	@return string, string - mfe structure and its energy in string
	"""
	mfe_file = get_prefix(seq_file)+".mfe"
	to_call = rnafold+" "+params+" < "+seq_file+" > "+mfe_file
	print >> sys.stderr,  to_call
	call(to_call, shell=True)
	
	with open(mfe_file, "r") as f:
		split = f.readlines()[2].split()
		mfe = split[0]
		energy_mfe = "".join(split[1:])

	#print mfe, energy_mfe
	return mfe, energy_mfe[1:-1]

def run_RNAlocmin_once(seq_file, xi, samples, minima=1000000, erange=None, lm_file=None):
	"""
	Run one iteration of adaptive sampling and LM finding
	
	@param seq_file - string - file for sequence file
	@param xi - float - xi scale for sampling
	@param samples - int - number of samples
	@param erange - float - range of energy we are interested in
	@param lm_file - string - if not first run - lm_file is available, use it 
	@return string, string - filename, where new lm_file is stored, filename, where the samples produced as stored
	"""
	prefix = get_prefix(seq_file)
	if erange!=None:
		er = " --eRange=="+erange
	else: er = ""
	global iteration_num
	to_call = rnasubopt+" "+params+" "+params_gen+" -p "+str(samples)+" "+scaling_param+str(xi)+" < "+seq_file+" > "+prefix+"."+str(xi) + " 2> "+prefix+"tmp.err"
	print >> sys.stderr, to_call
	call(to_call, shell=True)
	if lm_file == None:
		to_call = rnalocmin+" "+params+" "+params_lm+" -s "+seq_file+" -n "+str(minima)+er+" < "+prefix+"."+str(xi)+" > "+prefix+".lmtmp" + " 2> "+prefix+"tmp.err"
	else:
		to_call = rnalocmin+" "+params+" "+params_lm+" -p "+lm_file+" -n "+str(minima)+er+" < "+prefix+"."+str(xi)+" > "+prefix+".lmtmp" + " 2> "+prefix+"tmp.err"
	print >> sys.stderr, to_call
	call(to_call, shell=True)
	call("mv "+prefix+".lmtmp "+prefix+".lm", shell=True)
	if not leave_intermeds: call("rm "+prefix+"."+str(xi)+" "+prefix+"tmp.err", shell=True)
	return prefix+".lm", prefix+"."+str(xi)

def run_adaptive(seq_file, x=0.1, iterations=10, samples_per_run=10000, erange=None):
	"""
	Runs LM searching with adaptive xi scaling

	@param seq_file - string - filename with sequence
	@param x - float - criterion parameter 
	@param iterations - int - max. how many iterations
	@param erange - float - energy range we are interested in (if None - just do max iterations)
	@param samples_per_run - int - samples per iteration
	
	@return creates file with LM in erange from mfe to sequence seq_file
	"""
	xis = c.defaultdict(float)
	lms = c.defaultdict(float)
	belows = []
	time = c.defaultdict(float)
	multi = x

	# run first time
	xi = lowest_xi
	getLTime()
	lm_file, xi_file = run_RNAlocmin_once(seq_file, xi, samples_per_run)
	time_other = getLTime()
	print >> sys.stderr, "iteration no.", str(0), "in", time_other, "secs."
	
	#get minimal energy
	mfe, mfe_en = get_mfe(seq_file)
	mfe_en = float(mfe_en)
	
	#if we have energy range, set it:
	if erange:
		highest = mfe_en + erange
	else: highest = None
	
	# construct xi-to-energy table
	etoxi = gen_etoxi(seq_file, maximal_xi)
	etoxi = read_xi(seq_file, etoxi, xi_file, xi)
	if not leave_intermeds: call("rm "+xi_file, shell=True)
	print_dict(etoxi)
	getLTime()
	i = 1
	lms[i*samples_per_run] = num_lines(lm_file)-1
	time[i*samples_per_run] = time_other
	xis[i*samples_per_run] = xi

	# and iterate until we have reached final enery or maximal number of iterations
	below = None
	while (below is None or highest is None or below<=mfe_en+erange) and (iterations is None or i<iterations):
		i += 1
		
		#read the created lm-file
		info = read(lm_file, highest)
		#get energy of interest
		below = en_Sahoo(info, multi)
		if not below:
			print >> sys.stderr, "Empty info, corrupted", lm_file, "? Exitting..."
			exit()
		#get corresponding xi-scale for the energy
		xi = get_xi(etoxi, below, True)
		print >> sys.stderr, "criterion not met at energy:", below, "xi:", xi
		belows.append(below)
		
		# actual run
		if leave_intermeds: call("cp "+lm_file+" "+lm_file+str(i-1), shell=True)
		lm_file, xi_file = run_RNAlocmin_once(seq_file, xi, samples_per_run, lm_file=lm_file)
		tim_tmp = getLTime()
		print >> sys.stderr, "iteration no.", i-1, "in", tim_tmp, "secs."
		
		# update the xi table:
		etoxi = read_xi(seq_file, etoxi, xi_file, xi)
		if not leave_intermeds: call("rm "+xi_file, shell=True)
		print_dict(etoxi)		

		#just keeping the scores
		time_other += tim_tmp
		lms[i*samples_per_run] = num_lines(lm_file)-1
		time[i*samples_per_run] = time_other
		xis[i*samples_per_run] = xi

	return (lms, time, xis)

##############################################################
# MAIN body of the program
##############################################################

#arguments parsing:
seq_file = args.infile
cov_par = args.convergence
iterations = args.iterations
samples = args.samples
erange = args.erange

#flags:
run = not args.skip_run
bhg = args.bhg


# print what is on:
print >> sys.stderr,  "#################################################"
print >> sys.stderr,  seq_file
print >> sys.stderr,  "#################################################"

#countdown
for i in range(3,0,-1):
	print >> sys.stderr,  "start in", i
	time.sleep(1)

# first fix the input:
with open(seq_file, "r") as f:
	lines = f.readlines()

if len(lines) == 0:
	print >> sys.stderr,  "ERROR: empty file", seq_file
	exit()

if len(lines) == 1:
	lines.insert(0, ">"+get_prefix(seq_file)+"\n")

seq_file = get_prefix(seq_file)+".seq2"
with open(seq_file, "w") as f:
	f.write(lines[0])
	f.write(lines[1])
	seq = lines[1].split()[0]

if len(lines)>2:
	str_file = get_prefix(seq_file)+".struct"
	with open(str_file, "w")  as f:
		f.write(lines[2].replace(':', '.'))
else:
	post_proc = False

# files:
prefix = get_prefix(seq_file)
lm_file = prefix+".lm"
lmns_file = prefix+".lmns"
lmns_tmp = prefix+".lmns_tmp"

if (run):
	# run
	lms, time, xis = run_adaptive(seq_file, cov_par, iterations, samples, erange)
	# results printing
	print_dict(time, "Time:")
	print_dict(lms, "Local minima found:")
	print_dict(xis, "Xi-scale used:")

# if we run bhg, then we have to filter shallow ones. filter shallow:
if (bhg>0):
	args.shallow_start = int(args.shallow_start*100+0.5)
	minh = args.shallow_start
	lmns_file = lm_file+"ns"
	lmns_tmp = lmns_file+"_tmp"

	if (num_lines(lmns_file+str(minh)) == 0 and num_lines(lmns_file) == 0) or not args.use_lmns:
		call("cp "+lm_file+" "+lmns_file, shell=True)
	else: 
		if num_lines(lmns_file+str(minh)) != 0:
			print >> sys.stderr,  "WARNING: Found lmns_file:", lmns_file+str(minh), ", using it..."
		else:
			print >> sys.stderr,  "WARNING: Found lmns_file:", lmns_file, ", using it..."

	if args.shift:
		shifts = " -m S"
		minh = args.shallow_start
		to_call = rnalocmin+" "+params+" "+params_lm+shifts+" -n 1000000 -v 1 -s "+seq_file+" < "+lmns_file+" > "+lmns_tmp+" 2> "+prefix+"tmp.err"
		print >> sys.stderr,  to_call
		call(to_call, shell = True)
		call("mv "+lmns_tmp+" "+lmns_file, shell=True)
		if leave_intermeds: call("cp "+lmns_file+" "+lmns_file+"S", shell=True)
		print >> sys.stderr,  "discarded non-shift LM", seq_file,
		print >> sys.stderr,  ", remains ", num_lines(lmns_file)-1
		
	else: 
		shifts = " -m I"
		

	while num_lines(lmns_file)-1>bhg:
		to_call = rnalocmin+" "+params+" "+params_lm+shifts+" -n 1000000 -v 1 --minh="+str(minh/100.0)+" -s "+seq_file+" < "+lmns_file+" > "+lmns_tmp+" 2> "+prefix+"tmp.err"
		print >> sys.stderr,  to_call
		print >> sys.stderr,  "discarding shallow ones ("+str(minh/100.0)+")", seq_file
		call(to_call, shell = True)
		call("mv "+lmns_tmp+" "+lmns_file, shell=True)
		if leave_intermeds: call("cp "+lmns_file+" "+lmns_file+str(minh), shell=True)
		print >> sys.stderr,  "discarding shallow ones ("+str(minh/100.0)+")", seq_file,
		print >> sys.stderr,  ", remains ", num_lines(lmns_file)-1
		minh += 50
	
	if not leave_intermeds: call("rm "+prefix+"tmp.err", shell=True)

	#build a bhg:
	print >> sys.stderr,  "building a BHGgraph"
	bhg_file = get_prefix(seq_file)+".bhg"
	call("cp "+lmns_file+" "+lmns_tmp, shell=True)

	# add open chain and old tructure and mfe_w/o_PK
	open_chain = "." * len(seq)
	if not args.not_mfe: mfe, mfe_en = get_mfe(seq_file)
	
	with open(lmns_tmp, "a") as f:
		print >> f, "    ", open_chain, "  0.00"
		if not args.not_mfe: print >> f, "    ", mfe, mfe_en	
	
	to_call = bhgbuilder+" "+params+" "+params_lm+" < "+lmns_tmp+" > "+bhg_file
	print >> sys.stderr,  to_call
	call(to_call, shell=True)

	call("rm "+lmns_tmp, shell=True)

print >> sys.stderr,  "#################################################"
print >> sys.stderr,  "QUITTING:", seq_file
print >> sys.stderr,  "#################################################"