import sys,os,pprint,itertools,re

import time

sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)),"../libraries/"))

import ned_basicReader as basicReader
import ned_dsspHelper as dsspHelper
import ned_anchorHelper as anchorHelper
import ned_basic as basic
import ned_commandLine as commandline
import ned_disorderHelper as disorderHelper
import ned_conservationScorer as conservationHelper
import ned_alignmentHelper as alignmentHelper
import ned_fastaHelper as fastaHelper
import ned_uniprotHelper as uniprotHelper
import ned_mutationHelper as mutationHelper
import ned_motifHelper as motifHelper
import ned_stats as stats
import ned_pfam as pfam

class proteinInfoHelper():
	def __init__(self,disorder=None,pfam=None,features=None):
		"""
		Load the utilities settings and prepare an empty data store.

		The settings come from ../settings/utilities.ini (relative to this
		file). When more than three command-line arguments are present,
		sys.argv[3] is handed to the ini loader as its second argument;
		otherwise the loader is called with notify=True.

		The disorder, pfam and features arguments are accepted but not used
		by this constructor.
		"""
		self.data = {}

		cmdline = commandline.CommandLine()

		# Directory holding this script; every relative path below hangs off it.
		scriptDir = os.path.dirname(os.path.realpath(__file__))
		iniPath = os.path.join(scriptDir,"../settings/utilities.ini")

		if len(sys.argv) > 3:
			self.options = cmdline.loadIniFile(iniPath,sys.argv[3])
		else:
			self.options = cmdline.loadIniFile(iniPath,notify=True)

		# Derived options: fasta directory under the uniprot directory and the
		# absolute form of the configured orthdb path.
		self.options["fastaDir"] = os.path.join(self.options['uniprot_dir'],"fasta")
		self.options["orthdb"] = os.path.abspath(os.path.join(scriptDir,self.options["orthdb"]))

		# Created lazily by getInfo the first time the "mutation" task runs.
		self.mutHelper = None
		
		
	def filterFeatures(self):
		if "ignoreList" not in self.data:
			self.data["ignoreList"] = [""]*len(self.data["Sequence"])
			
		try:
			if "ABV" in self.data:
			
				for i in range(0,len(self.data["Sequence"])):
					if float(self.data["ABV"][i])/float(self.data["ABS"][i]) < 0.75:
						self.data["ignoreList"][i] = "Gappy"
					 
			for feature in self.data["Feature"]:
				#print feature
				if feature['Type'] in ["DOMAIN","TRANSMEM"]:
				
					for i in range(int(feature['Start']) - 1,int(feature['End'])):
						self.data["ignoreList"][i] = feature['Type'][0:6]
						
						
				if feature['Type'] in ["TOPO_DOM"]:
					if feature['Desc'].upper().find('EXTRACELLULAR') != -1:
						
						for i in range(int(feature['Start']) - 1,int(feature['End'])):
							self.data["ignoreList"][i] = "EXTRA"
			
			for orderedRegion in self.data["RegionOrder"]:
				for i in range(int(orderedRegion[0]) - 1,int(orderedRegion[1])):
					self.data["ignoreList"][i] = "Order"
					
			for domain in self.data["domains"]:
				for hit in self.data["domains"][domain]["hits"]:
					if 'start' in hit and hit['disorder'] < float(self.options['disorderCutoff']):
						for i in range(int(hit['start']) - 1,int(hit['end'])):
							self.data["ignoreList"][i] = domain
			
			for i in range(0,len(self.data["Sequence"])):
				window_Length  = len(self.data["ignoreList"][i - 15:i+15])
				
				if self.data["ignoreList"][i - 15:i+15].count("") < float(window_Length)/3:
					if self.data["ignoreList"][i] == "":
						self.data["ignoreList"][i] = "Short"
			
			self.data["ignore"] = len(self.data["ignoreList"]) - self.data["ignoreList"].count("")
		
			
			if len(self.data["Sequence"]) - self.data["ignore"] <= 10:
				print "Too few (" + str( len(self.data["Sequence"]) - self.data["ignore"]) + ") residues in search space. At least 10 needed. ",
				for i in range(0,len(self.data["Sequence"])):
					if self.data["ignoreList"][i] == "":
						self.data["ignoreList"][i] = "Few"
				
		except Exception,e:
			print "Feature masking Failed",e
			basic.writeError(e)
			
	def overlapsFeature(self,start,match,featureDict=None):
		self.inFeatureTmp = {}
		
		if featureDict == None:
			self.featureDict = self.data["Feature_by_type"]
		else:
			self.featureDict = featureDict
			
		amb = re.compile('\.\{[^\}]*\}|\[[^\]]*\]\{[^\}]*\}|\[[^\]]*\]|\.\{[0-9]\}|[A-Za-z\.\$\^]')
		ambBits = amb.findall(match.strip("^$"))
		use = filter(lambda x: ambBits[x][0] != ".", range(0,len(ambBits)))
								
		interest = ["MOD_RES",
		"MOTIF",
		"DNA_BIND",
		"VARIANT",
		"REGION",
		"ELM",
		"METAL",
		"PDB",
		"MUTAGEN",
		"SECONDARY_STRUCTURE",
		"DOMAIN",
		#"VAR_SEQ",
		"ISOFORM"]
		
		
		for featureType in interest:
			if featureType not in self.inFeatureTmp:
					self.inFeatureTmp[featureType] = []
						
			if featureType in self.featureDict:
				for feature in self.featureDict[featureType]:
						
					try:
						if feature["Start"] != "":
							
							if "End" not in feature:
								end = feature["Start"]
								
							elif feature["End"] == "":
								end = feature["Start"]
							elif feature["End"] == "*":
								end = 100000
							else:
								end = feature["End"]
								
							try:
								check = True
								if featureType in ["1000genomes","VARIANT","MUTAGEN","MOD_RES"]:
									
									defined = [x + start  + 1 for x in use]
									if int(feature["Start"]) in defined:	
										
										if featureType not in self.inFeatureTmp:
											self.inFeatureTmp[featureType] = []
									
										if feature["Start"] == end:
											self.inFeatureTmp[featureType].append("*" + feature["Desc"].replace(",", " ") + ":" + str(feature["Start"]))
										else:
											self.inFeatureTmp[featureType].append("*" + feature["Desc"].replace(",", " ") + ":" + str(feature["Start"]) + "-" + str(feature["End"]) + "")
										
										check = False
										
								if check == True:
									if basic.inFrame(start,start + len(match) - 1,int(feature["Start"]) -1,int(end) -1):
										
										if feature["Start"] == end:
											self.inFeatureTmp[featureType].append(feature["Desc"].replace(",", " ") + ":" +  str(feature["Start"]))
										else:
											self.inFeatureTmp[featureType].append(feature["Desc"].replace(",", " ") + ":" +  str(feature["Start"]) + "-" +  str(feature["End"]) + "")
										
							except Exception,e:
								print "Error checking if feature overlaps motif ",feature,e
								basic.writeError(e)
								
								
								#####????????print featureType,"E",e
								
					except Exception,e:
						print "Error with feature: "  + str(feature)
						basic.writeError(e)
				
		return self.inFeatureTmp
			
						
	def getInfo(self,accession,orthDb="",tasks=["fasta","alignment","uniprot","disorder","pfam","conservation","mask","elm","dssp","anchor"],nested=False,alignmentPath="",fetch=False):
		self.data = {}
		self.options["seqPath"] = os.path.join(self.options["fastaDir"],accession + ".fasta")
		
		if "alignmentExtension" in self.options:
			if self.options["alignmentExtension"] == "":
				ext = ".orthaln.fas"
			else:
				ext = self.options["alignmentExtension"]
		else:
			self.options["alignmentExtension"] = ""
			ext = ".orthaln.fas"
		
		if alignmentPath == "":
			if self.options["alignmentExtension"] == "":	
				self.options["alignmentPath"] = os.path.join(self.options['alignment_dir'],"ALN",accession + ext)
			else:
				self.options["alignmentPath"] = os.path.join(self.options['alignment_dir'],accession + ext)
		else:
			self.options["alignmentPath"] = alignmentPath
		
		seqCheck = []
		####################################
		
		if orthDb != "":
			orthdb = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),orthDb))
		
		if self.options["quiet"] == "F":
			print "|",
		
		###########
		# Initialise data
		###########
		if "fasta" in tasks:
			fastahelper = fastaHelper.fastaHelper()
			fastahelper.fetchSequence(accession,self.options["fastaDir"],refetch=fetch)	
			
			if self.options["quiet"] == "F":
				print accession + " loaded |",
		
		
		####################################
		if "uniprot" in tasks:
			uniprotReader = uniprotHelper.uniprotReader()
			uniprotReader.readUniprot(accession,refetch=fetch)
			seqCheck.append(uniprotReader.data["Sequence"])
			
			#basic.printDict(uniprotReader.data)
			
			if nested:
				self.data.update({"uniprot":uniprotReader.data.items()})
			else:
				self.data.update(uniprotReader.data.items())
			
				
			if self.options["quiet"] == "F":
				print "Uniprot  |",#,uniprotReader.data.keys()
					
		###########
		# Get proteins data
		###########
		####################################
		
		####################################
		if "anchor" in tasks:
			anchorhelper = anchorHelper.anchorHelper()
			anchorhelper.getAnchorScores(accession)
			
			if nested:
				self.data.update({"anchor":anchorhelper.data.items()})
			else:
				self.data.update(anchorhelper.data.items())
				
			if self.options["quiet"] == "F":
				print " Anchor |",
				
		####################################
		if "disorder" in tasks:
			disScorer = disorderHelper.disorderScorer()
			disScorer.disorderFromFile(self.options["seqPath"],"orderedResidues")
			seqCheck.append(disScorer.data["Sequence"])
			
			if nested:
				self.data.update({"disorder":disScorer.data.items()})
			else:
				self.data.update(disScorer.data.items())
			
			if self.options["quiet"] == "F":
				print "Disorder  |",
				
		####################################
		if "pfam" in tasks:
			try:
				
				self.data["Feature_by_type"]["DOMAIN"] = []
				self.data["domains"] = []
				
				pfamParser = pfam.pfamHelper()
				pfamParser.parsePfamXML(accession,refetch=fetch)
				pfamParser.calculateDomainDisorder(disorder=disScorer)
				pfamParser.tablify()
				
				if "Sequence" in pfamParser.data:
					seqCheck.append(pfamParser.data["Sequence"])
					if len(pfamParser.data["Sequence"]) != len(seqCheck[0]):
						
						pfamParser = pfam.pfamHelper()
						pfamParser.getPfamAnnotation(accession,force="T")
						pfamParser.parsePfamXML(accession)
						pfamParser.calculateDomainDisorder(disorder=disScorer)
						pfamParser.tablify()
					
				#print "=",pfamParser.data['domains']
				#print "+",pfamParser.data.items()
				if len(pfamParser.data["Sequence"]) == len(seqCheck[0]):
					if nested:
						self.data.update({"domain":pfamParser.data.items()})
					else:
						#print "HERE"
						for key in pfamParser.data:
							#print key,pfamParser.data[key]
							self.data[key] = pfamParser.data[key]
						#self.data.update(dict(pfamParser.data.items()))
				
					for domain in pfamParser.data['domains']:
						for instance in pfamParser.data['domains'][domain]["hits"]:
							if 'evalue' not in instance:
								instance['evalue'] = 1
							
							self.data["Feature_by_type"]["DOMAIN"].append({"Desc":"PFAM:" + domain  + " d=" + "%1.2f"%instance['disorder'] + " e=" + str(instance['evalue']),"Start":int(instance["start"]),"End":int(instance["end"])})
				else:
					print "Skipping PFam data - Unequal protein lengths"
					
				#print "-",self.data["domains"]
				#print self.data["Feature_by_type"]["DOMAIN"]
				
				if self.options["quiet"] == "F":
					print "Pfam |",#,pfamParser.data.keys()
			except:
				print "Pfam failed"
				
		
		####################################
		
		if "alignment" in tasks:
			if os.path.exists(self.options["alignmentPath"]) and fetch != True:
				if self.options["quiet"] == "F":
	 				print "Alignment exists |",
			else:
				
				try:
					alignmenthelper = alignmentHelper.alignmentHelper()
					alignmenthelper.makeAlignment(accession)
					if self.options["quiet"] == "F":
						print "Orthologue alignment |",
						
				except Exception,e:
					basic.writeError(e)
					
		####################################
		if "conservation" in tasks:
			try:
				if os.path.isfile(self.options["alignmentPath"]):
					conHelper = conservationHelper.conservationScorer()
				
					conHelper.scoreAlignment(self.options["alignmentPath"],disorder=disScorer,pfam=pfamParser,features=uniprotReader.data["Feature"],makeFile=True,readFile=True)
					seqCheck.append(conHelper.data["Sequence"])

					if nested:
						self.data.update({"conservation":conHelper.data.items()}) 
					else:
						self.data.update(conHelper.data.items()) 
				else:
					print "Error " + self.options["alignmentPath"] + " does not exist"

				
			except Exception,e:
				print "Error @ conservation:",e
				raise
				
				
			if self.options["quiet"] == "F":
				print "Conservation |",#,conHelper.data.keys()
	
		####################################
		if "mutation" in tasks:
			try:
				if self.mutHelper == None:
					self.mutHelper = mutationHelper.mutationHelper()
					self.mutHelper.readMutationDb()
						
				if accession in self.mutHelper.snpData:
					if 'VARIANT' in self.data["Feature_by_type"]:
						self.data["Feature_by_type"]['VARIANT'] += self.mutHelper.snpData[accession]
				
			except Exception,e:
				print "Error @ mutation:",e

		####################################
		
		if "dssp" in tasks and "PDB" in self.data['Feature_by_type']:
			try:
				structureAvailable = False
				dsspInfo = dsspHelper.dsspHelper()
				
				self.data["SurfaceAccessibility"] = {}
				self.data["SurfaceAccessibilityMax"] = {}	
				self.data["SurfaceAccessibilityMin"] = {}
				self.data["SurfaceAccessibilityNormalised"] = {}
				self.data["SequenceCheck"] = {}
				self.data["dsspPDBchains"] = {}
				
				#from http://bmerc-www.bu.edu/needle-doc/latest/dssp-progs.html
				self.maximumAccessibility = {"A":124,"B":157.5,"C":94,"D":154,"E":187,"F":221,"G":89,"H":201,"I":193,"K":214,"L":199,"M":216,"N":161,"P":130,"Q":192,"R":244,"S":113,"T":151,"V":169,"W":264,"Y":237,"X":179,"Z":189.5}

				for region in self.data['Feature_by_type']["PDB"]:
					for i in range(0,len(region['Id'].split(":"))):
						pdbId = region['Id'].split(":")[i]
						type = region['Method'].split(":")[i]
						if type != "Model":
							try:
								dsspInfo.parseDssp(pdbId)
								dsspInfo.dataByType()
								
								for chain in region['Chain'].split(":")[i].split("/"):
									structureAvailable = True
									offset = basic.alignStrings(self.data["Sequence"],"".join(dsspInfo.data["statistics"][chain]["aa"].values()))
									
								if offset != "False":
										adj = offset - min(dsspInfo.data["statistics"][chain]['solventAccessibility'])
										
										for offset in dsspInfo.data["statistics"][chain]['solventAccessibility']:
											if offset + adj in self.data["dsspPDBchains"]:
												self.data["dsspPDBchains"][offset + adj].append(pdbId + "." + chain)
											else:
												self.data["dsspPDBchains"][offset + adj] = [pdbId + "." + chain]
		
											for key in dsspInfo.data["statistics"][chain]:
												if "dssp" + key.title() not in self.data:
													self.data["dssp" + key.title()] = {}
												
												if offset + adj in self.data["dssp" + key.title()]:
													self.data["dssp" + key.title()][offset + adj].append(dsspInfo.data["statistics"][chain][key][offset])
												else:
													self.data["dssp" + key.title()][offset + adj] = [dsspInfo.data["statistics"][chain][key][offset]]
		
											if offset > min(dsspInfo.data["statistics"][chain]['solventAccessibility']) + 5 and offset < max(dsspInfo.data["statistics"][chain]['solventAccessibility']) - 5:
												if (offset + adj)  not in self.data["SurfaceAccessibility"]:
													self.data["SurfaceAccessibility"][(offset + adj)] = []
													
												self.data["SurfaceAccessibility"][(offset + adj) ].append(float(dsspInfo.data["statistics"][chain]['solventAccessibility'][offset]))
												
												if (offset + adj) not in self.data["SequenceCheck"]:
													self.data["SequenceCheck"][(offset + adj) ] = ""
			
												self.data["SequenceCheck"][(offset + adj)] += dsspInfo.data["statistics"][chain]["aa"][offset]
								
								
								for i in self.data["SurfaceAccessibility"]:
									
									if i not in self.data["SurfaceAccessibilityNormalised"]:
										self.data["SurfaceAccessibilityNormalised"][i] = []
										
									for accessibilityScore in self.data["SurfaceAccessibility"][i]:
										self.data["SurfaceAccessibilityNormalised"][i].append(min(1,float(accessibilityScore)/self.maximumAccessibility[self.data['Sequence'][i]]))
										
							except Exception,e:
								print "\nDSSP file for " + pdbId  + " not found",e
								
				
				if structureAvailable:
					self.data["dsspSecondarystructurePercent"] = {}
					self.data["dsspSecondarystructureProp"] = {}
					self.data["dsspSecondarystructurePropCode"] = {}

					for offset in self.data["SurfaceAccessibilityNormalised"]:
						samples = len(self.data["dsspSecondarystructure"][offset])
	
						self.data["SurfaceAccessibilityMax"][offset] = max(self.data["SurfaceAccessibilityNormalised"][offset])
						self.data["SurfaceAccessibilityMin"][offset] = min(self.data["SurfaceAccessibilityNormalised"][offset])
						
						
						if "" in self.data["dsspSecondarystructure"][offset]:
							while  "" in self.data["dsspSecondarystructure"][offset]:
								self.data["dsspSecondarystructure"][offset].remove("")
	
					for offset in self.data["dsspSecondarystructure"]:
						samples = len(self.data["dsspSecondarystructure"][offset])
	
						self.data["dsspSecondarystructurePercent"][offset] = str(",".join([str(x) + ":" + "%1.1f"%(100*(float(self.data["dsspSecondarystructure"][offset].count(x))/samples)) + "%" for x in set(self.data["dsspSecondarystructure"][offset])]))
						
						proportions = [(str(x) , float(self.data["dsspSecondarystructure"][offset].count(x))/samples) for x in set(self.data["dsspSecondarystructure"][offset])]
						
						self.data["dsspSecondarystructurePropCode"][offset] = [x[0] for x in proportions]
						self.data["dsspSecondarystructureProp"][offset] = [str(x[1]) for x in proportions]
			except Exception,e:
				print "Error @ dssp:", e
				
				
		
		####################################
			
		if "elm" in tasks:
			try:
				elmInfo = motifHelper.motifHelper()
				self.data["Feature_by_type"]["ELM"] = []
				for elm in elmInfo.readELMData(accession):
					self.data["Feature_by_type"]["ELM"].append({"Desc":elm['Desc'],"Start":int(elm["Start"]),"End":int(elm["End"])})
			except Exception,e:
				print "Error @ ELM:", e

		if self.options["quiet"] == "F":
			print 
			
		if "mask" in tasks:
			self.filterFeatures()
		
		if len(basic.removeRedundency(seqCheck)) > 1:
			#if self.options["quiet"] == "F":
				print "Mismatch in protein sequences", [len(x) for x in seqCheck]

		return self.data


	def buildSLiMSearchDb(self,accessionList,tasks=["uniprot","disorder","pfam","fasta","mutation","alignment","conservation",'dssp','anchor']):
		
		header = [
				"elmDesc",
				"elmID",
				"elmStart",
				"elmEnd",
				
				"domainDesc",
				"domainStart",
				"domainEnd",
				"domainEvalue",
				"domainMeanDis",
				
				"regionDesc",
				"regionStart",
				"regionEnd",
				"regionMeanDis",
				
				
				"variantDesc",
				"variantStart",
				"variantEnd",
				"variantFrom",
				"variantTo",
				"variantdbSNP",
				
				
				"1000genomesDesc",
				"1000genomesStart",
				"1000genomesEnd",
				"1000genomesFrom",
				"1000genomesTo",
				"1000genomesId",
				"1000genomesAllelleFreq",
				"1000genomesDepth",
				"1000genomesr2",
				
				
				"mutagenDesc",
				"mutagenStart",
				"mutagenEnd",
				
				"isoformDesc",
				"isoformStart",
				"isoformEnd",
				"isoformNo",
				"isoformId",
				"isoformFrom",
				"isoformTo",
				"isoformType",
				
				"pdbDesc",
				"pdbId",
				"pdbStart",
				"pdbEnd",
				"pdbChain",
				"pdbMethod",
				"pdbResolution",
				
				
				"secondaryStructureDesc",
				"secondaryStructureStart",
				"secondaryStructureEnd",
				
				"biasDesc",
				"biasStart",
				"biasEnd",
	
				"modDesc",
				"modStart"
				]
	
		tableStr = "uniprotAcc\tproteinSequence\tcsTreeWeightedConservationScore\trlcConservationScore\trlcConservationScoreProbability\tspeciesUsedInConserationScores\tsolventAccessibility\tsecondaryStructureCode\tsecondaryStructureProp\tiupredDisorderResidueScore\tanchorScore\tmasking\t" + "\t".join(header) + "\n"
		
		count = 0
		elmInfo = motifHelper.motifHelper()
		startTime = time.time()
		for accession in accessionList[0:]:
			try:
				count += 1
					
				elapsedTime = time.time() - startTime
				print  accession + "\t" +  str(int(elapsedTime/(60*60))) + ":" +  "%d"%int((elapsedTime/60)%60) + ":" + "%d"%int(elapsedTime%60),"\t", str(count) + "/" + str(len(accessionList))
				
				#data = proteinInfo.getInfo(accession,tasks=["uniprot","disorder","pfam","fasta"])
				
				#"alignment","conservation",
				
				data = self.getInfo(accession,tasks=tasks) #"mutation" slow for some reason / Deletes the searchDB each time
				
				###################################################################################
				
				tableStr += accession + "\t"
				tableStr += data["Sequence"] + "\t"
				
				
				try:
					tableStr += ",".join(["%1.3f"%x for x in data["WCS"]]) + "\t"
				except:
					tableStr += ",".join(["0"]*len(data["Sequence"])) + "\t"
				
				try:
					tableStr += ",".join(["%1.3f"%x for x in data["WCS_W_rStdev"]]) + "\t"
				except:
					tableStr += ",".join(["0"]*len(data["Sequence"])) + "\t"
					
				try:
					tableStr += ",".join(["%1.3f"%x for x in data["WCS_W_p"]]) + "\t"
				except:
					tableStr += ",".join(["1"]*len(data["Sequence"])) + "\t"
					
				try:
					tableStr +=  str(int(data["SpeciesCount"]))  + "\t"
				except:
					tableStr +=  "1\t"

				if "dsspSecondarystructurePropCode" in data:
					accessibilityStr = []
					accessibilityExpandedStr = []
					secondaryStructureStr = []
					secondaryStructurePropStr = []
					for i in range(0,len(data["Sequence"])):
						if i in data["dsspSecondarystructurePropCode"]:
							secondaryStructureStr.append(":".join(data["dsspSecondarystructurePropCode"][i]))
							secondaryStructurePropStr.append( ":".join(["%1.2f"%float(x) for x in data["dsspSecondarystructureProp"][i]]))
						else:
							secondaryStructureStr.append("NA")
							secondaryStructurePropStr.append("NA")
						
						if i in data["SurfaceAccessibilityMax"]:
							accessibilityStr.append("%1.2f"%data["SurfaceAccessibilityMax"][i])
						else:
							accessibilityStr.append("NA")
							
						if i in data["SurfaceAccessibilityNormalised"]:
							accessibilityExpandedStr.append(":".join(["%1.2f"%x for x in data["SurfaceAccessibilityNormalised"][i]]))
						else:
							accessibilityExpandedStr.append("NA")
				else:
					accessibilityStr = ["NA"]*len(data["Sequence"])
					secondaryStructureStr = ["NA"]*len(data["Sequence"])
					secondaryStructurePropStr =["NA"]*len(data["Sequence"])
				
				tableStr += ",".join(accessibilityStr) + "\t"
				tableStr += ",".join(secondaryStructureStr)  + "\t"
				tableStr += ",".join(secondaryStructurePropStr)  + "\t"
 
 				try:
					tableStr += ",".join(["%1.2f"%x for x in data["ResidueDisorder"]]) + "\t"
				except:
					tableStr += ",".join(["1"]*len(data["Sequence"])) + "\t"
					
				try:
					tableStr += ",".join(["%1.2f"%x for x in data["anchorScore"]]) + "\t"
				except:
					tableStr += ",".join(["1"]*len(data["Sequence"])) + "\t"
				
				###################################################################################
				
				proteinData = {
				"domain":{"Start":[],"End":[],"MeanDis":[],"Desc":[],"Evalue":[]},
				"mod":{"Start":[],"Desc":[]},
				"elm":{"Start":[],"ID":[],"End":[],"Desc":[]},
				"secondaryStructure":{"Start":[],"End":[],"Desc":[]},
				"bias":{"Start":[],"End":[],"Desc":[]},
				"variant":{"Start":[],"End":[],"Desc":[],"From":[],"To":[],"dbSNP":[]},
				"1000genomes":{"Start":[],"End":[],"Desc":[],"From":[],"To":[],'AllelleFreq':[],'Depth':[],'r2':[],'Id':[]},
				"mutagen":{"Start":[],"End":[],"Desc":[],"PMID":[],"Ref":[]},
				"region":{"Start":[],"End":[],"MeanDis":[],"Desc":[]},
				"pdb":{"Desc":[],"Id":[],"Method":[],"Resolution":[],"Chain":[],"Start":[],"End":[]},
				"isoform":{"Desc":[],"Start":[],"End":[],"No":[],"Id":[],"From":[],"To":[],"Type":[]}
				}
				
				
				for elm in elmInfo.readELMData(accession):
					proteinData["elm"]["Desc"].append(elm['Desc'].replace(",",""))
					proteinData["elm"]["ID"].append(elm['ELMID'])
					proteinData["elm"]["Start"].append(str(elm["Start"]))
					proteinData["elm"]["End"].append(str(elm["End"]))
					
				
				for feature_type in data["Feature_by_type"]:
					for feature in data["Feature_by_type"][feature_type]:
						try:
							start =  str(feature['Start'])
							end = str(feature['End'])
							bits = feature['Desc'].split(" ")
							
							if feature_type in ["COMPBIAS",'REPEAT']:
								proteinData["bias"]["Desc"].append(feature_type.lower())
								proteinData["bias"]["Start"].append(start)
								proteinData["bias"]["End"].append(end)
								
							if feature_type in ["STRAND","HELIX"]:
								proteinData["secondaryStructure"]["Desc"].append(feature_type.lower())
								proteinData["secondaryStructure"]["Start"].append(start)
								proteinData["secondaryStructure"]["End"].append(end)
	
							try:
								meanDis = "%1.3f"%(sum(data["ResidueDisorder"][int(start):int(end)])/(int(end) - int(start)))
							except:
								meanDis = str(0)
						
							if feature_type in ["TRANSMEM"]:
								proteinData["domain"]["Desc"].append(feature_type)
								proteinData["domain"]["Start"].append(start)
								proteinData["domain"]["End"].append(end)
								proteinData["domain"]["MeanDis"].append(meanDis)
								proteinData["domain"]["Evalue"].append("0")
								
								
							if feature_type in ["TOPO_DOM"]:
								name = feature['Desc'].split()[0]
								if name == "Extracellular":
									proteinData["domain"]["Desc"].append(name)
									proteinData["domain"]["Start"].append(start)
									proteinData["domain"]["End"].append(end)
									proteinData["domain"]["MeanDis"].append(meanDis)
									proteinData["domain"]["Evalue"].append("0")
									
									
							if feature_type in ["MOD_RES","LIPID","CROSSLNK","CARBOHYD"] :#Okay
								proteinData["mod"]["Desc"].append(feature["Desc"].strip(" ."))
								proteinData["mod"]["Start"].append(str(start))
								
							if feature_type in ["VARIANT"]:
								#print feature
								if feature['Source'] == "1000genomes":
									try:
										if len(data["Sequence"]) >= int(feature["Start"]):
											#print data["Sequence"]
											#print data["Sequence"][int(feature["Start"])-1], feature["From"], data["Sequence"][int(feature["Start"])-1], feature["To"]
											if data["Sequence"][int(feature["Start"])-1] == feature["From"] or data["Sequence"][int(feature["Start"])-1] == feature["To"]:
												proteinData["1000genomes"]["Start"].append(str(feature["Start"]))
												proteinData["1000genomes"]["End"].append(str(end))
												
												if "From" not in feature:
													feature["From"] = "-"
													feature["To"] = "-"
													
												if feature["From"] == "-":
													feature["From"] = data["Sequence"][int(feature["Start"])]
													
												
												proteinData["1000genomes"]["From"].append(feature["From"].strip(" ."))
												proteinData["1000genomes"]["To"].append(feature["To"].strip("."))
												proteinData["1000genomes"]["Desc"].append(feature['Desc'].strip(" ."))
												proteinData["1000genomes"]["Id"].append(feature['id'])
												proteinData["1000genomes"]['AllelleFreq'].append(feature['id'])
												proteinData["1000genomes"]['Depth'].append(feature['id'])
												proteinData["1000genomes"]['r2'].append(feature['id'])
											else:
												print data["Sequence"][int(feature["Start"])-2:int(feature["Start"])+2],feature["From"]
												print "1000 genomes variant does not match sequence (Mismatch)."#,feature
										else:
												print "1000 genomes variant does not match sequence (Length)."#, len(data["Sequence"]),feature
												
									except Exception,e:
										print "Error parsing 1000 genomes feature",feature
										basic.writeError(e)
								else:
									proteinData["variant"]["Start"].append(str(feature["Start"]))
									proteinData["variant"]["End"].append(str(end))
									
									if "From" not in feature:
										feature["From"] = "-"
										feature["To"] = "-"
										
									if feature["From"] == "-":
										feature["From"] = data["Sequence"][int(feature["Start"])]
										
									proteinData["variant"]["From"].append(feature["From"].strip(" ."))
									proteinData["variant"]["To"].append(feature["To"].strip("."))
									proteinData["variant"]["Desc"].append(feature["Desc"].strip(" .").replace(",",""))
									
									if "dbSNP" not in feature:
										proteinData["variant"]["dbSNP"].append("-")
									else:
										proteinData["variant"]["dbSNP"].append(feature["dbSNP"])
									
							if feature_type in ["ISOFORM"]:
								proteinData["isoform"]["Desc"].append(feature['Desc'].replace(",",""))
								proteinData["isoform"]["Start"].append(start)
								proteinData["isoform"]["End"].append(end )
								proteinData["isoform"]["No"].append(feature['Isoforms'])
								proteinData["isoform"]["Id"].append(feature['FTid'])
								proteinData["isoform"]["From"].append(feature['From'].replace(" ",""))
								proteinData["isoform"]["To"].append(feature['To'].replace(" ",""))
								proteinData["isoform"]["Type"].append(feature['Type'])
							
							if feature_type in ["MUTAGEN"]:
								try:
									pmid =  "|".join(basic.removeRedundency(feature['PMID']))
								except:
									pmid = "NA"
									
								proteinData["mutagen"]["Desc"].append(feature['Desc'].replace(";","").replace(",","").replace("in dbSNP:","").strip(".") + ":" + pmid)
								proteinData["mutagen"]["Start"].append(str(start))
								proteinData["mutagen"]["End"].append(str(end))
								
							if feature_type in ["REGION","MOTIF","ACT_SITE","SITE","DNA_BIND","BINDING","REPEAT","METAL",'SIGNAL']:
								
								proteinData["region"]["Desc"].append(feature['Desc'].split("(")[0].strip(" .").replace(",",""))
								proteinData["region"]["Start"].append(str(start))
								proteinData["region"]["End"].append(str(end))
								proteinData["region"]["MeanDis"].append(str(meanDis))
					
							
							if feature_type in ["PDB"]:
								proteinData["pdb"]["Id"].append(feature["Id"])
								proteinData["pdb"]["Method"].append(feature["Method"])
								proteinData["pdb"]["Resolution"].append(feature["Resolution"])
								proteinData["pdb"]["Chain"].append(feature["Chain"])
								proteinData["pdb"]["Start"].append(feature["Start"])
								proteinData["pdb"]["End"].append(end)
								proteinData["pdb"]["Desc"].append(feature["Desc"].replace(",",""))
								
						except Exception,e:	
							basic.writeError(e)

				if "domains" in data:
					for domain in data["domains"]:
						for hit in data["domains"][domain]["hits"]:
							hit["start"] = int(str(hit["start"]).replace(">","").replace("<",""))
							hit["end"] = int(str(hit["end"]).replace(">","").replace("<",""))
							
							try:
								meanDis = "%1.3f"%(sum(data["ResidueDisorder"][int(hit["start"]):int(hit["end"])])/(int(hit["end"]) - int(hit["start"])))
							except:
								meanDis = 0
								
							proteinData["domain"]["Desc"].append(domain)
							proteinData["domain"]["Start"].append(str(hit["start"]))
							proteinData["domain"]["End"].append(str(hit["end"] ))
							proteinData["domain"]["MeanDis"].append(meanDis)
							
							if "evalue" in hit:
								proteinData["domain"]["Evalue"].append(hit["evalue"])
							else:
								proteinData["domain"]["Evalue"].append("1")
				

				#basic.printDict(proteinData["isoform"])
				
				###################################################################################
				
				#0 unmasked
				#1 less than IUPred cutoff
				#2 in domain
				#3 transmembrane
				#4 extracellular
				
				
				maskBitsList = list(itertools.repeat("0", len(data["Sequence"])))
				sequenceRange = range(len(data["Sequence"])) 
				
				if len(data["ResidueDisorder"]) ==  len(data["Sequence"]):
					maskBitsList = ["1" if data["ResidueDisorder"][x] < self.options['disorderCutoff'] else "0" for x in sequenceRange]
				
				for i in range(len(proteinData["domain"]["Start"])):
					try:
						pdb = False
						maskBasedonPDB = False
						
						if maskBasedonPDB:
							for j in range(len(proteinData["pdb"]["Start"])):
								try:
									if basic.inFrame(proteinData["domain"]["Start"][i],proteinData["domain"]["End"][i],proteinData["pdb"]["Start"][j],proteinData["pdb"]["End"][j]):
										pdb = True
										print proteinData["domain"]["Start"][i],proteinData["domain"]["End"][i],proteinData["pdb"]["Start"][j],proteinData["pdb"]["End"][j]
								except:
									pass
							
						if proteinData["domain"]["Desc"][i] == "TRANSMEM":
							domainRange = range(int(proteinData["domain"]["Start"][i])-1,int(proteinData["domain"]["End"][i]))
							maskBitsList = ["3" if x in domainRange else maskBitsList[x] for x in sequenceRange]
							
						elif proteinData["domain"]["Desc"][i] == "Extracellular":
							domainRange = range(int(proteinData["domain"]["Start"][i])-1,int(proteinData["domain"]["End"][i]))
							maskBitsList = ["4" if x in domainRange else maskBitsList[x] for x in sequenceRange]
							
						elif  (float(proteinData["domain"]["MeanDis"][i]) < self.options['disorderDomainCutoff'] and float(proteinData["domain"]["Evalue"][i]) < float(self.options['pfamCutoff'])):
							domainRange = range(int(proteinData["domain"]["Start"][i])-1,int(proteinData["domain"]["End"][i]))
							maskBitsList = ["2" if x in domainRange else maskBitsList[x] for x in sequenceRange]
						elif pdb == True:
							domainRange = range(int(proteinData["domain"]["Start"][i])-1,int(proteinData["domain"]["End"][i]))
							maskBitsList = ["5" if x in domainRange else maskBitsList[x] for x in sequenceRange]
					except Exception,e:
						print "Error creating masking data",e
						basic.writeError(e)
						
				tableStr += ",".join(maskBitsList)
				
				###################################################################################
						
				outDict = {}
				
				for type in proteinData.keys():
					
					outDict[type] = {}
					for val in proteinData[type]:
						outDict[type + val] = ",".join([str(x).replace(",","|") for x in proteinData[type][val]])
					
				for key in header:
					if len(outDict[key]) > 0:
						tableStr += "\t" + outDict[key]
					else:
						tableStr += "\t" + "NA"
				
				#print tableStr
				#print tableStr
				tableStr += "\n"
				#print len(tableStr.split("\n")[-2].split("\t"))
				###################################################################################
			except Exception,e:
				basic.writeError(e)
				
		open(self.options["searchDbOut"],"w").write(tableStr)
		

		lines = tableStr.split("\n")
		for line in lines[:-1]:
			bits = line.split("\t")
			if len(lines[0].split("\t")) != len(bits):
				print len(lines[0].split("\t")),len(bits)
				print line
			#try:
			#	print bits[19]
			#except:
			#	print line

		del data
		del tableStr

	def tablify(self,offsets=[]):
		length = len(self.data['Sequence'])
		dataTypes = [
		'Sequence',
		'ResidueDisorder',
		'WCS',
		'WCS_W_rStdev',
		'SurfaceAccessibilityMax',
		'SurfaceAccessibilityMin',
		"dsspSecondarystructurePercent",
		]

		tableStr = "Offset\t" + "\t".join(dataTypes) + "\n"
		if offsets == []:
			offsets = range(length)

		for i in offsets:

			tableStr += str(i + 1) + "\t"
			for dataType in dataTypes:
				try:
					tableStr += str(self.data[dataType][int(i)]) + "\t"
				except:
					tableStr += "NA\t"
			
			
			tableStr += "\n"

		print tableStr


if __name__ == "__main__":

	# Run-mode switches. Edit these to choose which of the sections below
	# execute; the enabled sections run in the order they appear.
	# makeKeywordDatabase and makeGoDatabase are set here but no section
	# in this block reads them.
	makeProteinTable = False
	makeDatabase = False
	makeKeywordDatabase = False
	makeGoDatabase = False

	updateProteins = True

	# --- Per-residue table for a single protein --------------------------
	# argv[1] = accession, argv[2] (optional) = alignment path
	if makeProteinTable:
		acc = sys.argv[1]

		# An empty list makes tablify report every position. To restrict the
		# table, build 0-based offsets from a comma-separated argument, e.g.
		# [int(x) - 1 for x in sys.argv[3].split(",")]
		offsets = []
		proteinInfo = proteinInfoHelper()

		# Only pass alignmentPath when it was supplied on the command line.
		infoKwargs = {}
		if len(sys.argv) > 2:
			alignmentPath = sys.argv[2]
			infoKwargs["alignmentPath"] = alignmentPath

		proteinInfo.getInfo(acc,**infoKwargs)
		proteinInfo.tablify(offsets=offsets)

	# --- Refresh stored data for a list of proteins ----------------------
	# argv[1] = comma-separated accessions
	if updateProteins:
		proteinList = sys.argv[1].split(",")
		proteinInfo = proteinInfoHelper()

		updateTasks = ("fasta","uniprot","disorder","pfam","alignment","conservation")

		for protein in proteinList:
			# A fresh list per call, as getInfo receives its own tasks list.
			proteinInfo.getInfo(protein,tasks=list(updateTasks),fetch=True)

	# --- Build the search database from a table of accessions ------------
	# argv[1] = table file, argv[2] = output path,
	# argv[3] (optional) = name of the accession column
	if makeDatabase:
		table = sys.argv[1]
		columnName = sys.argv[3] if len(sys.argv) > 3 else 'uniprotid'

		tableData = basicReader.readTableFile(table)
		accessionList = basic.removeRedundency(tableData[columnName])

		try:
			proteinInfo = proteinInfoHelper()
			proteinInfo.options["searchDbOut"] = sys.argv[2]

			# Task names that have also been used here at some point:
			# "pfam", "mutation", 'dssp', 'anchor'
			tasks = ["uniprot","disorder","fasta"]
			proteinInfo.buildSLiMSearchDb(accessionList,tasks)
		except Exception as e:
			# Show the message, then let the original exception propagate.
			print(e)
			raise
	