import os,urllib,sys,re,copy
import ned_basic as basic
from ned_sortedFileSearcher import Searcher
import ned_commandLine as commandline

import ned_structureHelper as structureHelper

try:
	import rje_seq,rje_uniprot
	
except Exception,e:
	print e
	sys.exit()


# Maximum age, in days, of a cached UniProt ".dat" file: uniprotReader.readUniprot
# re-downloads an entry once basic.fileAgeDays() reports this value or more.
timeLimit = 120


class uniprotDownloadHelper():
	"""Download UniProt flat-file entries into a local directory of ".dat" files."""

	def __init__(self):
		"""Load ../settings/utilities.ini (relative to this file) into self.options."""
		cmdline = commandline.CommandLine()
		self.options = cmdline.loadIniFile(os.path.join(os.path.dirname(os.path.realpath(__file__)),"../settings/utilities.ini"))
		self.data = {}

	def downloadUniprot(self,accession,uniprotDir,refetch=False,timeLimit=100000):
		"""Make sure <uniprotDir>/<accession>.dat exists and is recent enough.

		The entry is downloaded from the EBI dbfetch service when the file is
		missing, when basic.fileAgeDays() reports it older than timeLimit days,
		or when refetch is true. Otherwise nothing happens.

		accession  -- entry identifier appended to the dbfetch URL
		uniprotDir -- directory holding the cached ".dat" files
		refetch    -- force a download even if a fresh file is present
		timeLimit  -- maximum accepted file age in days

		Download and file-system errors propagate to the caller.
		"""
		datPath = os.path.join(uniprotDir,accession + ".dat")

		# Report the actual reason for the download; return early if none applies.
		if not os.path.exists(datPath):
			print(datPath + " not found. Downloading data from uniprot")
		elif basic.fileAgeDays(datPath) > timeLimit:
			print(datPath + " greater than " + str(timeLimit) + " days old. Refetching")
		elif refetch:
			print(datPath + " refetch requested. Downloading data from uniprot")
		else:
			return

		url = "http://www.ebi.ac.uk/Tools/webservices/rest/dbfetch/uniprotkb/"

		opener = urllib.FancyURLopener()
		response = opener.open(url + accession)
		try:
			# Read everything before touching the cache file, so a failed
			# download cannot truncate an existing copy.
			content = response.read()
		finally:
			response.close()

		with open(datPath,"w") as outFile:
			outFile.write(content)
			
class uniprotReader():
		def __init__(self):
			"""Load the utilities.ini settings and start with no parsed entry."""
			settingsPath = os.path.join(os.path.dirname(os.path.realpath(__file__)),"../settings/utilities.ini")
			self.options = commandline.CommandLine().loadIniFile(settingsPath)
			# Filled by readUniprot(); the download helper is created on first use.
			self.data = {}
			self.downloadHelper = None
		
		def reformatFeatures(self):
			self.data['Feature_by_type'] = {}
			for feature in self.data['Feature']:
				if feature['Type'] not in self.data['Feature_by_type']:
					self.data['Feature_by_type'][feature['Type']] = []
				
				self.data['Feature_by_type'][feature['Type']].append(feature)
		
		def parseReferences(self):
			aa_triplets = ['ALA','ARG','ASN','ASP','CYS','GLU','GLN','GLY','HIS','ILE','LEU','LYS','MET','PHE','PRO','SER','THR','TRP','TYR','VAL']
			mutagenParse = re.compile("[A-Z]{3}-[0-9]+")
			mutagenStretchParse = re.compile("[0-9]+-[A-Z]{3}--[A-Z]{3}-[0-9]+")
			pubmedParse = re.compile("PubMed=[^;]+")
			
			#print self.data['Feature_by_type']['MUTAGEN']
			
			
			for ref in  self.data['Reference']:
				
				try:
					pmid = ""
					if 'MUTAGEN' in self.data['Feature_by_type']:
						refDesc = ""
						if 'RP' in ref:
							refDesc += ref['RP'].strip(" .") + ":"
						else:
							refDesc += ":"
	
						if 'RX' in ref:
							pmid = pubmedParse.findall(ref['RX'])[0].split("=")[1]
							refDesc += pmid  + ":"
						else:
							refDesc += ":"
	
	
						if "RT" in ref:
							refDesc += ref['RT'].strip(' ";')+ ""
						else:
							refDesc += ":"
	
						if 'RP' in  ref:
							for mutagen in mutagenParse.findall(ref['RP']):
								mutagenBits =  mutagen.split("-")
								pos = mutagenBits[1]
								
								for i in range(0,len(self.data['Feature_by_type']['MUTAGEN'])):
									site = self.data['Feature_by_type']['MUTAGEN'][i]
									
									if "PMID" not in self.data['Feature_by_type']['MUTAGEN'][i]:
										self.data['Feature_by_type']['MUTAGEN'][i]["PMID"] = []
										
									if "ref" not in self.data['Feature_by_type']['MUTAGEN'][i]:
										self.data['Feature_by_type']['MUTAGEN'][i]["ref"] = []
									
										
										
									if site["Start"] == int(pos) and site["End"] == int(pos):
										self.data['Feature_by_type']['MUTAGEN'][i]["ref"].append(refDesc)
										
										self.data['Feature_by_type']['MUTAGEN'][i]["PMID"].append(pmid)
										
							for mutagen in mutagenStretchParse.findall(ref['RP']):
								mutagenBits =  mutagen.split("-")
								start = mutagenBits[0]
								stop  = mutagenBits[-1]
	
								for i in range(0,len(self.data['Feature_by_type']['MUTAGEN'])):
									if "PMID" not in self.data['Feature_by_type']['MUTAGEN'][i]:
										self.data['Feature_by_type']['MUTAGEN'][i]["PMID"] = []
										
									if "red" not in self.data['Feature_by_type']['MUTAGEN'][i]:
										self.data['Feature_by_type']['MUTAGEN'][i]["ref"] = []
									
									site = self.data['Feature_by_type']['MUTAGEN'][i]
									if site["Start"] == int(start) and site["End"] == int(stop):
										self.data['Feature_by_type']['MUTAGEN'][i]["ref"].append(refDesc)
										
										self.data['Feature_by_type']['MUTAGEN'][i]["PMID"].append(pmid)
				except:
					basic.writeError(e)
			#sys.exit()

		def parseMutagens(self):
			if "MUTAGEN" in self.data['Feature_by_type']:
				mutagens = copy.deepcopy(self.data['Feature_by_type']["MUTAGEN"])

				self.data['Feature_by_type']["MUTAGEN"] = []
				
				while len(mutagens) > 0:
					mutagen = mutagens.pop()
					overlap = False
					start = mutagen["Start"]
					end = mutagen["End"]
					tmpMutagen = {}

					for i in range(0,len(self.data['Feature_by_type']["MUTAGEN"])):
						
						if start == self.data['Feature_by_type']["MUTAGEN"][i]["Start"] and end == self.data['Feature_by_type']["MUTAGEN"][i]["End"]:
							overlap = True
							self.data['Feature_by_type']["MUTAGEN"][i]["Desc"]  += "/ " + mutagen["Desc"]
						
					if overlap == False:
						tmpMutagen["Start"] = start
						tmpMutagen["End"] = end
						tmpMutagen["Desc"] = mutagen["Desc"]
						
						self.data['Feature_by_type']["MUTAGEN"].append(tmpMutagen)
					
						#print self.data['Feature_by_type']["PDB"]
			#basic.printDict(self.data['Feature_by_type']["MUTAGEN"])
			
		def parsePDB(self):
			if "PDB" in self.data["DBLinks"]:
				self.data['Feature_by_type']["PDB"] = []
				for pdb in self.data["DBLinks"]['PDB']:
					#print pdb
					bits = pdb.split(";")
					#print bits
					if len(bits) == 4:
						
						for chain in bits[3].split(','):
							tmpPDB = {}
							
							start = int(chain.split("=")[1].split("-")[0])
							end = int(chain.split("=")[1].split("-")[1].strip(" ."))
							
							#print bits[0],chain,start,end
							overlap = False
							for i in range(0,len(self.data['Feature_by_type']["PDB"])):
								if start == self.data['Feature_by_type']["PDB"][i]["Start"] and end == self.data['Feature_by_type']["PDB"][i]["End"]:
								
									overlap = True
									self.data['Feature_by_type']["PDB"][i]["Id"] += ":" + bits[0]
									self.data['Feature_by_type']["PDB"][i]["Method"]  += ":" +  bits[1].strip(" .")
									self.data['Feature_by_type']["PDB"][i]["Resolution"]  += ":" +  bits[2].strip(" .").replace(" ","")
									self.data['Feature_by_type']["PDB"][i]["Chain"]  += ":" + chain.split("=")[0].strip(" .")
									self.data['Feature_by_type']["PDB"][i]["Desc"]  += "/" +  bits[0] + " " +  bits[1].strip(" .") + "@" + bits[2].strip(" .").replace(" ","")
								
							if overlap == False:
								tmpPDB["Id"] = bits[0]
								tmpPDB["Method"] = bits[1].strip(" .")
								tmpPDB["Resolution"] = bits[2].strip(" .").replace(" ","")
								tmpPDB["Chain"] = chain.split("=")[0].strip(" .")
								tmpPDB["Start"] = start
								tmpPDB["End"] = end
								tmpPDB["Desc"] = tmpPDB["Id"] + " " + tmpPDB["Method"] + "@" + tmpPDB["Resolution"]
								
								
								self.data['Feature_by_type']["PDB"].append(tmpPDB)
							
							#print self.data['Feature_by_type']["PDB"]
			
				
		def parseDisulfide(self):
			if 'DISULFID' in self.data['Feature_by_type']:
				tmpDisufides = []
				while len(self.data['Feature_by_type']["DISULFID"]) > 0:
					pair = self.data['Feature_by_type']["DISULFID"].pop()
					
					tmpDisufides.append({"Start":int(pair["Start"]),"End":int(pair["Start"]),"Desc":"Disulfide bonded to " + str(pair["End"])})
					tmpDisufides.append({"Start":int(pair["End"]),"End":int(pair["End"]),"Desc":"Disulfide bonded to " + str(pair["Start"])})
				
				self.data['Feature_by_type']['DISULFID'] = tmpDisufides
							
		def parseSecondaryStructure(self):
			self.data['Feature_by_type']['SECONDARY_STRUCTURE'] = []
			
			for secondaryStructure in ["HELIX","STRAND","TURN"]:
				if secondaryStructure in self.data['Feature_by_type']:
					for piece in self.data['Feature_by_type'][secondaryStructure]:
						self.data['Feature_by_type']['SECONDARY_STRUCTURE'].append({"Start":int(piece["Start"]),"End":int(piece["End"]),"Desc":secondaryStructure})
			
			
			
		def parseIsoforms(self):
			self.data['Feature_by_type']['ISOFORM'] = []
			if "VAR_SEQ" in self.data['Feature_by_type']:
				
				variationPattern = re.compile("([A-Z]+ -> [A-Z]+)|(Missing )")
				FTIdPattern = re.compile(" /FTId=VSP_[0-9]+")
				isoformPattern = re.compile("isoform [0-9A-Z]+")
				#print len(self.data['Feature_by_type']["VAR_SEQ"])
				for isoform in self.data['Feature_by_type']["VAR_SEQ"]:
					desc =  isoform['Desc']
					
					#print isoform
					variation = variationPattern.findall(desc)
					FTid = FTIdPattern.findall(desc)
					isoformNo = isoformPattern.findall(desc)
					
					if variation[0][0] == "":
						variation = variation[0][1]
						fromRes = self.data["Sequence"][int(isoform["Start"]) -1:int(isoform["End"])]
						toRes = ""
						type = "Skipped Exon"
					else:
						variation = variation[0][0]
						bits = variation.split(" -> ")
						fromRes = bits[0]
						toRes = bits[1]
						type = "Alternative splicing"
					
					
					isoformNo = "/".join([i.split()[1] for i in isoformNo])
					#print isoformNo, isoformNo 
					FTid = FTid[0].replace("/FTId=","")
					
					#self.data['Feature_by_type']['VAR_SEQ'].remove(isoform)
					self.data['Feature_by_type']['ISOFORM'].append({"Start":isoform["Start"],"End":isoform["End"],"From":fromRes,"To":toRes,"Desc":desc,"Isoforms":isoformNo,"FTid":FTid,"Type":type})
			
			
		def parseVariants(self):
			dbSNPpattern = re.compile("dbSNP:rs[0-9]+")
			tmpDict = []
			
			roundBrackets = re.compile("\([^\)]+\)")
			
			if 'VARIANT' in self.data['Feature_by_type']:
				
				while len(self.data['Feature_by_type']['VARIANT']) > 0:
					try:
						variant = self.data['Feature_by_type']['VARIANT'].pop()
						
						desc_bits = variant['Desc'].split()
						#print desc_bits
						
						if desc_bits[0] == "Missing.":
							fromRes = "-"
							toRes = "-"
						else:
							fromRes = desc_bits[0]
							toRes = desc_bits[2]
						
						
						if variant['Desc'].count('dbSNP:') > 0:
							dbSNP = dbSNPpattern.findall(variant['Desc'])[0].replace("dbSNP:","")
							variant['Desc'] = variant['Desc'].replace('(in dbSNP:' + dbSNP + ")","")
						
						phenotypes =  roundBrackets.findall(variant['Desc'])
						
						if len(phenotypes) > 0:
							desc =   toRes + "(" + fromRes+ ")" + ":" +  phenotypes[0][1:-1]
						else:
							desc = toRes + "(" + fromRes+ ")"
						
						dbSNP = ""
						
						try:					
							FTid = desc_bits[-1].split("=")[1][:-1]
						except:
							FTid = ""
							
						sub_type = "non-synonymous"
						if fromRes == toRes:
							sub_type = "synonymous"
						
						tmpDict.append({"Start":int(variant["Start"]),"End":int(variant["End"]),"From":fromRes,"To":toRes,"Desc":desc,"Id":FTid,"dbSNP":dbSNP,"subType":sub_type,"Source":"Uniprot"})
					
					except Exception,e:
						print "Error: " , e,variant
						basic.writeError(e)
						
			self.data['Feature_by_type']['VARIANT'] = tmpDict
	
		def readUniprot(self,accession,uniprotDir="",refetch=False):
			"""Load one UniProt entry into self.data, downloading it if needed.

			accession  -- entry identifier; the file read is
			              <uniprot_dir>/dat/<accession>.dat
			uniprotDir -- when not "", replaces self.options["uniprot_dir"]
			refetch    -- force a fresh download

			The file is (re)downloaded when it is missing, when
			basic.fileAgeDays() reports it at least the module-level timeLimit
			days old, or when refetch is true. A file with more than two lines
			is parsed with rje_uniprot, merged into self.data and run through
			the feature parsers.

			Returns self.data when the entry was read (even if the feature
			conversion failed part-way), otherwise None. Read and conversion
			errors are printed rather than raised; download errors propagate.
			"""
			if uniprotDir != "":
				self.options["uniprot_dir"] = uniprotDir

			datDir = os.path.join(self.options["uniprot_dir"],"dat")
			filePath = os.path.join(datDir,accession + ".dat")

			reason = ""
			if not os.path.exists(filePath):
				refetch = True
				reason = "(Not exist)"
			elif basic.fileAgeDays(filePath) >= timeLimit:
				refetch = True
				reason = "(Over " + str(timeLimit) + " days old)"

			if refetch:
				print(filePath + " refetching " + reason + ".")

				if self.downloadHelper is None:
					self.downloadHelper = uniprotDownloadHelper()

				# Keywords keep timeLimit out of the positional refetch slot.
				self.downloadHelper.downloadUniprot(accession,datDir,refetch=True,timeLimit=timeLimit)

			try:
				with open(filePath,"r") as handle:
					lineCount = len(handle.read().split("\n"))

				# Two lines or fewer is treated as an empty or failed download.
				if lineCount <= 2:
					return None

				seqs = rje_uniprot.UniProt(cmd_list=['v=-1','i=-1','replacechar=F','stripnum=F',"fullref=T"])
				seqs.readUniProt(filePath)

				entry = seqs.list['Entry'][0]

				# Later sources override earlier ones on key clashes.
				merged = {}
				merged.update(entry.list.items())
				merged.update(entry.dict.items())
				merged.update(entry.obj['Sequence'].info.items())
				self.data.update(merged)

				try:
					self.reformatFeatures()

					self.parseVariants()
					self.parseIsoforms()
					self.parsePDB()
					self.parseDisulfide()
					self.parseSecondaryStructure()
					self.parseMutagens()
					try:
						self.parseReferences()
					except Exception as e:
						# Reference annotation is best effort; log and carry on.
						basic.writeError(e)

				except Exception as e:
					print("Feature conversion failed %s" % (e,))

				return self.data

			except Exception as e:
				print("Uniprot reading failed %s" % (e,))

			return None
			
class uniprotHelper():
	"""Look up UniProt descriptions and fetch UniProt flat files."""

	def __init__(self,options=None):
		"""Store the options and open the description mapper.

		options -- settings mapping; it must provide
		           "uniprotDescriptionMapper". When omitted, the settings are
		           loaded from ../settings/utilities.ini relative to this file.
		"""
		if options is None:
			cmdline = commandline.CommandLine()
			options = cmdline.loadIniFile(os.path.join(os.path.dirname(os.path.realpath(__file__)),"../settings/utilities.ini"))

		self.options = options
		self.data = {}

		self.searchDescMapper = Searcher(self.options["uniprotDescriptionMapper"])

	def findDesc(self,acc):
		"""Return the tab-separated fields of the first mapper hit for acc.

		When the mapper returns nothing usable, a list of five "?" is
		returned instead.
		"""
		result = self.searchDescMapper.find(acc)

		try:
			return result[0].strip().split("\t")
		except Exception:
			return ["?"]*5

	def downloadUniprot(self,accession,uniprotDir,refetch=False,timeLimit=100000):
		"""Make sure <accession>.dat is present and recent enough.

		accession  -- entry identifier appended to the dbfetch URL
		uniprotDir -- target directory; self.options["filter_uniprot"] is
		              used when this is empty
		refetch    -- force a download
		timeLimit  -- maximum accepted file age in days

		The entry is downloaded when the file is missing, older than
		timeLimit days, or refetch is true. Progress messages are printed
		unless self.options["quiet"] is set to something other than "F".
		This is best effort: failures are reported through
		basic.writeError() and not raised.
		"""
		try:
			targetDir = uniprotDir
			if not targetDir:
				targetDir = self.options["filter_uniprot"]

			datPath = os.path.join(targetDir,accession + ".dat")

			if os.path.exists(datPath):
				fetch = basic.fileAgeDays(datPath) > timeLimit
			else:
				fetch = True

			if refetch:
				fetch = True

			if not fetch:
				return

			verbose = "quiet" not in self.options or self.options["quiet"] == "F"

			if verbose:
				print(datPath + " not found or out of date. Downloading data from uniprot")

			url = "http://www.ebi.ac.uk/Tools/webservices/rest/dbfetch/uniprotkb/"

			opener = urllib.FancyURLopener()
			response = opener.open(url + accession)
			try:
				# Read first so a failed download cannot truncate the old file.
				content = response.read()
			finally:
				response.close()

			with open(datPath,"w") as outFile:
				outFile.write(content)

			if verbose:
				print("Done")

		except Exception as e:
			basic.writeError(e)
			
			
if __name__ == "__main__":
	# Script mode: for every cached UniProt entry, print one tab-separated row
	# per PDB structure that covers a region spanning fewer than 20 residues.
	datDir = "/Applications/Bioware/Datasets/Uniprot/dat/"

	# Named "reader" so the uniprotReader class is not shadowed by its instance.
	reader = uniprotReader()

	# The Python 2 print statement puts a space before each item but none after
	# an item ending in a tab, so the original rows were joined by " \t".
	rowFormat = " \t".join(["%s"] * 8)

	print("accession\tstart\tstop\tlength\tpdb\tpeptide\tpubmedtitle\tpmid")
	for fileName in sorted(os.listdir(datDir),reverse=True):
		try:
			acc = fileName.split(".")[0]

			# Start from a clean slate: readUniprot() only updates the data
			# dict, so a failed read would otherwise leave the previous
			# entry's features to be printed under this accession.
			reader.data = {}
			reader.readUniprot(acc)
			pdbParser = structureHelper.structureHelper()

			featuresByType = reader.data.get('Feature_by_type',{})
			for entry in featuresByType.get("PDB",[]):
				start = int(entry["Start"])
				end = int(entry["End"])
				if end - start >= 20:
					continue

				for pdbId in entry["Id"].split(":"):
					pdbParser.parsePDBfile(pdbId)
					print(rowFormat % (acc,entry["Start"],entry["End"],end - start,pdbId,reader.data["Sequence"][start - 1:end],pdbParser.data["pubmed"]['TITL'],pdbParser.data["pubmed"]['PMID']))
		except Exception as e:
			# Keep going with the next file; report on stderr so the table on
			# stdout stays clean.
			sys.stderr.write("%s: %s\n" % (fileName,e))
							
							