import re,copy,sys,fileinput, hashlib,os, urllib

class fastaHelper():
	"""Read, fetch and de-duplicate FASTA protein records.

	Attributes filled in by the methods:
		data     -- dict: record key -> record dict (set by readFastaDb)
		order    -- list of record keys in file order (set by readFastaDb)
		fasta    -- raw FASTA text of the last fetched entry (fetchSequence)
		sequence -- residues of the last fetched entry (fetchSequence)
	"""

	# UniProt-style header layouts, tried from most to least specific.
	# Only the first two carry an OS= (taxa) group; only the first has GN=.
	_HEADER_PATTERNS = (
		re.compile(r">(?P<source>\w+)\|(?P<uniprotAcc>[\w-]+)\|(?P<uniprotId>\w+) (?P<protein>[^=]+) OS=(?P<taxa>.+) GN=(?P<geneName>.+)"),
		re.compile(r">(?P<source>\w+)\|(?P<uniprotAcc>[\w-]+)\|(?P<uniprotId>\w+) (?P<protein>[^=]+) OS=(?P<taxa>.+) PE"),
		re.compile(r">(?P<source>\w+)\|(?P<uniprotAcc>[\w-]+)\|(?P<uniprotId>\w+) (?P<protein>[^=]+)"),
	)

	# Start of a following " XX=" annotation (e.g. " OX=", " PE=", " SV=").
	# The greedy groups above swallow these, so values are cut here.
	_NEXT_FIELD = re.compile(r" [A-Z]{2}=")

	def __init__(self):
		self.options = {}
		self.sequence = ""
		# Defined up front so they can be read before any file is parsed.
		self.fasta = ""
		self.data = {}
		self.order = []

	def _iterRecords(self, lines):
		"""Yield (header, sequence) for each record in an iterable of lines.

		A record starts at a line beginning with ">" and runs to the next
		such line; its sequence lines are stripped and concatenated. Blank
		lines are ignored. Sequence data found before the first header is
		yielded with header None so the caller can decide what to do.
		"""
		header = None
		chunks = []
		for raw in lines:
			line = raw.strip()
			if line.startswith(">"):
				if header is not None or chunks:
					yield header, "".join(chunks)
				header = line
				chunks = []
			elif line:
				chunks.append(line)
		if header is not None or chunks:
			yield header, "".join(chunks)

	def _cleanField(self, value):
		"""Return a captured header value without trailing " XX=" fields.

		Returns "N/A" when the value is missing or empty.
		"""
		if not value:
			return "N/A"
		nextField = self._NEXT_FIELD.search(value)
		if nextField:
			value = value[:nextField.start()]
		return value.strip() or "N/A"

	def _parseHeader(self, header):
		"""Split one ">" header line into (key, record dict).

		The key is the UniProt accession when the header matches one of
		_HEADER_PATTERNS, otherwise the header text without the leading ">".
		The returned dict has "id", "taxa", "gene" and "protein"; fields the
		header does not provide are "N/A".
		"""
		for pattern in self._HEADER_PATTERNS:
			match = pattern.match(header)
			if match:
				fields = match.groupdict()
				return fields["uniprotAcc"], {
					"id": fields["uniprotId"],
					"taxa": self._cleanField(fields.get("taxa")),
					"gene": self._cleanField(fields.get("geneName")),
					"protein": fields["protein"],
				}

		# Not a UniProt-style header: keep the record under its raw name.
		name = header[1:].strip()
		return name, {
			"id": "N/A",
			"taxa": "N/A",
			"gene": "N/A",
			"protein": name,
		}

	def readFastaDb(self, dbFile):
		"""Parse a FASTA file into self.data and self.order.

		dbFile -- path of the FASTA file to read.

		Every record is stored in self.data under its key (see _parseHeader)
		as a dict with "id", "taxa", "gene", "protein", "sequence" and
		"header"; self.order lists the keys in file order. All records,
		including the last one, go through the same parsing path. A later
		record with the same key replaces the earlier one in self.data.
		Raises whatever open() raises if the file cannot be read.
		"""
		self.data = {}
		self.order = []

		# Text mode: headers are matched against str patterns.
		with open(dbFile, "r") as handle:
			for header, sequence in self._iterRecords(handle):
				if header is None:
					# Residues with no header cannot be attributed to a record.
					print("Error processing fasta line")
					continue
				key, record = self._parseHeader(header)
				record["sequence"] = sequence
				record["header"] = header
				self.order.append(key)
				self.data[key] = record

	def fetchSequence(self,accession,dataDir,refetch=False,short=False):
		"""Load one UniProt entry, downloading and caching it when needed.

		accession -- UniProt accession; also names the cache file.
		dataDir   -- directory holding "<accession>.fasta" cache files.
		refetch   -- download again even if a cached file exists.
		short     -- accepted for interface compatibility; not used.

		Sets self.fasta to the raw FASTA text and self.sequence to every
		line after the first, joined together. I/O and network errors
		propagate to the caller.
		"""
		cachePath = os.path.join(dataDir, accession + ".fasta")

		if refetch or not os.path.exists(cachePath):
			url = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/uniprotkb/"
			response = urllib.FancyURLopener().open(url + accession + "/fasta")
			try:
				# Read once: a second read() on the response returns nothing.
				self.fasta = response.read()
			finally:
				response.close()
			with open(cachePath, "w") as cacheFile:
				cacheFile.write(self.fasta)
		else:
			with open(cachePath, "r") as cacheFile:
				self.fasta = cacheFile.read()

		self.sequence = "".join(self.fasta.splitlines()[1:])

	def removeRedundancy(self,infile,outfile="",removeFragments=True):
		"""Print each distinct sequence of a FASTA file once, to stdout.

		infile          -- path of the FASTA file to filter.
		outfile         -- accepted for interface compatibility; nothing is
		                   written to it, the records go to stdout.
		removeFragments -- when true, skip records whose header contains
		                   "Fragment".

		Records are printed as the header line followed by the sequence on
		one line. The first record carrying a given sequence wins; skipped
		fragments do not block a later full record with the same sequence.
		Returns the number of records printed.
		"""
		seenDigests = set()
		nonRedundantCount = 0

		fastaFileHnd = fileinput.input(infile)
		try:
			for header, sequence in self._iterRecords(fastaFileHnd):
				if header is None:
					continue
				if removeFragments and "Fragment" in header:
					continue

				# Hash the stripped residues so a missing final newline
				# cannot make two equal sequences look different.
				if isinstance(sequence, bytes):
					payload = sequence
				else:
					payload = sequence.encode("utf-8")
				digest = hashlib.sha224(payload).hexdigest()
				if digest in seenDigests:
					continue

				seenDigests.add(digest)
				print(header + "\n" + sequence)
				nonRedundantCount += 1
		finally:
			fastaFileHnd.close()

		return nonRedundantCount
		
if __name__ == "__main__":
	# Script mode: parse the FASTA file named by the first command-line
	# argument and dump the parsed records. Without an argument, do nothing.
	if len(sys.argv) >= 2:
		fastaHelpObj = fastaHelper()
		infile = sys.argv[1]
		# Disabled alternative mode: fastaHelpObj.removeRedundancy(infile)
		fastaHelpObj.readFastaDb(infile)
		print(fastaHelpObj.data)
	
	