# -*- coding: utf-8 -*-
"""
Created on Wed Aug  5 09:42:29 2015

@author: rosikiewicz
"""
import urllib2


def downloadTissueType(accnum):
	"""Return the ``tissue_type`` value shown on the NCBI SRA page of *accnum*.

	The page ``http://www.ncbi.nlm.nih.gov/sra/<accnum>`` is downloaded and
	the text between the ``tissue_type: </label>`` marker and the following
	``<br />`` is returned.

	If the marker is not present on the page, the string
	``"--- IndexError: list index out of range"`` is returned instead of
	raising.  Errors raised by ``urllib2.urlopen`` are not caught here.
	"""
	url = "http://www.ncbi.nlm.nih.gov/sra/{}".format(accnum)
	page = urllib2.urlopen(url)
	try:
		data = page.read()
	finally:
		# Release the HTTP connection even when reading fails.
		page.close()

	# Only the lookup of the part after the marker can fail, so it is the
	# only statement guarded by the try.
	try:
		afterLabel = data.split('tissue_type: </label>')[1]
	except IndexError:
		return "--- IndexError: list index out of range"
	return afterLabel.split('<br />')[0]

def downloadTissueType2(accnum):
	"""Return the attributes listed on the NCBI SRA page of *accnum* as a dict.

	The page ``http://www.ncbi.nlm.nih.gov/sra/<accnum>`` is downloaded, the
	HTML between the "Attributes:" opening markup and the start of the
	``sra-full-data`` block is cut out, and each ``name: </label>value``
	entry in it becomes one ``{name: value}`` item of the result.

	Raises IndexError when the opening markup is missing from the page or
	when an entry does not contain ``': </label>'``.  Errors raised by
	``urllib2.urlopen`` are not caught here.
	"""
	url = "http://www.ncbi.nlm.nih.gov/sra/{}".format(accnum)
	page = urllib2.urlopen(url)
	try:
		data = page.read()
	finally:
		# Release the HTTP connection even when reading fails.
		page.close()

	# 1st cut: drop everything up to and including the opening markup.
	afterOpening = data.split('</a></span></div><div>Attributes: <div><span><label class="tag">')[1]
	# 2nd cut: drop everything from the start of the full-data block onwards.
	attributeHtml = afterOpening.split('</span></div></div></div></div><div class="expand e-hidden sra-full-data">')[0]
	# 3rd cut: one piece per attribute entry.
	entries = attributeHtml.split('<br /></span><span><label class="tag">')

	result = {}
	for entry in entries:
		# 4th cut: separate the attribute name from its value.
		nameAndValue = entry.split(': </label>')
		result[nameAndValue[0]] = nameAndValue[1]
	return result
		
def parseLibFile(infileName):
	"""Read the tab-separated library list into ``{species: [accNums]}``.

	Every data line holds a species name, a tab, and that species'
	accession numbers joined by ``-``.  Lines starting with ``#`` are
	comments and blank lines are skipped.  If a species occurs on several
	lines, the list from the last of them wins.

	Raises IndexError for a non-blank data line that contains no tab.
	"""
	result = {}  # {species : [list, of, accNums]}
	# The with-block closes the file even if a malformed line raises.
	with open(infileName, 'r') as infile:
		for row in infile:
			# Skip comments, and blank lines (e.g. a trailing empty line)
			# that have no second column to read.
			if row[:1] == "#" or not row.strip():
				continue
			# Strip '\r' too, so a CRLF file does not leave a carriage
			# return glued to the last accession number.
			fields = row.rstrip('\r\n').split('\t')
			result[fields[0]] = fields[1].split('-')
	return result

def dataCollector(libs):
	"""Download the attributes of every library listed in *libs*.

	*libs* maps a species to the list of its accession numbers.  Each
	accession is passed to downloadTissueType2 and a numbered progress line
	is printed for it.  An accession whose download raises IndexError is
	reported with an " - IndexError" suffix and left out of the attribute
	dict.

	Returns a pair ``(attributes, lib2spec)``: ``attributes`` is
	``{accnum: {attributeKey: value}}`` for the successful downloads and
	``lib2spec`` is ``{accnum: species}`` for every accession.
	"""
	attributes = {}  # {SRR_accnum : {attributeKeys : value}}
	speciesOf = {}
	position = 0

	for species, accessions in libs.items():
		for accession in accessions:
			position += 1
			speciesOf[accession] = species
			try:
				fetched = downloadTissueType2(accession)
			except IndexError:
				print("{}. {} - IndexError".format(position, accession))
				continue
			attributes[accession] = fetched
			print("{}. {}".format(position, accession))
	return attributes, speciesOf
	
libs = parseLibFile('librariesList.txt')

# Downloading every SRA page is slow, so `data` and `lib2spec` left in the
# interpreter namespace by an earlier run (e.g. in Spyder) are reused as they
# are.  On a fresh run the names do not exist yet and are collected here,
# instead of failing with a NameError in the loops below.
if 'data' not in globals() or 'lib2spec' not in globals():
	data, lib2spec = dataCollector(libs)

# Count in how many libraries each attribute key occurs.
allKeys = {}
for acc in data:
	for key in data[acc]:
		allKeys[key] = allKeys.get(key, 0) + 1

# Fix the column order once so the header and every row use the same one.
columns = list(allKeys)

# Report layout: one row per library, one column per attribute key; the
# header shows each key followed by its occurrence count.  The with-block
# closes the file even if writing a row fails.
with open('libAttributes.txt', 'w') as outfile:
	header = ["lib", "spec"] + ["{} ({})".format(key, allKeys[key]) for key in columns]
	outfile.write("\t".join(header) + "\n")
	for lib in lib2spec:
		row = [lib, lib2spec[lib]]
		if lib in data:
			attributes = data[lib]
			# A missing attribute leaves its cell empty.
			row.extend(attributes.get(key, "") for key in columns)
		else:
			# No entry in `data`: dataCollector's download hit an IndexError.
			row.append("IndexError")
		outfile.write("\t".join(row) + "\n")
		

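# Delimiters that downloadTissueType2 splits the SRA page on, in order of use
# (in the code the 1st one carries an extra leading '</a></span>'):
#1st: '</div><div>Attributes: <div><span><label class="tag">'
#2nd: '</span></div></div></div></div><div class="expand e-hidden sra-full-data">'
#3rd: '<br /></span><span><label class="tag">'
#4th: ': </label>'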

#test = downloadTissueType2('SRR639164')

#for spec in libs:
#	for lib in libs[spec]:
#		print spec, lib, downloadTissueType(lib)

