import getopt
import glob
import math
import os
import subprocess
import sys
#for arg in sys.argv:
#    print arg
sys.path.append('../')
sys.path.append('')
import mydict

domains=open('ELM_annotated_binding_domains.txt').readlines()
anno=mydict.mydict()
for j in domains:
    j=j.split()
    for k in j[1:-1]:
        anno.append(j[0],k)

domains=open('quick_2.txt').readlines()
for j in domains:
    j=j.split()
    for k in j[1:-1]:
        anno.append(j[0],k)
wd40=['TRG_Golgi_diPhe_1','LIG_SCF-TrCP1_1','LIG_SCF_FBW7_1','TRG_ER_diArg_1','TRG_ER_diLys_1','LIG_SCF_FBW7_2','LIG_EH1_1','LIG_COP1','LIG_WRPW_1','LIG_APCC_Dbox_1','LIG_WRPW_2','LIG_GLEBS_BUB3_1','LIG_APCC_KENbox_2','TRG_PTS2','LIG_RAPTOR_TOS_1']
specific=['LIG_SCF-TrCP1_1','LIG_SCF_FBW7_1','TRG_ER_diArg_1','TRG_ER_diLys_1','LIG_SCF_FBW7_2','LIG_APCC_Dbox_1','LIG_APCC_KENbox_2','TRG_PTS2','TRG_PEX','LIG_ULM_U2AF65_1','LIG_RRM_PRI_1','LIG_TNKBM']
file='uniprot_ensembl_seqs.txt'

def identifier(domain, file='uniprot_ensembl_seqs.txt', elm=''):
    if domain == 'PFAM':
        location='pfam_hmms/'
    else:
        location='domain_all/'
    print file,'dddd',elm
    elm=elm.split('ELM=')[1]
    print elm,'fff'
    fileName359='results.txt'
    file_wr=open(fileName359,'w')
    file_wr.write('DOMAIN_ID	ELM_ID	DOMAIN_SCORE	HMM_SCORE	HMM_START	HMM_END	DOMAIN_START	DOMAIN_END	HMM_LENGTH	percentage  annotated\n')
    for fileName in glob.glob(os.path.join(location,'*.hmm')):
        edc=open(fileName)
        length_1=0
        for ki in edc:
            if 'LENG' in ki:
                ki=ki.split()
                length_1=float(ki[1])
                break
        hmm=fileName.split('/')
        hmm1=hmm[-1].split('.')
        if elm not in hmm1[0]:
            continue
        print hmm1[0]
        result=[]
        yoy=hmm1[0]
        yoy=hmm1[0].split('+')[0]
        result.append(os.popen('hmmer3.0/binaries/hmmsearch -E1 ' + fileName+' ' + file))
        n=nw=nbv=0
        for i in result[0]:#for i in result2:#[0]:
            if 'Internal pipeline statistics summary:' in i:#if 'Alignments of top-scoring domains' in i:
                break
            if 'no hits above thresholds' in i:
                print 'no hits'
                continue
            try:
                if '>>' in i:
                    uyt=nw+3
                    nbv=1
                    int_name=i[3:-1]
                    name=int_name#.split()[0][5:]
                    int_name=int_name.rstrip()
                if 'Alignments for each domain:' in i:
                    nbv=0
                if nbv>0 and '!' in i or nbv>0 and '?' in i:# == uyt:
                    x=i.split()
                    domain_score=x[5]
                    if float(x[5]) > 0.01:
                        continue
                    hmm1=hmm1[0].split('+')                
                    if hmm1 in wd40 and int(x[10])-int(x[9]) < 120:# and domain_score > 0.9:
                        continue
                    hmm2=hmm1[0]
                    j13=math.log10(float(x[5]))
                    x678=(float(length_1)-1.0767)/-1.93
                    domain_score=str(j13/x678)
                    if float(domain_score) < -1:
                        continue
                    ted=float(x[7]) - float(x[6])+1
                    if ted > float(length_1)*0.9 and ted < float(length_1)*1.1:# 2+2==4:#
                        bob=str(ted/float(length_1))
                        try:
                            tag=0
                            for g in anno[hmm1[0]]:
                                g=g.split('+')
                                if g[0] in name:
                                    if int(g[1]) in range(int(x[9]),int(x[10])) or int(g[2]) in range(int(x[9]),int(x[10])) or int(x[9]) in range(int(g[1]),int(g[2])) or int(x[10]) in range(int(g[1]),int(g[2])):
                                        tag=1
                            if tag == 1:
                                file_wr.write(name+'\t'+yoy+'\t'+str(domain_score)+'\t'+str(x[5])+'\t'+str(x[6])+'\t'+str(x[7])+'\t'+str(x[9])+'\t'+str(x[10])+'\t'+str(length_1)+'\t'+bob+'\t'+'annotated'+'\n')
                            elif yoy in specific:
                                if yoy not in wd40 and float(bob) > 0.98:
                                    file_wr.write(name+'\t'+yoy+'\t'+str(domain_score)+'\t'+str(x[5])+'\t'+str(x[6])+'\t'+str(x[7])+'\t'+str(x[9])+'\t'+str(x[10])+'\t'+str(length_1)+'\t'+bob+'\n')
                                else:
                                    continue
                            else:
                                file_wr.write(name+'\t'+yoy+'\t'+str(domain_score)+'\t'+str(x[5])+'\t'+str(x[6])+'\t'+str(x[7])+'\t'+str(x[9])+'\t'+str(x[10])+'\t'+str(length_1)+'\t'+bob+'\n')
                        except KeyError:
                            #continue
                            file_wr.write(name+'\t'+yoy+'\t'+str(domain_score)+'\t'+str(x[5])+'\t'+str(x[6])+'\t'+str(x[7])+'\t'+str(x[9])+'\t'+str(x[10])+'\t'+str(length_1)+'\t'+bob+'\n')
            except IndexError:
                continue
        

    file_wr.close()
    file2=open(fileName359).readlines()
    found_already=mydict.mydict()
    keep={}
    for j in file2[1:]:
        tag=0
        j1=j.split()
        if 'MOD_' in j and float(j1[2]) < 0.8 and 'MOD_TYR_I' not in j:
            continue
        if 'DOMAIN_START' in j:
            continue
        j4=j1[0].split('|')
        start=int(j1[6])
        end=int(j1[7])
        elm1=j1[1].split('+')
        elm=elm1[0]
    
        for kl in file2:
            kl1=kl.split()
            kl2=kl1[0].split('|')
            elm2=kl1[1].split('+')[0]
            if elm == elm2 and kl2[-1] == j4[-1]:
                if start in range(int(kl1[6]),int(kl1[7])) or end in range(int(kl1[6]),int(kl1[7])) or int(kl1[6]) in range(start,end) or int(kl1[7]) in range(start,end):
                    if float(j1[3]) > float(kl1[3]):
                        tag = 1
                        break
        if tag == 0:
            keep[j]=j

    complete={}
    for j in keep:
        j1=j.split()
        j2=j1[0].split('|')
        temp=j2[-1]
        for k in j1[1:]:
            temp+=k+'\t'
        temp+='\n'
        complete[temp]=j
    
    file_wr=open(fileName359,'w')
    file_wr.write(file2[0])
    for j in complete:
        if file2[0] in j:
            continue
        file_wr.write(complete[j])
 
    file_wr.close()
    return 'please find results in "results.txt"'


def usage():
    print '###############################'
    print 'this program will search ELM binding proteins.'
    print 'results are to be found in result.txt file.  '
    print 'please input:'
    print '1) your choice: "PFAM" or "HMM"  '
    print '           PFAM - those HMMs trained by Pfam'
    print '           HMM - those HMMs trained on SLiM-binding domains'
    print '2) domain file to search within (default all human proteome)'
    print '###############################'    
    sys.exit()
    
    
    
def main(argv):
    try:
        opts,args=getopt.getopt(argv,'hg:d',['help','grammar='])
    except GetoptError:
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-h','--help'):
            usage()
            sys.exit()
        elif opt == ('-g','--grammar'):
            grammar=arg
    #print sys.argv
    if len(sys.argv) == 2:
        qaz=identifier(sys.argv[1])
    elif len(sys.argv) == 3:
        if 'ELM=' in sys.argv[2]:
            print 'hello'
            qaz=identifier(sys.argv[1],'uniprot_ensembl_seqs.txt',sys.argv[2])
        elif '.' not in sys.argv[2]:
            print '###############################'
            print 'if specifying an ELM class to search'
            print 'please put "ELM=" before name'
            print 'please ensure you use extension for file name e.g .txt'
            print '##############################'
        else:
            qaz=identifier(sys.argv[1],sys.argv[2])
    elif len(sys.argv) == 4:
        qaz=identifier(sys.argv[1],sys.argv[2],sys.argv[3])
    else:
        print '###############################'
        print 'please input:'
        print '1) your choice: "PFAM" or "HMM"  '
        print '           PFAM - those HMMs trained by Pfam'
        print '           HMM - those HMMs trained on SLiM-binding domains'
        print '2) domain file to search within (default all human proteome)'
        print '            sequence files must be in FASTA format'
        print '3) if you wish to only search domains from one ELM class please put:'
        print '        ELM=<ELM_CLASS>  '
        print '        e.g. ELM=LIG_SH3_1'
        print '###############################'
        sys.exit()

# Run the command-line interface only when this file is executed as a script;
# main() receives the arguments that follow the program name.
if __name__ == '__main__':
    main(sys.argv[1:])