/*   srebase.c
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE                          
*               National Center for Biotechnology Information
*                                                                          
*  This software/database is a "United States Government Work" under the   
*  terms of the United States Copyright Act.  It was written as part of    
*  the author's official duties as a United States Government employee and 
*  thus cannot be copyrighted.  This software/database is freely available 
*  to the public for use. The National Library of Medicine and the U.S.    
*  Government have not placed any restriction on its use or reproduction.  
*                                                                          
*  Although all reasonable efforts have been taken to ensure the accuracy  
*  and reliability of the software and data, the NLM and the U.S.          
*  Government do not and cannot warrant the performance or results that    
*  may be obtained by using this software or data. The NLM and the U.S.    
*  Government disclaim all warranties, express or implied, including       
*  warranties of performance, merchantability or fitness for any particular
*  purpose.                                                                
*                                                                          
*  Please cite the author in any work or product based on this material.   
*
* ===========================================================================
*
* File Name:  srebase.c
*
* Author:  Epstein
*
* Version Creation Date:   1/4/94
*
* $Revision: 6.0 $
*
* File Description: 
*       Find restriction sites using REBASE database
*
* Modifications:  
* --------------------------------------------------------------------------
* Date     Name        Description of modification
* -------  ----------  -----------------------------------------------------
*
* $Log: srebase.c,v $
* Revision 6.0  1997/08/25 18:20:44  madden
* Revision changed to 6.0
*
* Revision 1.2  1996/06/21 14:10:34  epstein
* add boilerplate and run 'indent'
*
*
* ==========================================================================
*/

#include <ncbi.h>
#include <accentr.h>
#include <sequtil.h>
#include <spattern.h>

/* Column at which the enzyme name starts in a record line. */
#define CUTSITE_NAME_COL 3
/* Column at which the recognition sequence starts in a record line. */
#define CUTSITE_SEQ_COL 22

/*
 * Copy the token that begins at 'start' and runs up to the first blank,
 * line terminator or end of string into newly allocated memory.
 * Returns NULL when the allocation fails.
 */
static CharPtr 
CopyToken (CharPtr start)
{
    size_t          len;
    CharPtr         copy;

    len = strcspn (start, " \r\n");
    copy = (CharPtr) MemNew (len + 1);
    if (copy != NULL) {
	StrNCpy (copy, start, len);
	copy[len] = '\0';
    }
    return copy;
}

/*
 * Read a REBASE enzyme file and build an array of cut-site records.
 *
 * Every line from the first one that starts with 'C' onwards is treated as
 * a record: the enzyme name is the token at column CUTSITE_NAME_COL and the
 * recognition sequence is the token at column CUTSITE_SEQ_COL.  Lines too
 * short to reach the recognition-sequence column are skipped.
 *
 * The file is read twice (count, rewind, fill), so the input must be
 * seekable for the second pass to see any data.
 *
 * Returns a newly allocated array terminated by an entry whose enzymeName
 * is NULL, or NULL if the file cannot be opened or memory runs out.  The
 * caller owns the array and the strings it points to.
 */
static CutsiteInfoPtr 
ParseCutSites (CharPtr filename)
{
    FILE           *fp;
    Boolean         counting = FALSE;
    Int4            count = 0;
    char            s[100];
    CharPtr         name, seq;
    CutsiteInfoPtr  csip, csip2;

    fp = FileOpen (filename, "r");
    if (fp == NULL)
	return NULL;

    /* Pass 1: upper bound on the number of records. */
    while (FileGets (s, sizeof (s), fp) != NULL) {
	if (s[0] == 'C')
	    counting = TRUE;
	if (counting)
	    count++;
    }

    count++;			/* room for the end-of-list marker */
    rewind (fp);
    csip2 = (CutsiteInfoPtr) MemNew (sizeof (*csip2) * count);
    if (csip2 == NULL) {
	FileClose (fp);
	return NULL;
    }

    /* Pass 2: fill in the records. */
    csip = csip2;
    counting = FALSE;
    while (FileGets (s, sizeof (s), fp) != NULL) {
	if (s[0] == 'C')
	    counting = TRUE;
	if (!counting)
	    continue;
	/* Too short to hold a recognition sequence: indexing s[22] would
	 * read whatever a previous, longer line left in the buffer. */
	if (strlen (s) <= (size_t) CUTSITE_SEQ_COL)
	    continue;
	/* Never write past the slot reserved for the end marker, even if
	 * the input yields more lines than the first pass saw. */
	if ((Int4) (csip - csip2) >= count - 1)
	    break;

	name = CopyToken (&s[CUTSITE_NAME_COL]);
	seq = CopyToken (&s[CUTSITE_SEQ_COL]);
	if (name == NULL || seq == NULL) {
	    /* Out of memory: release everything built so far. */
	    MemFree (name);
	    MemFree (seq);
	    while (csip != csip2) {
		csip--;
		MemFree (csip->enzymeName);
		MemFree (csip->recognitionSeq);
	    }
	    MemFree (csip2);
	    FileClose (fp);
	    return NULL;
	}
	csip->enzymeName = name;
	csip->recognitionSeq = seq;
	csip++;
    }

    csip->enzymeName = NULL;	/* EOF marker */
    FileClose (fp);

    return csip2;
}

/*
 * SeqEntryExplore callback: whenever the visited entry is a nucleotide
 * Bioseq, store it through 'data' (a BioseqPtr PNTR).  Because later hits
 * overwrite earlier ones, the caller ends up with the last nucleotide
 * Bioseq visited.  'index' and 'indent' are not used.
 */
static void 
FindNuc (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
{
    BioseqPtr       candidate;

    if (!IS_Bioseq (sep))
	return;

    candidate = (BioseqPtr) sep->data.ptrvalue;
    if (ISA_na (candidate->mol))
	*((BioseqPtr PNTR) data) = candidate;
}

/*
 * SeqEntryExplore callback: whenever the visited entry is a protein
 * Bioseq, store it through 'data' (a BioseqPtr PNTR).  Because later hits
 * overwrite earlier ones, the caller ends up with the last protein Bioseq
 * visited.  'index' and 'indent' are not used.
 */
static void 
FindProt (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
{
    BioseqPtr       candidate;

    if (!IS_Bioseq (sep))
	return;

    candidate = (BioseqPtr) sep->data.ptrvalue;
    if (ISA_aa (candidate->mol))
	*((BioseqPtr PNTR) data) = candidate;
}

/*
 * Command-line argument table passed to GetArgs() in Main(), together with
 * NUMARGS (the number of entries).  Main() reads the parsed values by index:
 *   myargs[0].intvalue - GI number of the sequence to fetch
 *   myargs[1].strvalue - REBASE restriction enzyme file to parse
 *   myargs[2].strvalue - file the resulting annotation is written to
 *   myargs[3].intvalue - non-zero when the input is a protein
 * NOTE(review): the Args field order is declared in the toolkit headers,
 * not in this file; the second initializer looks like the default value
 * and the character like the option letter - confirm against the header.
 */
#define NUMARGS 4
Args            myargs[NUMARGS] = {
    {"Gi number", NULL, "1", "99999999", FALSE, 'g', ARG_INT, 0.0, 0, NULL},
    {"REBASE restriction enzyme file", "stdin", "NULL", "NULL", FALSE, 'e', ARG_FILE_IN, 0.0, 0, NULL},
    {"Output file", "stdout", "NULL", "NULL", FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
    {"Input is protein", "F", "NULL", "NULL", FALSE, 'p', ARG_BOOLEAN, 0.0, 0, NULL}
};

/*
 * Program entry point.
 *
 * Parses the command line, loads the enzyme list, fetches the sequence
 * entry for the requested GI from Entrez, picks the last nucleotide (or
 * protein) Bioseq in it, searches its full length for cut sites and writes
 * the resulting Seq-annot to the output file.
 *
 * Returns:
 *   0 - success
 *   1 - bad command line or GI
 *   2 - the enzyme file could not be parsed
 *   3 - Entrez initialisation or sequence lookup failed
 *   4 - the annotation could not be built or written
 */
Int2 Main (void)
{
    SeqLocPtr       slp;
    CutsiteInfoPtr  enzymes;
    SeqAnnotPtr     annot;
    SeqEntryPtr     sep;
    BioseqPtr       bsp = NULL;
    Boolean         isprot = FALSE;
    AsnIoPtr        aip;
    Int4            gi;
    Int2            status = 0;

    if (!GetArgs ("Srebase $Revision: 6.0 $", NUMARGS, myargs))
	return 1;
    gi = myargs[0].intvalue;
    if (gi <= 0) {
	Message (MSG_OK, "Invalid input GI %ld", (long) gi);
	return 1;
    }
    enzymes = ParseCutSites (myargs[1].strvalue);
    if (enzymes == NULL) {
	Message (MSG_OK, "Parsing of %s failed", myargs[1].strvalue);
	return 2;
    }
    if (!EntrezInit ("search-REBASE", FALSE, NULL)) {
	Message (MSG_OK, "EntrezInit failed");
	return 3;
    }
    isprot = myargs[3].intvalue;

    sep = EntrezSeqEntryGet (gi, 0);
    if (sep == NULL) {
	Message (MSG_OK, "Unable to find SeqEntrez for %s GI %ld", isprot ? "protein" : "nucleotide", (long) gi);
	EntrezFini ();
	return 3;
    }
    SeqEntryExplore (sep, &bsp, isprot ? FindProt : FindNuc);
    if (bsp == NULL) {
	Message (MSG_OK, "Unable to find suitable %s bioseq", isprot ? "protein" : "nucleotide");
	SeqEntryFree (sep);	/* do not leak the entry on this path */
	EntrezFini ();
	return 3;
    }

    /* Search the whole sequence on the plus strand. */
    slp = SeqLocIntNew (0, bsp->length - 1, Seq_strand_plus, SeqIdFindBest (bsp->id, SEQID_GI));
    if (slp == NULL) {
	Message (MSG_OK, "Unable to build sequence location");
	SeqEntryFree (sep);
	EntrezFini ();
	return 4;
    }
    annot = FindCutSitesAsSeqAnnot (slp, enzymes, "REBASE 606", 1);
    SeqLocFree (slp);

    if (annot == NULL) {
	Message (MSG_OK, "No cut-site annotation was produced");
	status = 4;
    } else {
	aip = AsnIoOpen (myargs[2].strvalue, "w");
	if (aip == NULL) {
	    Message (MSG_OK, "Unable to open output file %s", myargs[2].strvalue);
	    status = 4;
	} else {
	    /* Report a failed write instead of silently exiting with 0. */
	    if (!SeqAnnotAsnWrite (annot, aip, NULL)) {
		Message (MSG_OK, "Unable to write annotation to %s", myargs[2].strvalue);
		status = 4;
	    }
	    AsnIoClose (aip);
	}
	SeqAnnotFree (annot);
    }

    SeqEntryFree (sep);
    EntrezFini ();

    return status;
}
