/* Copyright (C) 2003,2004 MySQL AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <assert.h>
#include <glib.h>
#include <iconv.h>
#include "myx_xml_aux_functions.h"
#include "myx_library.h"

/* Number of raw bytes requested from the file per refill.
   Parenthesized so the macro stays a single operand in any expression
   (e.g. x % BUFFER_LEN). */
#define BUFFER_LEN (1024*10)
/* Extra room in the raw buffer for bytes carried over from the previous
   refill (an incomplete multi-byte character). */
#define MAX_BYTES_PER_CHARACTER 6
/* Size of the line buffer used while scanning an sql-file for its charset. */
#define DETECT_CHARSET_FROM_SQL_FILE_BUFFER_SIZE 4096

///////////////////////////////////////////////////////////////////////////////
/** @brief International File.

    Reader state for a file in some character set: raw bytes are read into
    buffer, translated to UTF-8 into utf8_buffer, and then handed out one
    character at a time through next_utf8_char.
*//////////////////////////////////////////////////////////////////////////////
struct myx_intl_file
{
  FILE *file;           // the stdio stream opened on filename
  const char *filename; // the file we are reading from (the pointer passed
                        //   to myx_new_intl_file is stored, not copied)
  const char *charset;  // the encoding of the file (pointer stored,
                        //   not copied)
  gboolean charset_is_utf8; // we have this to avoid costly string comparisons
                            // for every call of myx_read_char_from_intl_file
  // iconv descriptors; only opened when charset_is_utf8 is false
  iconv_t conv_to_utf8, conv_from_utf8;

  // untranslated (raw) bytes as read from the file; the extra
  // MAX_BYTES_PER_CHARACTER bytes leave room for bytes carried over from
  // the previous refill
  unsigned char buffer[BUFFER_LEN + MAX_BYTES_PER_CHARACTER]; 
  size_t buffer_len_in_bytes; // number of valid bytes in buffer

  gsize bytes_translated; // the number of bytes that really got translated
                          //   from buffer to utf8_buffer

  gchar *utf8_buffer;    // translated data, may not be null-terminated
  size_t utf8_buffer_len_in_bytes; // number of bytes in utf8_buffer
  gchar *next_utf8_char; // points to the character that is going to be
                         //   returned on the next call to 
                         //   myx_read_char_from_intl_file
  bigint file_pos; // the number of bytes that have already been
                   // read from this file
};

///////////////////////////////////////////////////////////////////////////////
/** @brief Tries to detect the charset that an sql-file is written in.
    @param filename path to the sql-file
    @return NULL or a string (allocated by glib, so release it with g_free)
            describing the charset

    It works by searching for a SET NAMES charset (or similar) statement.
    Only the first 15 chunks returned by fgets are examined; a chunk is a
    line, or a piece of a line when the line is longer than the buffer.
    This should give a 100% success rate for 4.1 and newer sql-files.
*//////////////////////////////////////////////////////////////////////////////
#define O_VECTOR_COUNT 30
char *myx_detect_charset_from_sql_file(const char* filename)
{
  char buf[DETECT_CHARSET_FROM_SQL_FILE_BUFFER_SIZE];
  const char * rexp1= "SET\\s+NAMES\\s+(\\w+)";
  const char * rexp2= "(?:SET.*(?:,|\\s)CHARACTER_SET_CLIENT\\s*=\\s*(\\w+))";
  pcre *rstmt1, *rstmt2;
  int i;
  const char *errs;
  int errof;
  char *file_charset= NULL;
  FILE *file;

  if (!(rstmt1= pcre_compile(rexp1,PCRE_CASELESS,&errs,&errof,NULL)))
    return file_charset;
  if (!(rstmt2= pcre_compile(rexp2,PCRE_CASELESS,&errs,&errof,NULL)))
    goto exit3;
  if ( !(file= myx_fopen(filename, "r")) )
    goto exit2;

  //only check the first 15 lines
  for (i= 0; i < 15 && fgets(buf,sizeof(buf), file);i++)
  {
    int rc;
    int o_v[O_VECTOR_COUNT];
    int blen= (int)strlen(buf);

    /* analyze this line */

    // pcre_exec wants the number of ints in the output vector, not its size
    // in bytes; passing sizeof(o_v) would let it write past the array
    if ((rc= pcre_exec(rstmt1,NULL,buf,blen,0,0,o_v,O_VECTOR_COUNT)) > 0 ||
        (rc= pcre_exec(rstmt2,NULL,buf,blen,0,0,o_v,O_VECTOR_COUNT)) > 0)
    {
      const char *charset;
      const char *iconv_name;
      assert(rc == 2); // whole match + the single capture group

      if ( pcre_get_substring(buf, o_v, rc, 1, &charset) < 0)
        goto exit1;

      iconv_name= iconv_char_name(charset);
      // "-" means that there is no appropriate iconv-name
      if (iconv_name && g_utf8_collate(iconv_name, "-"))
        file_charset= g_strdup(iconv_name);

      // released only after the result has been copied, in case the name
      // returned above refers to the captured text (not visible from here)
      pcre_free_substring(charset);
      goto exit1;
    }
  }

exit1: fclose(file);
exit2: pcre_free(rstmt2);
exit3: pcre_free(rstmt1);
  return file_charset;
}

///////////////////////////////////////////////////////////////////////////////
/** @brief Creates an international file
    @param filename path to file to open (the pointer is stored, not copied,
                    so it must stay valid as long as the returned struct)
    @param charset name of character set for file (pointer stored as well)
    @param error pointer to error code (it is set if the function returns NULL)
    @return NULL or created MYX_INTL_FILE struct

    NULL is returned, with *error set to MYX_ERROR_CANT_OPEN_FILE, when the
    file cannot be opened or when iconv has no converter between charset
    and UTF-8.
*//////////////////////////////////////////////////////////////////////////////
MYX_INTL_FILE * myx_new_intl_file(const char *filename,
                                  const char *charset,
                                  MYX_LIB_ERROR *error)
{
  MYX_INTL_FILE *file;
  iconv_t to_utf8= (iconv_t)-1;
  iconv_t from_utf8= (iconv_t)-1;
  int is_utf8;
  FILE *fh= myx_fopen(filename, "r");

  if (!fh)
  {
    *error= MYX_ERROR_CANT_OPEN_FILE;
    return NULL;
  }

  is_utf8= ( !g_utf8_casecollate(charset, "utf8") ||
             !g_utf8_casecollate(charset, "UTF-8"));
  if (!is_utf8)
  {
    to_utf8=   iconv_open("UTF-8",charset);
    from_utf8= iconv_open(charset,"UTF-8");

    // iconv_open reports failure with (iconv_t)-1; such a descriptor must
    // never reach iconv()/iconv_close(), so give up here
    if (to_utf8 == (iconv_t)-1 || from_utf8 == (iconv_t)-1)
    {
      if (to_utf8 != (iconv_t)-1)
        iconv_close(to_utf8);
      if (from_utf8 != (iconv_t)-1)
        iconv_close(from_utf8);
      fclose(fh);
      *error= MYX_ERROR_CANT_OPEN_FILE;
      return NULL;
    }
  }

  file= g_malloc0(sizeof(MYX_INTL_FILE));
  file->filename= filename;
  file->charset= charset;
  file->file= fh;
  file->charset_is_utf8= is_utf8;
  if (!is_utf8)
  {
    file->conv_to_utf8=   to_utf8;
    file->conv_from_utf8= from_utf8;
  }

  /*
    since utf8_buffer_len_in_bytes is 0, this will cause
    myx_read_char_from_intl_file to believe that a new line has to be read in
  */
  file->next_utf8_char= file->utf8_buffer;
  return file;
}

///////////////////////////////////////////////////////////////////////////////
/** @brief free memory for the International File
    @param file pointer to struct with International File to free (may be null)

    Closes the iconv descriptors (if any were opened) and the stream, then
    releases the translated buffer and the struct itself.
*//////////////////////////////////////////////////////////////////////////////
void myx_free_intl_file(MYX_INTL_FILE *file)
{
  if (!file)
    return;

  // everything that reads a member has to happen before g_free(file)
  if (!file->charset_is_utf8)
  {
    // (iconv_t)-1 is what a failed iconv_open returns; it must not be closed
    if (file->conv_to_utf8 != (iconv_t)-1)
      iconv_close(file->conv_to_utf8);
    if (file->conv_from_utf8 != (iconv_t)-1)
      iconv_close(file->conv_from_utf8);
  }
  g_free(file->utf8_buffer);
  if (file->file)
    fclose(file->file);
  g_free(file);
}

///////////////////////////////////////////////////////////////////////////////
/** @brief Moves the raw bytes that were not translated to the buffer start
    @param file International File whose raw buffer is compacted
    @return number of bytes that were moved (0 if everything was translated)

    The bytes in question are buffer[bytes_translated .. buffer_len_in_bytes),
    i.e. a partial character at the end of the previous chunk.
*//////////////////////////////////////////////////////////////////////////////
size_t safe_copy_untranslated_characters(MYX_INTL_FILE *file)
{
  size_t leftover;

  if (file->buffer_len_in_bytes <= file->bytes_translated)
    return 0;

  // this means that we have a partial character at the end
  leftover= file->buffer_len_in_bytes - file->bytes_translated;

  // source and destination may overlap, hence memmove; exactly the
  // leftover bytes are moved and nothing beyond the valid data is read
  memmove(file->buffer, file->buffer + file->bytes_translated, leftover);
  return leftover;
}

///////////////////////////////////////////////////////////////////////////////
/** @brief Classifies the bytes at which UTF-8 validation stopped
    @param file          International File; buffer_len_in_bytes and
                         utf8_buffer_len_in_bytes must already be set, their
                         difference is the number of bytes from invalid_chars
                         to the end of the raw buffer
    @param invalid_chars first byte that failed validation
    @return 0 if the remaining bytes are only an incomplete character
              (fewer than MAX_BYTES_PER_CHARACTER bytes for which
              g_utf8_get_char_validated answers (gunichar)-2),
            1 otherwise, i.e. the data is not valid UTF-8

    Anything that is not an incomplete character is reported as invalid;
    the function never aborts on unexpected file content.
*//////////////////////////////////////////////////////////////////////////////
int wrong_invalid_chars_at_the_end(MYX_INTL_FILE *file,
                                   unsigned char *invalid_chars)
{
  gssize wrong_size=
          (gssize)(file->buffer_len_in_bytes - file->utf8_buffer_len_in_bytes);

  if (wrong_size < MAX_BYTES_PER_CHARACTER &&
      g_utf8_get_char_validated((const char *)invalid_chars, wrong_size)
        == (gunichar)-2)
  {
    return 0; // just a character cut off by the end of the buffer
  }
  return 1;
}

///////////////////////////////////////////////////////////////////////////////
/** @brief Fills utf8_buffer from the raw buffer of a file that is in UTF-8
    @param file  International File; buffer/buffer_len_in_bytes hold the raw
                 data. utf8_buffer is overwritten without being freed, so the
                 caller must have released the previous one.
    @param error set to MYX_CHARSET_WRONG_CHARSET_SPECIFIED if the data is
                 not valid UTF-8
    @return 1 on success, 0 on error

    On success utf8_buffer is a copy of the valid prefix of the raw data and
    bytes_translated equals its length; an incomplete character at the very
    end is left untranslated. On error no buffer is kept and the whole chunk
    is marked as consumed.
*//////////////////////////////////////////////////////////////////////////////
int translate_utf8_buffer(MYX_INTL_FILE * file, MYX_LIB_ERROR * error)
{
  const gchar *raw= (const gchar *)file->buffer;
  const gchar *invalid_chars= NULL;

  if ( g_utf8_validate(raw, (gssize)file->buffer_len_in_bytes,
                       &invalid_chars) )
  {
    file->utf8_buffer_len_in_bytes= file->buffer_len_in_bytes;
  }
  else // check if we have non utf8 chars or
  {    //  if our buffer simply ends with a partial character
    file->utf8_buffer_len_in_bytes= (size_t)(invalid_chars - raw);

    if (wrong_invalid_chars_at_the_end(file, (unsigned char *)invalid_chars))
    { // that's not a partial char at the end
      file->utf8_buffer= NULL;
      file->utf8_buffer_len_in_bytes= 0;
      // nothing of this chunk is to be carried over into the next refill
      file->bytes_translated= file->buffer_len_in_bytes;
      *error= MYX_CHARSET_WRONG_CHARSET_SPECIFIED;
      return 0;
    }
  }

  // since the source is in UTF-8, translating is a plain copy
  file->utf8_buffer= g_memdup(raw, (guint)file->utf8_buffer_len_in_bytes);
  file->bytes_translated= file->utf8_buffer_len_in_bytes;
  return 1;
}

///////////////////////////////////////////////////////////////////////////////
/** @brief Fills utf8_buffer by converting the raw buffer from file->charset
    @param file  International File; buffer/buffer_len_in_bytes hold the raw
                 data. utf8_buffer, utf8_buffer_len_in_bytes and
                 bytes_translated receive the outputs of g_convert.
    @param error set to MYX_CHARSET_CONVERSION_ERROR if the conversion fails
    @return 1 on success, 0 on error
*//////////////////////////////////////////////////////////////////////////////
int translate_non_utf8_buffer(MYX_INTL_FILE * file, MYX_LIB_ERROR * error)
{
  GError *err= NULL;
  int failed;

  file->utf8_buffer= g_convert((const gchar *)file->buffer,
                               (gssize)file->buffer_len_in_bytes,
                               "UTF-8", file->charset,
                               &file->bytes_translated,
                               &file->utf8_buffer_len_in_bytes, &err);

  failed= !file->utf8_buffer ||
          (err && err->code != G_CONVERT_ERROR_PARTIAL_INPUT);

  // the GError is owned by us; release it on every path (no-op when NULL)
  g_clear_error(&err);

  if (failed)
  {
    file->utf8_buffer_len_in_bytes= 0;
    // nothing of this chunk is to be carried over into the next refill
    file->bytes_translated= file->buffer_len_in_bytes;
    *error= MYX_CHARSET_CONVERSION_ERROR;
    return 0;
  }
  return 1;
}

///////////////////////////////////////////////////////////////////////////////
/** @brief Makes sure there is a translated character to hand out
    @param file  International File to refill if its translated buffer has
                 been used up
    @param error set to MYX_CANT_READ_FROM_FILE on a read error, or to the
                 code chosen by the translate function on a charset error;
                 left untouched on plain end-of-file
    @return 1 if next_utf8_char points to a character, 0 on error or
            end-of-file

    Whenever 0 is returned the struct is left without a translated buffer
    (utf8_buffer and next_utf8_char are NULL, the length is 0), so calling
    again is safe and simply tries to read more.
*//////////////////////////////////////////////////////////////////////////////
int safe_read_buffer(MYX_INTL_FILE * file, MYX_LIB_ERROR * error)
{
  if (file->utf8_buffer &&
      file->next_utf8_char != file->utf8_buffer +
                              file->utf8_buffer_len_in_bytes)
  {
    return 1; // still unread characters in the translated buffer
  }

  // we need to refill our buffer; loop because a chunk may consist of
  // nothing but the beginning of a multi-byte character
  for (;;)
  {
    size_t untranslated= safe_copy_untranslated_characters(file);
    size_t read_len;

    // the carried-over bytes now sit at the start of the raw buffer
    file->buffer_len_in_bytes= untranslated;
    file->bytes_translated= 0;

    // drop the old translation and every reference into it
    g_free(file->utf8_buffer);
    file->utf8_buffer= NULL;
    file->utf8_buffer_len_in_bytes= 0;
    file->next_utf8_char= NULL;

    // the raw buffer only has room for BUFFER_LEN fresh bytes plus a few
    // carried-over ones; refuse to read if that would overflow it
    if (untranslated > sizeof(file->buffer) - BUFFER_LEN)
    {
      file->buffer_len_in_bytes= 0;
      *error= MYX_CANT_READ_FROM_FILE;
      return 0;
    }

    if ( !(read_len= fread(file->buffer + untranslated,
                           1, BUFFER_LEN, file->file)) )
    {
      if (ferror(file->file))
        *error= MYX_CANT_READ_FROM_FILE;
      return 0;
    }
    file->buffer_len_in_bytes= read_len + untranslated;

    if (!( file->charset_is_utf8 ? translate_utf8_buffer(file,error)
                                 : translate_non_utf8_buffer(file,error)))
    {
      // forget the broken chunk completely
      g_free(file->utf8_buffer);
      file->utf8_buffer= NULL;
      file->utf8_buffer_len_in_bytes= 0;
      file->buffer_len_in_bytes= 0;
      file->bytes_translated= 0;
      return 0;
    }

    if (file->utf8_buffer && file->utf8_buffer_len_in_bytes > 0)
    {
      file->next_utf8_char= file->utf8_buffer;
      return 1;
    }
    // nothing translated yet (only a partial character): read more
  }
}

///////////////////////////////////////////////////////////////////////////////
/** @brief Reads one character from the file 

    @param file       International File described the read file (with charset)
    @param bytes_read If bytes_read is not NULL it will be set to number 
                      of bytes read, measured in the charset of the file
    @param error      error pointer to error code
                      (it is set if the function returns -1 because of an
                      error)

    @return The index of the character in the Unicode-set
            -1 in case of an error or end-of-file

    file_pos is advanced by the same byte count that is stored in
    *bytes_read.
*//////////////////////////////////////////////////////////////////////////////
bigint myx_read_char_from_intl_file(MYX_INTL_FILE * file,
                                    int * bytes_read, MYX_LIB_ERROR * error)
{
  gunichar utf8_char;
  char *cur_utf8_char;
  int consumed;

  if (!safe_read_buffer(file,error))
    return -1;

  cur_utf8_char= file->next_utf8_char;
  utf8_char= g_utf8_get_char(cur_utf8_char);
  file->next_utf8_char= g_utf8_next_char(cur_utf8_char);

  // length of the character in its UTF-8 form
  consumed= (int) (file->next_utf8_char - cur_utf8_char);

  // Let's calculate real number of bytes read
  if ( !file->charset_is_utf8 )
  {
    // convert the character back to the file's charset just to measure it;
    // the result of iconv is not checked, a failed conversion counts as
    // however many bytes were written to buf
    char buf[20];
    char *buf_pos= buf;
    size_t buf_left= sizeof(buf);
    size_t chars_left= (size_t)consumed;
    iconv(file->conv_from_utf8,
          &cur_utf8_char,&chars_left,
          &buf_pos,&buf_left);
    consumed= (int)(sizeof(buf) - buf_left);
  }
  file->file_pos += consumed;

  if (bytes_read) // optional, as documented
    *bytes_read= consumed;

  return utf8_char;
}
