/* -*- mode: c; coding: euc-jp -*-
 *
 * $Id: mregexp.c,v 1.0 2006/06/23 12:28:24 hirose31 Exp $
 *
 */

#include <my_global.h>
#include <m_string.h>
#include <mysql.h>

#include <oniguruma.h>

/* Version string reported by mregexp_version(). */
#define MREGEXP_VERSION "1.0"

/*
 * Encoding used by mregexp() when no explicit encoding argument is given.
 * Predefine DEFAULT_ENCODING at build time to override it.
 */
#ifndef DEFAULT_ENCODING
#define DEFAULT_ENCODING ONIG_ENCODING_SJIS
#endif

/*
 * DPRINT(s): when DEBUG is defined, write the string s followed by a
 * newline to stderr; otherwise it expands to nothing.
 */
#ifdef DEBUG
  #define DPRINT(s) fprintf(stderr, "%s\n", s)
#else
  #define DPRINT(s)
#endif

/*
 * mregexp_init - per-statement setup for the mregexp() UDF.
 *
 * Checks the argument list (str, pat[, enc]), requests that every argument
 * be delivered as a string, resolves the optional encoding name and
 * compiles the pattern with Oniguruma.  The compiled regex is stored in
 * initid->ptr, where mregexp() reads it and mregexp_deinit() frees it.
 *
 * initid   UDF state; ptr and maybe_null are set on success.
 * args     UDF arguments; arg_type[] entries are forced to STRING_RESULT.
 * message  receives an error description when 1 is returned.
 *
 * Returns 0 on success, 1 on failure.
 */
my_bool mregexp_init(UDF_INIT *initid,
                     UDF_ARGS *args,
                     char *message) {
  int r;
  regex_t *reg;
  OnigErrorInfo einfo;
  OnigEncoding enc;
  unsigned char *pat;
  DPRINT(">>mregexp_init");

  if (args->arg_count != 2 && args->arg_count != 3) {
    strcpy(message, "mregexp(): requires 2 or 3 arguments. str,pat[,enc]");
    return 1;
  }

  /* Ask the server to coerce every argument to a string for mregexp(). */
  args->arg_type[0] = STRING_RESULT;
  args->arg_type[1] = STRING_RESULT;
  if (args->arg_count == 3)
    args->arg_type[2] = STRING_RESULT;

  /*
   * The regex is compiled once, here, so the pattern must already be
   * available.  The server passes a NULL pointer for an argument whose
   * value is not known at init time (non-constant expression or SQL NULL).
   */
  if (args->args[1] == NULL) {
    strcpy(message, "mregexp(): pat must be a constant, non-NULL string");
    return 1;
  }

  enc = DEFAULT_ENCODING;
  if (args->arg_count == 3) {
    typedef struct {
      const char *key;
      OnigEncoding onig_enc;
    } enctab_type;

    static const enctab_type enctab[] = {
      /* mysql character encoding name */
      { "sjis"       , ONIG_ENCODING_SJIS },
      { "cp932"      , ONIG_ENCODING_SJIS },
      { "ujis"       , ONIG_ENCODING_EUC_JP },
      { "eucjpms"    , ONIG_ENCODING_EUC_JP },
      { "utf8"       , ONIG_ENCODING_UTF8 },

      { "latin1"     , ONIG_ENCODING_ISO_8859_1 },
      { "latin2"     , ONIG_ENCODING_ISO_8859_2 },
      { "greek"      , ONIG_ENCODING_ISO_8859_7 },
      { "hebrew"     , ONIG_ENCODING_ISO_8859_8 },
      { "latin5"     , ONIG_ENCODING_ISO_8859_9 },
      { "latin7"     , ONIG_ENCODING_ISO_8859_13 },
      { "euckr"      , ONIG_ENCODING_EUC_KR },
      { "koi8r"      , ONIG_ENCODING_KOI8_R },

      /* generic character encoding name */
      { "Shift_JIS"  , ONIG_ENCODING_SJIS },
      { "EUC-JP"     , ONIG_ENCODING_EUC_JP },
      { "UTF-8"      , ONIG_ENCODING_UTF8 },

      { "UTF-16BE"   , ONIG_ENCODING_UTF16_BE },
      { "UTF-16LE"   , ONIG_ENCODING_UTF16_LE },
      { "UTF-32BE"   , ONIG_ENCODING_UTF32_BE },
      { "UTF-32LE"   , ONIG_ENCODING_UTF32_LE },

      { "ASCII"      , ONIG_ENCODING_ASCII },
      { "ISO-8859-1" , ONIG_ENCODING_ISO_8859_1 },
      { "ISO-8859-2" , ONIG_ENCODING_ISO_8859_2 },
      { "ISO-8859-3" , ONIG_ENCODING_ISO_8859_3 },
      { "ISO-8859-4" , ONIG_ENCODING_ISO_8859_4 },
      { "ISO-8859-5" , ONIG_ENCODING_ISO_8859_5 },
      { "ISO-8859-6" , ONIG_ENCODING_ISO_8859_6 },
      { "ISO-8859-7" , ONIG_ENCODING_ISO_8859_7 },
      { "ISO-8859-8" , ONIG_ENCODING_ISO_8859_8 },
      { "ISO-8859-9" , ONIG_ENCODING_ISO_8859_9 },
      { "ISO-8859-10", ONIG_ENCODING_ISO_8859_10 },
      { "ISO-8859-11", ONIG_ENCODING_ISO_8859_11 },
      { "ISO-8859-13", ONIG_ENCODING_ISO_8859_13 },
      { "ISO-8859-14", ONIG_ENCODING_ISO_8859_14 },
      { "ISO-8859-15", ONIG_ENCODING_ISO_8859_15 },
      { "ISO-8859-16", ONIG_ENCODING_ISO_8859_16 },
      { "EUC-TW"     , ONIG_ENCODING_EUC_TW },
      { "EUC-KR"     , ONIG_ENCODING_EUC_KR },
      { "EUC-CN"     , ONIG_ENCODING_EUC_CN },
      { "KOI8-R"     , ONIG_ENCODING_KOI8_R },
      { "Big5"       , ONIG_ENCODING_BIG5 },
      { "GB18030"    , ONIG_ENCODING_GB18030 },
    };
    const char *name = args->args[2];
    size_t name_len = args->lengths[2];
    size_t i;

    /* Same constraint as the pattern: the name is needed at init time. */
    if (name == NULL) {
      strcpy(message, "mregexp(): enc must be a constant, non-NULL string");
      return 1;
    }

    /*
     * Case-insensitive, whole-name match.  Comparing the lengths first
     * prevents a short argument such as "u" or "" from matching whichever
     * table entry it happens to be a prefix of.  A name that is not in
     * the table leaves enc at DEFAULT_ENCODING.
     */
    for (i = 0; i < sizeof(enctab) / sizeof(enctab[0]); i++) {
      if (strlen(enctab[i].key) == name_len &&
          strncasecmp(enctab[i].key, name, name_len) == 0) {
        enc = enctab[i].onig_enc;
        break;
      }
    }
  }

  /*
   * Use the length supplied by the server rather than strlen(): patterns
   * in encodings such as UTF-16/UTF-32 contain 0x00 bytes.
   */
  pat = (unsigned char *)args->args[1];
  r = onig_new(&reg, pat, pat + args->lengths[1],
               ONIG_OPTION_DEFAULT, enc, ONIG_SYNTAX_DEFAULT, &einfo);

  if (r != ONIG_NORMAL) { /* error */
    unsigned char s[ONIG_MAX_ERROR_MESSAGE_LEN];
    onig_error_code_to_str(s, r, &einfo);
    strcpy(message, (char *)s);
    return 1;
  }

  initid->maybe_null = 0;
  initid->ptr = (char *)reg;
  return 0;
}

#ifdef DEBUG
/* Debug aid: write n bytes at p to stderr as text, then as a hex dump. */
static void mregexp_debug_dump(const char *p, unsigned long n) {
  unsigned long i;
  if (p == NULL)
    return;
  fprintf(stderr, "%.*s\n", (int)n, p);
  for (i = 0; i < n; i++) {
    fprintf(stderr, "%02X ", (unsigned char)p[i]);
  }
  fprintf(stderr, "\n");
}
#endif

/*
 * mregexp - row function of the mregexp() UDF.
 *
 * Searches args->args[0] (args->lengths[0] bytes) with the regex stored in
 * initid->ptr.
 *
 * Returns 1 if the pattern matches anywhere in the subject, 0 if it does
 * not match or the subject is NULL.  On an Oniguruma search error the
 * message is written to stderr, *error is set to 1 and -1 is returned.
 */
longlong mregexp(UDF_INIT *initid,
                  UDF_ARGS *args,
                  char *is_null,
                  char *error) {
  regex_t *reg = (regex_t *)initid->ptr;
  unsigned char *str, *end;
  int r;

  if (args->args[0] == NULL)
    return 0;

  /*
   * Search the argument buffer in place.  onig_search() is bounded by the
   * explicit end pointer, so no NUL-terminated copy is needed; this also
   * keeps subjects that contain 0x00 bytes (e.g. UTF-16/UTF-32 text)
   * intact and avoids a stack buffer sized by the input.
   */
  str = (unsigned char *)args->args[0];
  end = str + args->lengths[0];

  DPRINT(">>mregexp");
#ifdef DEBUG
  mregexp_debug_dump(args->args[0], args->lengths[0]);
  mregexp_debug_dump(args->args[1], args->lengths[1]);
#endif

  /* Forward search over the whole subject: start at str, range up to end. */
  r = onig_search(reg, str, end, str, end, NULL, ONIG_OPTION_NONE);

  if (r >= 0) {
    return 1;
  } else if (r == ONIG_MISMATCH) {
    return 0;
  } else { /* error */
    unsigned char s[ONIG_MAX_ERROR_MESSAGE_LEN];
    onig_error_code_to_str(s, r);
    fprintf(stderr, "mregexp ERROR: %s\n", (char *)s);
    *error = 1;
    return -1;
  }
}

/*
 * mregexp_deinit - release the regex compiled by mregexp_init().
 *
 * Frees the regex stored in initid->ptr, if any, and clears the pointer.
 *
 * onig_end() is deliberately not called: it shuts down library-wide
 * Oniguruma state, and the Oniguruma API notes that other API calls are
 * not guaranteed to work after it.  Invoking it at the end of every
 * statement would pull that state out from under any other statement
 * still using the library.
 */
void mregexp_deinit(UDF_INIT *initid) {
  DPRINT(">>mregexp_deinit");
  if (initid->ptr) {
    onig_free((regex_t *)initid->ptr);
    initid->ptr = NULL;
  }
}

/*
 * mregexp_version - UDF returning a version banner.
 *
 * Formats "mregexp-<version> [<default encoding>] (oniguruma-<version>)"
 * into result (at most 76 bytes including the terminator), stores the
 * resulting string length in *length and returns result.  The encoding
 * label is "Shift_JIS", "EUC-JP" or "UTF-8" when DEFAULT_ENCODING is one
 * of those, and "other" for anything else.
 */
char *mregexp_version(UDF_INIT *initid,
                      UDF_ARGS *args,
                      char *result,
                      unsigned long *length,
                      char *is_null,
                      char *error) {
  const char *enc_label = "other";

  if (DEFAULT_ENCODING == ONIG_ENCODING_SJIS)
    enc_label = "Shift_JIS";
  else if (DEFAULT_ENCODING == ONIG_ENCODING_EUC_JP)
    enc_label = "EUC-JP";
  else if (DEFAULT_ENCODING == ONIG_ENCODING_UTF8)
    enc_label = "UTF-8";

  snprintf(result, 76, "mregexp-%s [%s] (oniguruma-%s)",
           MREGEXP_VERSION, enc_label, onig_version());
  *length = strlen(result);
  return result;
}
