/* 
 *  IMS Open Corpus Workbench (CWB)
 *  Copyright (C) 1993-2006 by IMS, University of Stuttgart
 *  Copyright (C) 2007-     by the respective contributers (see file AUTHORS)
 * 
 *  This program is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the
 *  Free Software Foundation; either version 2, or (at your option) any later
 *  version.
 * 
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
 *  Public License for more details (in the file "COPYING", or available via
 *  WWW at http://www.gnu.org/copyleft/gpl.html).
 */
/*
 *  Windows/Unicode-compatibility extensions to CWB in this file
 *  Copyright (C) 2010      by ANR Textométrie, ENS de Lyon
 */

/* Forward declaration of a printf-style variadic output routine that is defined
 * outside this file (presumably R's console printer -- TODO confirm); declared
 * by hand here rather than pulled in through a header. */
void Rprintf(const char *format, ...);

#include <ctype.h>

#include <glib.h>

#include "globals.h"

#include "special-chars.h"



/* ---------------------------------------------------- */
/* composite tables that can automatically be generated */
/* ---------------------------------------------------- */

/**
 * Per-charset mapping tables for the case where NEITHER case NOR diacritics
 * are to be stripped.
 *
 * These are composite tables that are filled in lazily, i.e. only when first
 * needed. The entry of identity_tab_init with the same index is a boolean
 * flag recording whether the corresponding table has been generated yet;
 * all flags start out as false (0).
 *
 * Index both arrays with a CorpusCharset value.
 */
unsigned char identity_tab[unknown_charset][256];
int identity_tab_init[unknown_charset] = { 0 };

/**
 * Per-charset mapping tables for the case where BOTH case AND diacritics
 * are to be stripped.
 *
 * These are composite tables that are filled in lazily, i.e. only when first
 * needed. The entry of nocase_nodiac_tab_init with the same index is a boolean
 * flag recording whether the corresponding table has been generated yet;
 * all flags start out as false (0).
 *
 * Index both arrays with a CorpusCharset value.
 */
unsigned char nocase_nodiac_tab[unknown_charset][256];
int nocase_nodiac_tab_init[unknown_charset] = { 0 };


/* ------------------------------------------------------------------------------ */
/* tables that are created at compile time, contain mapping data for all charsets */
/* ------------------------------------------------------------------------------ */

/**
 * Array of tables mapping a character (the index) to the
 * equivalent character without any accents (the value).
 *
 * The tables only strip diacritics: the case of a letter is never changed,
 * and bytes that are not accented letters map to themselves.
 *
 * There are as many tables as there are possible values of
 * CorpusCharset. Moreover, tables must always be in the
 * same order as the values of CorpusCharset are declared in.
 *
 * This means starting at ascii == 0 and working up through the
 * canonical order that is observable in cl.h
 *
 * Use a CorpusCharset value as the index into this array.
 *
 * @see CorpusCharset
 */
unsigned char nodiac_tab[unknown_charset][256] = {

    /* ASCII: identity as there are no accented chars less than 0x80;
     * any bytes in the upper half are malformed, so just pass them through;
     * table only needed to make the array work correctly */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
       81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99,100,
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,160,
      161,162,163,164,165,166,167,168,169,170,
      171,172,173,174,175,176,177,178,179,180,
      181,182,183,184,185,186,187,188,189,190,
      191,192,193,194,195,196,197,198,199,200,
      201,202,203,204,205,206,207,208,209,210,
      211,212,213,214,215,216,217,218,219,220,
      221,222,223,224,225,226,227,228,229,230,
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* latin1 */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
       81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99,100,
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,160,
      161,162,163,164,165,166,167,168,169,170,
      171,172,173,174,175,176,177,178,179,180,
      181,182,183,184,185,186,187,188,189,190,
      191,
           65, 65, 65, 65, 65, 65, 65, 67, 69, /* uppercase */
       69, 69, 69, 73, 73, 73, 73, 68, 78, 79,
       79, 79, 79, 79,215, 79, 85, 85, 85, 85,
       89, 84,115,                  /* thorn -> 'T', szlig -> 's' */
                   97, 97, 97, 97, 97, 97, 97, /* lowercase */
       99,101,101,101,101,105,105,105,105,100,
      110,111,111,111,111,111,247,111,117,117,
      117,117,121,116,121
    },
    /* latin2 */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
       81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99,100,
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,160,

       65,162, 76,164, 76, 83,167,168, 83, 83, /* uppercase, first round (interspersed with non-letters) */
       84, 90,173, 90, 90,176,
                               97,178,108,180, /* lowercase, first round (interspersed with non-letters) */
      108,115,183,184,115,115,116,122,189,122,
      122,
           82, 65, 65, 65, 65, 76, 67, 67, 67, /* uppercase, main block */
       69, 69, 69, 69, 73, 73, 68, 68, 78, 78,
       79, 79, 79, 79,215, 82, 85, 85, 85, 85,
       89, 84,115,                             /* szlig -> 's' for Latin1 compatibility */
                  114, 97, 97, 97, 97,108, 99, /* lowercase, main block */
       99, 99,101,101,101,101,105,105,100,100,
      110,110,111,111,111,111,247,114,117,117,
      117,117,121,116,255
    },
    /* latin3 */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
       81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99,100,
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,
                                          160, /* uppercase, first round (interspersed with non-letters) */
       72,162,163,164,165, 72,167,168, 73, 83,
       71, 74,173,174, 90,
                          176,104,178,179,180, /* lowercase, first round (interspersed with non-letters) */
      181,104,183,184,105,115,103,106,189,190,
      122,
           65, 65, 65,195, 65, 67, 67, 67, 69, /* uppercase, main block */
       69, 69, 69, 73, 73, 73, 73,208, 78, 79,
       79, 79, 71, 79,215, 71, 85, 85, 85, 85,
       85, 83,115,                             /* szlig -> 's' for Latin1 compatibility */
                   97, 97, 97,227, 97, 99, 99, /* lowercase, main block */
       99,101,101,101,101,105,105,105,105,240,
      110,111,111,111,103,111,247,103,117,117,
      117,117,117,115,255
    },
    /* latin4 */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
       81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99,100,
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,
                                          160, /* uppercase, first round (interspersed with non-letters) */
       65,113, 82,164, 73, 76,167,168, 83, 69, /* note lowercase "kra" is mapped to "q" because it's written as such nowadays */
       71, 84,173, 90,175,
                          176, 97,178,114,180, /* lowercase, first round (interspersed with non-letters) */
      105,108,183,184,115,101,103,116, 78,122,
      110,
           65, 65, 65, 65, 65, 65, 65, 73, 67, /* uppercase, main block */
       69, 69, 69, 69, 73, 73, 73, 68, 78, 79,
       75, 79, 79, 79,215, 79, 85, 85, 85, 85,
       85, 85,115,                             /* szlig -> 's' for Latin1 compatibility */
                   97, 97, 97, 97, 97, 97, 97, /* lowercase, main block */
      105, 99,101,101,101,101,105,105,105,100,
      110,111,107,111,111,111,247,111,117,117,
      117,117,117,117,255
    },
    /* cyrillic */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
       81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99,100,
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,160,
      /* this is where the Cyrillic block actually begins. Note, only
       * characters given a decomposition with triple-equals in the
       * Unicode standard are mapped. */
      181,162,179,164,165,166,166,168,169,170,
      171,186,173,195,175,176,177,178,179,180,
      181,182,183,184,184,186,187,188,189,190,
      191,192,193,194,195,196,197,198,199,200,
      201,202,203,204,205,206,207,208,209,210,
      211,212,213,214,215,216,216,218,219,220,
      221,222,223,224,225,226,227,228,229,230,
      231,232,233,234,235,236,237,238,239,240,
      213,242,211,244,245,246,246,248,249,250,
      251,218,253,227,255
    },
    /* arabic */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
       81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99,100,
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,160,
      161,162,163,164,165,166,167,168,169,170,
      171,172,173,174,175,176,177,178,179,180,
      181,182,183,184,185,186,187,188,189,190,
      191,192,
      /* letters with madda/hamza are decomposed in Unicode standard */
              193,199,199,232,199,234,
      /* rest of Arabic stays as it is. */
                                      199,200,
      201,202,203,204,205,206,207,208,209,210,
      211,212,213,214,215,216,217,218,219,220,
      221,222,223,224,225,226,227,228,229,230,
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* greek */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
       81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99,100,
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,160,
      161,162,163,164,165,166,167,168,169,170,
      171,172,173,174,175,176,177,178,179,180,
      181,
          /* 182-190: accented capitals mixed with the non-letters 183, 187
           * and 189; 188 is capital omicron with tonos -> Omicron (207),
           * 189 is the vulgar fraction one half and must stay unchanged */
          193,183,197,199,201,187,207,189,213, /* misc upper/lower */
      217,233,
              193,194,195,196,197,198,199,200, /* uppercase */
      201,202,203,204,205,206,207,208,209,210,
      211,212,213,214,215,216,217,201,213,
                                          225, /* lowercase */
      229,231,233,245,225,226,227,228,229,230,
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,233,
      245,239,245,249,255
    },
    /* hebrew : no accented characters */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
       81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99,100,
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,160,
      161,162,163,164,165,166,167,168,169,170,
      171,172,173,174,175,176,177,178,179,180,
      181,182,183,184,185,186,187,188,189,190,
      191,192,193,194,195,196,197,198,199,200,
      201,202,203,204,205,206,207,208,209,210,
      211,212,213,214,215,216,217,218,219,220,
      221,222,223,224,225,226,227,228,229,230,
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* latin5 (8859-9): same as latin1 except for d0,dd,de,f0,fd,fe
     * (which are all accented characters) */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
       81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99,100,
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,160,
      161,162,163,164,165,166,167,168,169,170,
      171,172,173,174,175,176,177,178,179,180,
      181,182,183,184,185,186,187,188,189,190,
      191,
           65, 65, 65, 65, 65, 65, 65, 67, 69, /* uppercase */
       69, 69, 69, 73, 73, 73, 73,
                                   71,         /* != latin1 */
                                       78, 79,
       79, 79, 79, 79,215, 79, 85, 85, 85, 85,
       73, 83,                                 /* != latin1 */
              115,                             /* szlig -> 's' */
                   97, 97, 97, 97, 97, 97, 97, /* lowercase */
       99,101,101,101,101,105,105,105,105,
                                          103, /* != latin1 */
      110,111,111,111,111,111,247,111,117,117,
      117,117,
              105,115,                         /* != latin1 */
                      121
    },
    /* latin6 (8859-10) */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
       81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99,100,
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,
                                          160, /* uppercase small block */
       65, 69, 71, 73, 73, 75,167, 76, 68, 83,
       84, 90,173, 85, 78,
                          176, 97,101,103,105, /* lowercase small block */
      105,107,183,108,100,115,116,122,189,117,
      110,
           65, 65, 65, 65, 65, 65, 65, 73, 67, /* uppercase main block */
       69, 69, 69, 69, 73, 73, 73, 68, 78, 79,
       79, 79, 79, 79, 85, 79, 85, 85, 85, 85,
       89, 84,115,                             /* thorn -> 'T', szlig -> 's' */
                   97, 97, 97, 97, 97, 97, 97, /* lowercase main block */
      105, 99,101,101,101,101,105,105,105,100,
      110,111,111,111,111,111,117,111,117,117,
      117,117,121,116,113                      /* note lowercase "kra" is mapped to "q" because it's written as such nowadays */
    },
    /* latin7 (8859-13) */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
       81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99,100,
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,
                                          160, /* uppercase small block */
      161,162,163,164,165,166,167, 79,169, 82,
      171,172,173,174, 65,
                          176,177,178,179,180, /* lowercase small block */
      181,182,183,111,185,114,187,188,189,190,
       97,
           65, 73, 65, 67, 65, 65, 69, 69, 67, /* uppercase main block */
       69, 90, 69, 71, 75, 73, 76, 83, 78, 78,
       79, 79, 79, 79,215, 85, 76, 83, 85, 85,
       90, 90,115,                             /* szlig -> 's' for Latin1 compatibility */
                   97,105, 97, 99, 97, 97,101, /* lowercase main block */
      101, 99,101,122,101,103,107,105,108,115,
      110,110,111,111,111,111,247,117,108,115,
      117,117,122,122,255
    },
    /* latin8 (8859-14): differs from Latin1
     * in 160-191; 208,215,222; 240,247,254 */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
       81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99,100,
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,
                                          160, /* mixed accented */
       66, 98,163, 67, 99, 68,167, 87,169, 87, /* 170 is capital W-acute -> 'W' (case is preserved) */
      100, 89,173,174, 89, 70,102, 71,103, 77,
      109,182, 80,119,112,119, 83,121, 87,119,
      115,                                     /* 191 is small s-dot-above -> 's' (case is preserved) */
           65, 65, 65, 65, 65, 65, 65, 67, 69, /* uppercase */
       69, 69, 69, 73, 73, 73, 73, 87, 78, 79,
       79, 79, 79, 79, 84, 79, 85, 85, 85, 85,
       89, 89,115,                  /* Y-circumflex -> 'Y', szlig -> 's' */
                   97, 97, 97, 97, 97, 97, 97, /* lowercase */
       99,101,101,101,101,105,105,105,105,119,
      110,111,111,111,111,111,116,111,117,117,
      117,117,121,121,121
    },
    /* latin9 (8859-15): almost the same as Latin1, the
     * differences are a4,a6,a8,b4,b8,bc,bd,be. */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
       81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
       91, 92, 93, 94, 95, 96, 97, 98, 99,100,
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,
                                          160, /* block containing the 7 chars */
      161,162,163,164,165, 83,167,115,169,170, /* that differ from Latin1 */
      171,172,173,174,175,176,177,178,179, 90,
      181,182,183,122,185,186,187, 79,111, 89,
      191,
           65, 65, 65, 65, 65, 65, 65, 67, 69, /* uppercase */
       69, 69, 69, 73, 73, 73, 73, 68, 78, 79,
       79, 79, 79, 79,215, 79, 85, 85, 85, 85,
       89, 84,115,                  /* thorn -> 'T', szlig -> 's' */
                   97, 97, 97, 97, 97, 97, 97, /* lowercase */
       99,101,101,101,101,105,105,105,105,100,
      110,111,111,111,111,111,247,111,117,117,
      117,117,121,116,121
    },

    /* UTF8: a dummy table which should never be used. The big wall
     * of zeroes should function as a reminder of this! */
    {
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    }
}; /* end initialise: nodiac tables */

/**
 * Array of tables mapping a character (the index) to the
 * equivalent character in lowercase (the value).
 *
 * There are as many tables as there are possible values of
 * CorpusCharset. Moreover, tables must always be in the
 * same order as the values of CorpusCharset are declared in.
 *
 * This means starting at ascii == 0 and working up through the canonical
 * order that is observable in cl.h. The tables below appear in the order:
 * ascii, latin1, latin2, latin3, latin4, cyrillic, arabic, greek, hebrew,
 * latin5, latin6, latin7, latin8, latin9, utf8.
 *
 * In every table, bytes 0-127 are mapped identically: 'A'-'Z' (65-90) fold
 * to 'a'-'z' (97-122) and all other bytes map to themselves. The tables
 * differ only in the upper half (128-255). Characters without a lowercase
 * counterpart map to themselves.
 *
 * The final (utf8) table is a dummy consisting solely of zeroes.
 *
 * Use a CorpusCharset value as the index into this array.
 *
 * @see CorpusCharset
 */
unsigned char nocase_tab[unknown_charset][256] = {

    /* ascii : identity in the top half (to let "bad" characters pass through);
     * same as latin1 in the bottom half. This makes it safe as a fallback for UTF8. */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64,
                       97, 98, 99,100,101,102, /* ABCDEF -> abcdef */
      103,104,105,106,107,108,109,110,111,112, /* GHIJKLMNOP -> ghijklmnop */
      113,114,115,116,117,118,119,120,121,122, /* QRSTUVWXYZ -> qrstuvwxyz */

       91, 92, 93, 94, 95, 96, 97, 98, 99,100, /* normal */
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,160,
      161,162,163,164,165,166,167,168,169,170,
      171,172,173,174,175,176,177,178,179,180,
      181,182,183,184,185,186,187,188,189,190,
      191,192,193,194,195,196,197,198,199,200,
      201,202,203,204,205,206,207,208,209,210,
      211,212,213,214,215,216,217,218,219,220,
      221,222,223,224,225,226,227,228,229,230,
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* latin1 */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64,
                       97, 98, 99,100,101,102, /* ABCDEF -> abcdef */
      103,104,105,106,107,108,109,110,111,112, /* GHIJKLMNOP -> ghijklmnop */
      113,114,115,116,117,118,119,120,121,122, /* QRSTUVWXYZ -> qrstuvwxyz */

       91, 92, 93, 94, 95, 96, 97, 98, 99,100, /* normal */
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,160,
      161,162,163,164,165,166,167,168,169,170,
      171,172,173,174,175,176,177,178,179,180,
      181,182,183,184,185,186,187,188,189,190,
      191,
          224,225,226,227,228,229,230,231,232, /* 192-222 -> x+32 but not 215 */
      233,234,235,236,237,238,239,240,241,242,
      243,244,245,246,215,248,249,250,251,252,
      253,254,
              223,224,225,226,227,228,229,230, /* normal */
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* latin2 */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64,
                       97, 98, 99,100,101,102, /* ABCDEF -> abcdef */
      103,104,105,106,107,108,109,110,111,112, /* GHIJKLMNOP -> ghijklmnop */
      113,114,115,116,117,118,119,120,121,122, /* QRSTUVWXYZ -> qrstuvwxyz */

       91, 92, 93, 94, 95, 96, 97, 98, 99,100, /* normal */
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,
                                          160, /* 160-175 -> x+16 (selected only!) */
      177,162,179,164,181,182,167,168,185,186,
      187,188,173,190,191,
                          176,177,178,179,180, /* normal */
      181,182,183,184,185,186,187,188,189,190,
      191,
          224,225,226,227,228,229,230,231,232, /* 192-222 -> x+32 but not 215 */
      233,234,235,236,237,238,239,240,241,242,
      243,244,245,246,215,248,249,250,251,252,
      253,254,
              223,224,225,226,227,228,229,230, /* normal */
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* latin3 */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64,
                       97, 98, 99,100,101,102, /* ABCDEF -> abcdef */
      103,104,105,106,107,108,109,110,111,112, /* GHIJKLMNOP -> ghijklmnop */
      113,114,115,116,117,118,119,120,121,122, /* QRSTUVWXYZ -> qrstuvwxyz */

       91, 92, 93, 94, 95, 96, 97, 98, 99,100, /* normal */
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,
                                          160, /* 160-175 -> x+16 (selected only!) */
      177,162,163,164,165,182,167,168,185,186,
      187,188,173,174,191,
                          176,177,178,179,180, /* normal */
      181,182,183,184,185,186,187,188,189,190,
      191,
          224,225,226,227,228,229,230,231,232, /* 192-222 -> x+32 but not 215 */
      233,234,235,236,237,238,239,240,241,242,
      243,244,245,246,215,248,249,250,251,252,
      253,254,
      /* note, some of the above characters don't actually exist... */
              223,224,225,226,227,228,229,230, /* normal */
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* latin4 */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64,
                       97, 98, 99,100,101,102, /* ABCDEF -> abcdef */
      103,104,105,106,107,108,109,110,111,112, /* GHIJKLMNOP -> ghijklmnop */
      113,114,115,116,117,118,119,120,121,122, /* QRSTUVWXYZ -> qrstuvwxyz */

       91, 92, 93, 94, 95, 96, 97, 98, 99,100, /* normal */
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,
                                          160, /* 160-175 -> x+16 (selected only!) */
      177,162,179,164,181,182,167,168,185,186, /* kra@162 has no capitalisation... */
      187,188,173,190,175,
                          176,177,178,179,180, /* normal */
      181,182,183,184,185,186,187,188,191,190, /* exception: eng+=2 */
      191,
          224,225,226,227,228,229,230,231,232, /* 192-222 -> x+32 but not 215 */
      233,234,235,236,237,238,239,240,241,242,
      243,244,245,246,215,248,249,250,251,252,
      253,254,
              223,224,225,226,227,228,229,230, /* normal */
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* cyrillic */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64,
                       97, 98, 99,100,101,102, /* ABCDEF -> abcdef */
      103,104,105,106,107,108,109,110,111,112, /* GHIJKLMNOP -> ghijklmnop */
      113,114,115,116,117,118,119,120,121,122, /* QRSTUVWXYZ -> qrstuvwxyz */

       91, 92, 93, 94, 95, 96, 97, 98, 99,100, /* normal */
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,160,

      241,242,243,244,245,246,247,248,249,250, /* 161-175 += 80, except 173 */
      251,252,173,254,255,
                          208,209,210,211,212, /* 176-207 += 32 */
      213,214,215,216,217,218,219,220,221,222,
      223,224,225,226,227,228,229,230,231,232,
      233,234,235,236,237,238,239,
                                  208,209,210, /* normal */
      211,212,213,214,215,216,217,218,219,220,
      221,222,223,224,225,226,227,228,229,230,
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* arabic : Arabic lacks case, so map everything in upper half to itself. */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64,
                       97, 98, 99,100,101,102, /* ABCDEF -> abcdef */
      103,104,105,106,107,108,109,110,111,112, /* GHIJKLMNOP -> ghijklmnop */
      113,114,115,116,117,118,119,120,121,122, /* QRSTUVWXYZ -> qrstuvwxyz */

       91, 92, 93, 94, 95, 96, 97, 98, 99,100, /* normal */
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,160,
      161,162,163,164,165,166,167,168,169,170,
      171,172,173,174,175,176,177,178,179,180,
      181,182,183,184,185,186,187,188,189,190,
      191,192,193,194,195,196,197,198,199,200,
      201,202,203,204,205,206,207,208,209,210,
      211,212,213,214,215,216,217,218,219,220,
      221,222,223,224,225,226,227,228,229,230,
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* greek */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64,
                       97, 98, 99,100,101,102, /* ABCDEF -> abcdef */
      103,104,105,106,107,108,109,110,111,112, /* GHIJKLMNOP -> ghijklmnop */
      113,114,115,116,117,118,119,120,121,122, /* QRSTUVWXYZ -> qrstuvwxyz */

       91, 92, 93, 94, 95, 96, 97, 98, 99,100, /* normal */
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,160,
      161,162,163,164,165,166,167,168,169,170,
      171,172,173,174,175,176,177,178,179,180,
      181,
          220,183,221,222,223,187,252,189,253, /* misc uppercase */
      254,192,
              225,226,227,228,229,230,231,232, /* 193-219 += 32 (main uppercase) */
      233,234,235,236,237,238,239,240,241,242,
      243,244,245,246,247,248,249,250,251,
                                          220, /* normal */
      221,222,223,224,225,226,227,228,229,230,
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* hebrew : Hebrew lacks case, so map everything in upper half to itself. */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64,
                       97, 98, 99,100,101,102, /* ABCDEF -> abcdef */
      103,104,105,106,107,108,109,110,111,112, /* GHIJKLMNOP -> ghijklmnop */
      113,114,115,116,117,118,119,120,121,122, /* QRSTUVWXYZ -> qrstuvwxyz */

       91, 92, 93, 94, 95, 96, 97, 98, 99,100, /* normal */
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,160,
      161,162,163,164,165,166,167,168,169,170,
      171,172,173,174,175,176,177,178,179,180,
      181,182,183,184,185,186,187,188,189,190,
      191,192,193,194,195,196,197,198,199,200,
      201,202,203,204,205,206,207,208,209,210,
      211,212,213,214,215,216,217,218,219,220,
      221,222,223,224,225,226,227,228,229,230,
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* latin5 : same algorithm as latin1
     * (differences still at case offset of 32) */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64,
                       97, 98, 99,100,101,102, /* ABCDEF -> abcdef */
      103,104,105,106,107,108,109,110,111,112, /* GHIJKLMNOP -> ghijklmnop */
      113,114,115,116,117,118,119,120,121,122, /* QRSTUVWXYZ -> qrstuvwxyz */

       91, 92, 93, 94, 95, 96, 97, 98, 99,100, /* normal */
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,160,
      161,162,163,164,165,166,167,168,169,170,
      171,172,173,174,175,176,177,178,179,180,
      181,182,183,184,185,186,187,188,189,190,
      191,
          224,225,226,227,228,229,230,231,232, /* 192-222 -> x+32 but not 215 */
      233,234,235,236,237,238,239,240,241,242,
      243,244,245,246,215,248,249,250,251,252,
      253,254,
              223,224,225,226,227,228,229,230, /* normal */
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* latin6 */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64,
                       97, 98, 99,100,101,102, /* ABCDEF -> abcdef */
      103,104,105,106,107,108,109,110,111,112, /* GHIJKLMNOP -> ghijklmnop */
      113,114,115,116,117,118,119,120,121,122, /* QRSTUVWXYZ -> qrstuvwxyz */

       91, 92, 93, 94, 95, 96, 97, 98, 99,100, /* normal */
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,
                                          160, /* 160-175 += 16 but not 160,167,173 */
      177,178,179,180,181,182,167,184,185,186,
      187,188,173,190,191,
                          176,177,178,179,180, /* normal */
      181,182,183,184,185,186,187,188,189,190,
      191,
          /* 192-222 -> x+32 INC 215: unlike latin1, position 215 in this
           * charset is a letter (capital U with tilde), with its lowercase
           * form at 247, so it must be folded like its neighbours. */
          224,225,226,227,228,229,230,231,232,
      233,234,235,236,237,238,239,240,241,242,
      243,244,245,246,247,248,249,250,251,252,
      253,254,
              223,224,225,226,227,228,229,230, /* normal */
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* latin7 */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64,
                       97, 98, 99,100,101,102, /* ABCDEF -> abcdef */
      103,104,105,106,107,108,109,110,111,112, /* GHIJKLMNOP -> ghijklmnop */
      113,114,115,116,117,118,119,120,121,122, /* QRSTUVWXYZ -> qrstuvwxyz */

       91, 92, 93, 94, 95, 96, 97, 98, 99,100, /* normal */
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,
                                          160, /* 3 random chars += 16 in this range */
      161,162,163,164,165,166,167,184,169,186,
      171,172,173,174,191,176,177,178,179,180,
      181,182,183,184,185,186,187,188,189,190,
      191,
          224,225,226,227,228,229,230,231,232, /* 192-222 -> x+32 but not 215 */
      233,234,235,236,237,238,239,240,241,242,
      243,244,245,246,215,248,249,250,251,252,
      253,254,
              223,224,225,226,227,228,229,230, /* normal */
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* latin8 */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64,
                       97, 98, 99,100,101,102, /* ABCDEF -> abcdef */
      103,104,105,106,107,108,109,110,111,112, /* GHIJKLMNOP -> ghijklmnop */
      113,114,115,116,117,118,119,120,121,122, /* QRSTUVWXYZ -> qrstuvwxyz */

       91, 92, 93, 94, 95, 96, 97, 98, 99,100, /* normal */
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,
                                          160, /* odd mix of +=1, +=16, etc */
      162,162,163,165,165,171,167,184,169,186,
      171,188,173,174,255,177,177,179,179,181,
      181,182,185,184,185,186,191,188,190,190,
      191,
          224,225,226,227,228,229,230,231,232, /* 192-222 -> x+32 INC 215 */
      233,234,235,236,237,238,239,240,241,242,
      243,244,245,246,247,248,249,250,251,252,
      253,254,
              223,224,225,226,227,228,229,230, /* normal */
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* latin9 */
    {
        0,
        1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
       11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
       31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64,
                       97, 98, 99,100,101,102, /* ABCDEF -> abcdef */
      103,104,105,106,107,108,109,110,111,112, /* GHIJKLMNOP -> ghijklmnop */
      113,114,115,116,117,118,119,120,121,122, /* QRSTUVWXYZ -> qrstuvwxyz */

       91, 92, 93, 94, 95, 96, 97, 98, 99,100, /* normal */
      101,102,103,104,105,106,107,108,109,110,
      111,112,113,114,115,116,117,118,119,120,
      121,122,123,124,125,126,127,128,129,130,
      131,132,133,134,135,136,137,138,139,140,
      141,142,143,144,145,146,147,148,149,150,
      151,152,153,154,155,156,157,158,159,
                                          160, /* random oddities in this block */
      161,162,163,164,165,168,167,168,169,170,
      171,172,173,174,175,176,177,178,179,184,
      181,182,183,184,185,186,187,189,189,255,
      191,
          224,225,226,227,228,229,230,231,232, /* 192-222 -> x+32 but not 215 */
      233,234,235,236,237,238,239,240,241,242,
      243,244,245,246,215,248,249,250,251,252,
      253,254,
              223,224,225,226,227,228,229,230, /* normal */
      231,232,233,234,235,236,237,238,239,240,
      241,242,243,244,245,246,247,248,249,250,
      251,252,253,254,255
    },
    /* utf8 : a dummy table which should never be used. The big wall
     * of zeroes should function as a reminder of this! */
    {
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    }
};  /* end initialise: nocase tables */

/*
 * Next come the "checktables": each one contains booleans answering a question
 * about the characters of a charset. These tables should never be accessed
 * except via the dedicated functions.
 */

unsigned char checktable_is_alphanum[unknown_charset][256] = {
    {
        /* ascii :
        0 1 2 3 4 5 6 7 8 9 a b c d e f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x0... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x1... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x2... */
        1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* 0x3... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x4... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x5... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x6... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x7... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x8... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x9... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xc... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xd... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xe... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0  /* 0xf... */
    },
    {
        /* latin 1 :
        0 1 2 3 4 5 6 7 8 9 a b c d e f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x0... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x1... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x2... */
        1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* 0x3... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x4... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x5... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x6... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x7... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x8... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x9... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc... */
        1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1, /* 0xd... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xe... */
        1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1  /* 0xf... */
    },
    {
        /* latin 2 :
        0 1 2 3 4 5 6 7 8 9 a b c d e f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x0... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x1... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x2... */
        1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* 0x3... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x4... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x5... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x6... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x7... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x8... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x9... */
        0,1,0,1,0,1,1,0,0,1,1,1,1,0,1,1, /* 0xa... */
        0,1,0,1,0,1,1,0,0,1,1,1,1,0,1,1, /* 0xb... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc... */
        1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1, /* 0xd... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xe... */
        1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0  /* 0xf... */
    },
    {
        /* latin 3 :
        0 1 2 3 4 5 6 7 8 9 a b c d e f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x0... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x1... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x2... */
        1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* 0x3... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x4... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x5... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x6... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x7... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x8... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x9... */
        0,1,0,0,0,0,1,0,0,1,1,1,1,0,0,1, /* 0xa... */
        0,1,0,0,0,0,1,0,0,1,1,1,1,0,0,1, /* 0xb... */
        1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc... */
        0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1, /* 0xd... */
        1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xe... */
        0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0  /* 0xf... */
    },
    {
        /* latin 4 :
        0 1 2 3 4 5 6 7 8 9 a b c d e f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x0... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x1... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x2... */
        1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* 0x3... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x4... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x5... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x6... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x7... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x8... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x9... */
        0,1,1,1,0,1,1,0,0,1,1,1,1,0,1,0, /* 0xa... */
        0,1,0,1,0,1,1,0,0,1,1,1,1,1,1,1, /* 0xb... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc... */
        1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1, /* 0xd... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xe... */
        1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0  /* 0xf... */
    },
    {
        /* cyrillic :
        0 1 2 3 4 5 6 7 8 9 a b c d e f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x0... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x1... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x2... */
        1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* 0x3... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x4... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x5... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x6... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x7... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x8... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x9... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1, /* 0xa... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xb... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc... */
        1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1, /* 0xd... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xe... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1  /* 0xf... */
    },
    {
        /* arabic :
        0 1 2 3 4 5 6 7 8 9 a b c d e f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x0... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x1... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x2... */
        1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* 0x3... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x4... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x5... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x6... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x7... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x8... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x9... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0xd... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xe... */
        1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0  /* 0xf... */
    },
    {
        /* greek :
        0 1 2 3 4 5 6 7 8 9 a b c d e f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x0... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x1... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x2... */
        1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* 0x3... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x4... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x5... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x6... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x7... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x8... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x9... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa... */
        0,0,0,0,0,0,1,0,1,1,1,0,1,0,1,1, /* 0xb... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc... */
        1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xe... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0  /* 0xf... */
    },
    {
        /* hebrew :
        0 1 2 3 4 5 6 7 8 9 a b c d e f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x0... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x1... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x2... */
        1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* 0x3... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x4... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x5... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x6... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x7... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x8... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x9... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xc... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xd... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xe... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0  /* 0xf... */
    },
    {
        /* latin 5 :
        0 1 2 3 4 5 6 7 8 9 a b c d e f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x0... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x1... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x2... */
        1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* 0x3... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x4... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x5... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x6... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x7... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x8... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x9... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc... */
        1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1, /* 0xd... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xe... */
        1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1  /* 0xf... */
    },
    {
        /* latin 6 :
        0 1 2 3 4 5 6 7 8 9 a b c d e f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x0... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x1... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x2... */
        1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* 0x3... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x4... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x5... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x6... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x7... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x8... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x9... */
        0,1,1,1,1,1,1,0,1,1,1,1,1,0,1,1, /* 0xa... */
        0,1,1,1,1,1,1,0,1,1,1,1,1,0,1,1, /* 0xb... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xe... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1  /* 0xf... */
    },
    {
        /* latin 7 :
        0 1 2 3 4 5 6 7 8 9 a b c d e f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x0... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x1... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x2... */
        1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* 0x3... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x4... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x5... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x6... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x7... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x8... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x9... */
        0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1, /* 0xa... */
        0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1, /* 0xb... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc... */
        1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1, /* 0xd... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xe... */
        1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0  /* 0xf... */
    },
    {
        /* latin 8 :
        0 1 2 3 4 5 6 7 8 9 a b c d e f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x0... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x1... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x2... */
        1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* 0x3... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x4... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x5... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x6... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x7... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x8... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x9... */
        0,1,1,0,1,1,1,0,1,0,1,1,1,0,0,1, /* 0xa... */
        1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1, /* 0xb... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xe... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1  /* 0xf... */
    },
    {
        /* latin 9 :
        0 1 2 3 4 5 6 7 8 9 a b c d e f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x0... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x1... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x2... */
        1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* 0x3... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x4... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x5... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x6... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x7... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x8... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x9... */
        0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0, /* 0xa... */
        0,0,0,0,1,0,0,0,1,0,0,0,1,1,1,0, /* 0xb... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc... */
        1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1, /* 0xd... */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xe... */
        1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1  /* 0xf... */
    },
    {
        /* utf8 : dummy, same as ascii, but SHOULD NEVER BE QUERIED
        0 1 2 3 4 5 6 7 8 9 a b c d e f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x0... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x1... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x2... */
        1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* 0x3... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x4... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x5... */
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x6... */
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 0x7... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x8... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x9... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xc... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xd... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xe... */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0  /* 0xf... */
    }
};



/* endof massive character data built-ins! */

/* following are old versions, should not be needed any longer */

/* *
 * Table which translates latin-1 characters to lowercase.
 *
 * Use cl_string_maptable to access.
 * @see cl_string_maptable
 * /
unsigned char latin1_nocase_tab[256] = {
    0,  
    1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
   11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
   21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
   31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
   41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
   51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
   61, 62, 63, 64,
                   97, 98, 99,100,101,102, / * ABCDEF -> abcdef * /
  103,104,105,106,107,108,109,110,111,112, / * GHIJKLMNOP -> ghijklmnop * /
  113,114,115,116,117,118,119,120,121,122, / * QRSTUVWXYZ -> qrstuvwxyz * /
  
   91, 92, 93, 94, 95, 96, 97, 98, 99,100, / * normal * /
  101,102,103,104,105,106,107,108,109,110,
  111,112,113,114,115,116,117,118,119,120,
  121,122,123,124,125,126,127,128,129,130,
  131,132,133,134,135,136,137,138,139,140,
  141,142,143,144,145,146,147,148,149,150,
  151,152,153,154,155,156,157,158,159,160,
  161,162,163,164,165,166,167,168,169,170,
  171,172,173,174,175,176,177,178,179,180,
  181,182,183,184,185,186,187,188,189,190,
  191,
      224,225,226,227,228,229,230,231,232, / * 192-222 -> x+32 but not 215 * /
  233,234,235,236,237,238,239,240,241,242,
  243,244,245,246,215,248,249,250,251,252,
  253,254,
          223,224,225,226,227,228,229,230,
  231,232,233,234,235,236,237,238,239,240,
  241,242,243,244,245,246,247,248,249,250,
  251,252,253,254,255
};

/ **
 * Table which translates latin-1 characters
 * with diacritics to their [A-Za-z] "equivalents",
 * including s-set->s, thorn->t
 *
 * Use cl_string_maptable to access.
 * @see cl_string_maptable
 * /
unsigned char latin1_nodiac_tab[256] = {
    0,  
    1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
   11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
   21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
   31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
   41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
   51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
   61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
   71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
   81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
   91, 92, 93, 94, 95, 96, 97, 98, 99,100,
  101,102,103,104,105,106,107,108,109,110,
  111,112,113,114,115,116,117,118,119,120,
  121,122,123,124,125,126,127,128,129,130,
  131,132,133,134,135,136,137,138,139,140,
  141,142,143,144,145,146,147,148,149,150,
  151,152,153,154,155,156,157,158,159,160,
  161,162,163,164,165,166,167,168,169,170,
  171,172,173,174,175,176,177,178,179,180,
  181,182,183,184,185,186,187,188,189,190,
  191,
       65, 65, 65, 65, 65, 65, 65, 67, 69, / * uppercase * /
   69, 69, 69, 73, 73, 73, 73, 68, 78, 79,
   79, 79, 79, 79,215, 79, 85, 85, 85, 85,
   89, 84,115,                  / * thorn -> 'T', szlig -> 's' * /
               97, 97, 97, 97, 97, 97, 97, / * lowercase * /
   99,101,101,101,101,105,105,105,105,100,
  110,111,111,111,111,111,247,111,117,117,
  117,117,121,116,121
};

/ *
 * Table which translates cp-1251 (ASCII +
 * cyrillic) characters to lowercase
 *
 * Use cl_string_maptable to access.
 * @see cl_string_maptable
 *
unsigned char cp1251_nocase_tab[256] = {
    0,  
    1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
   11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
   21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
   31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
   41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
   51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
   61, 62, 63, 64,
                   97, 98, 99,100,101,102, / * ABCDEF -> abcdef * /
  103,104,105,106,107,108,109,110,111,112, / * GHIJKLMNOP -> ghijklmnop * /
  113,114,115,116,117,118,119,120,121,122, / * QRSTUVWXYZ -> qrstuvwxyz * /
  
   91, 92, 93, 94, 95, 96, 97, 98, 99,100, / * normal * /
  101,102,103,104,105,106,107,108,109,110,
  111,112,113,114,115,116,117,118,119,120,
  121,122,123,124,125,126,127,144,131,130,
  131,132,133,134,135,136,137,154,139,140,
  157,158,159,144,145,146,147,148,149,150,
  151,152,153,154,155,156,157,158,159,160,
  162,162,188,164,180,166,167,184,169,186,
  171,172,173,174,191,176,177,179,179,180,
  181,182,183,184,185,186,187,188,190,190,
  191,224,225,226,227,228,229,230,231,232,
  233,234,235,236,237,238,239,240,241,242,
  243,244,245,246,247,248,249,250,251,252,
  253,254,255,224,225,226,227,228,229,230,
  231,232,233,234,235,236,237,238,239,240,
  241,242,243,244,245,246,247,248,249,250,
  251,252,253,254,255
};

/ * cp-1251 (ASCII + cyrillic) diacritic-stripping is just the identity mapping */


/*
***
unsigned char ascii_nocase_tab[256] = {
    0,  
    1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
   11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
   21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
   31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
   41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
   51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
   61, 62, 63, 64,
                   97, 98, 99,100,101,102, 
  103,104,105,106,107,108,109,110,111,112, 
  113,114,115,116,117,118,119,120,121,122, 
  
   91, 92, 93, 94, 95, 96, 97, 98, 99,100, 
  101,102,103,104,105,106,107,108,109,110,
  111,112,113,114,115,116,117,118,119,120,
  121,122,123,124,125,126,127,144,131,130,
  131,132,133,134,135,136,137,154,139,140,
  157,158,159,144,145,146,147,148,149,150,
  151,152,153,154,155,156,157,158,159,160,
  162,162,188,164,180,166,167,184,169,186,
  171,172,173,174,191,176,177,179,179,180,
  181,182,183,184,185,186,187,188,190,190,
  191,192,193,194,195,196,197,198,199,200,
  201,202,203,204,205,206,207,208,209,210,
  211,212,213,214,215,216,217,218,219,220,
  221,222,223,224,225,226,227,228,229,230,
  231,232,233,234,235,236,237,238,239,240,
  241,242,243,244,245,246,247,248,249,250,
  251,252,253,254,255
};
****/

/*
***
unsigned char binary_nocase_tab[256] = {
    0,  
    1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
   11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
   21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
   31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
   41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
   51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
   61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
   71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
   81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
   91, 92, 93, 94, 95, 96, 97, 98, 99,100,
  101,102,103,104,105,106,107,108,109,110,
  111,112,113,114,115,116,117,118,119,120,
  121,122,123,124,125,126,127,144,131,130,
  131,132,133,134,135,136,137,154,139,140,
  157,158,159,144,145,146,147,148,149,150,
  151,152,153,154,155,156,157,158,159,160,
  162,162,188,164,180,166,167,184,169,186,
  171,172,173,174,191,176,177,179,179,180,
  181,182,183,184,185,186,187,188,190,190,
  191,192,193,194,195,196,197,198,199,200,
  201,202,203,204,205,206,207,208,209,210,
  211,212,213,214,215,216,217,218,219,220,
  221,222,223,224,225,226,227,228,229,230,
  231,232,233,234,235,236,237,238,239,240,
  241,242,243,244,245,246,247,248,249,250,
  251,252,253,254,255
};
****/

/* *
 * Table with identity mapping of latin-1 characters
 * (no flags)
 *
 * Use cl_string_maptable to access.
 * @see cl_string_maptable
 * /
unsigned char latin1_identity_tab[256];
int latin1_identity_tab_init = 0;

/ **
 * Table with mapping for the %cd flag for latin-1
 * (no case, no diacritics).
 *
 * Use cl_string_maptable to access.
 * @see cl_string_maptable
 * /
unsigned char latin1_nocase_nodiac_tab[256];
int latin1_nocase_nodiac_tab_init = 0;*/



/**
 * Fills a mapping table with the identity mapping.
 *
 * After the call, every byte value maps onto itself, i.e.
 * maptable[b] == b for each b in 0..255.
 *
 * @param maptable  Table to fill; must have room for 256 entries.
 */
void
maptable_init_identity(unsigned char *maptable)
{
  unsigned int code = 0;

  do {
    maptable[code] = (unsigned char)code;
  } while (++code < 256);
}

/**
 * Builds the composite "fold case AND fold diacritics" mapping table.
 *
 * Each entry is obtained by applying the diacritic-stripping table first and
 * the case-folding table second. As a sanity check, the two tables are also
 * applied in the opposite order; if the results disagree, a warning is printed
 * (the entry computed in the first order is kept regardless).
 *
 * @param maptable     Table to fill; must have room for 256 entries.
 * @param nocasetable  256-entry case-folding table.
 * @param nodiactable  256-entry diacritic-stripping table.
 */
void
maptable_init_both(unsigned char *maptable,
                   const unsigned char *nocasetable,
                   const unsigned char *nodiactable)
{
  int code = 0;

  while (code < 256) {
    unsigned char reverse_order;

    /* canonical order: strip diacritics, then fold case */
    maptable[code] = nocasetable[nodiactable[code]];

    /* cross-check: fold case, then strip diacritics */
    reverse_order = nodiactable[nocasetable[code]];
    if (reverse_order != maptable[code])
      Rprintf("CL: tables inconsistent for #%d -> #%d\n", code, maptable[code]);

    code++;
  }
}

/**
 * Gets a specified character mapping table for use in regular expressions.
 *
 * Returns pointer to static mapping table for given flags (IGNORE_CASE and
 * IGNORE_DIAC) and character set.
 *
 * Removed from the public API for 3.2.0 because there's no way for it to work
 * if the CorpusCharset is UTF8. Prototype moved to special-chars.h
 *
 * The identity table and the combined case+diacritics table are composite
 * tables: they are generated on first request for a given charset and cached
 * in static storage afterwards.
 *
 * If the function is called with utf8, or with a value that is not a valid
 * table index (unknown_charset or anything out of range), an error message is
 * printed and the ASCII tables are returned instead, so the result is always
 * a usable 256-entry table.
 *
 * NOTE(review): an earlier version of this comment stated that the tables for
 * all charsets other than Latin1 and ASCII are identical to the ASCII tables;
 * confirm against the table definitions before relying on that.
 *
 * @param charset  The character set of this corpus; selects which set of
 *                 tables is used.
 * @param flags    The flags that specify which table is required.
 *                 Can be IGNORE_CASE and/or IGNORE_DIAC.
 * @return         Pointer to the appropriate mapping table. DO NOT FREE this,
 *                 or modify it, it is a CL-internal data blob.
 */
unsigned char *
cl_string_maptable(CorpusCharset charset, int flags)
{
  int icase = (flags & IGNORE_CASE) != 0;
  int idiac = (flags & IGNORE_DIAC) != 0;

  if (charset == utf8) {
    Rprintf("CL: major error, cl_string_maptable called with invalid charset (UTF8).\n"
                    "    Mapping tables for ASCII have been supplied, but this means any \n"
                    "    characters outside the ASCII range will NOT be correct!\n");
    charset = ascii;
  }
  else if ((int)charset < 0 || charset >= unknown_charset) {
    /* the tables are dimensioned [unknown_charset][256], so any other value
     * would index past their end */
    Rprintf("CL: major error, cl_string_maptable called with unrecognised charset.\n"
            "    Mapping tables for ASCII have been supplied instead.\n");
    charset = ascii;
  }

  if (icase && idiac) {
    /* composite table: build lazily on first use */
    if (! nocase_nodiac_tab_init[charset]) {
      maptable_init_both(nocase_nodiac_tab[charset], nocase_tab[charset], nodiac_tab[charset]);
      nocase_nodiac_tab_init[charset] = 1;
    }
    return nocase_nodiac_tab[charset];
  }
  else if (icase) {
    return nocase_tab[charset];
  }
  else if (idiac) {
    return nodiac_tab[charset];
  }
  else {
    /* composite table: build lazily on first use */
    if (! identity_tab_init[charset]) {
      maptable_init_identity(identity_tab[charset]);
      identity_tab_init[charset] = 1;
    }
    return identity_tab[charset];
  }
}

/**
 * Replaces any invalid control characters in a string.
 *
 * "Invalid" control characters are any below 0x20.
 *
 * The string is modified in situ. A typical "replace" to use would be '?'
 * to match the action of cl_string_validate_encoding.
 *
 * When characters are deleted (replace == 0), the remainder of the string is
 * compacted in a single pass, so runs of adjacent control characters are
 * removed completely.
 *
 * @param s             The string to modify.
 * @param charset       The character set of the string (currently unused, see below).
 * @param replace       The replacement character to use. If this is 0, the
 *                      character is deleted rather than replaced.
 * @param zap_tabs      Whether or not tabs should be zapped (boolean).
 * @param zap_newlines  Whether or not \n and \r should be zapped (boolean).
 * @return              The number of characters replaced/deleted in the string.
 */
int
cl_string_zap_controls(char *s, CorpusCharset charset, char replace, int zap_tabs, int zap_newlines)
{
  unsigned char *in  = (unsigned char *)s;  /* read position */
  unsigned char *out = (unsigned char *)s;  /* write position; trails behind when deleting */
  /* number of replacements / deletions made */
  int num = 0;

  /* we don't currently do anything with charset because all CWB character sets are ascii-compatible.
     But the parameter is retained in case of a theoretical future charset that isn't.  */
  (void) charset;

  for ( ; *in ; in++) {
    unsigned char c = *in;
    int zap;

    if (c >= 0x20)
      zap = 0;
    else if (c == 0x09)
      zap = (zap_tabs ? 1 : 0);
    else if (c == 0x0a || c == 0x0d)
      zap = (zap_newlines ? 1 : 0);
    else
      zap = 1;                     /* all other C0 controls are always zappable */

    if (!zap) {
      *out++ = c;
    }
    else {
      num++;
      /* with replace == 0 nothing is written, which deletes the byte; this is safe
       * in UTF-8 as well because the C0 controls are all single-byte there */
      if (replace)
        *out++ = (unsigned char)replace;
    }
  }
  *out = '\0';

  return num;
}



/**
 * Tests whether a byte is a UTF-8 continuation byte.
 *
 * Continuation bytes are exactly those whose two top bits are "10",
 * i.e. the values 0x80 to 0xbf inclusive.
 *
 * @param byte  Byte to check.
 * @return      Boolean. True iff the byte is a continuation byte;
 *              false for a one-byte character or any other byte value.
 */
int
cl_string_utf8_continuation_byte(unsigned char byte)
{
  return (byte & 0xc0) == 0x80;
}

/**
 * Charset-sensitive wrapper around strlen().
 *
 * For utf8, the length is measured in characters (via GLib's g_utf8_strlen);
 * for every other charset it is the length in bytes.
 *
 * @param charset  Character set of the string.
 * @param s        NUL-terminated string to measure.
 * @return         Length of the string as described above.
 */
size_t
cl_charset_strlen(CorpusCharset charset, char *s)
{
  if (charset == utf8)
    return (size_t) g_utf8_strlen((gchar *)s, -1);

  return strlen(s);
}


/**
 * Checks the encoding of a string.
 *
 * This function looks for bad bytes (or byte sequences in the case of UTF8);
 * if any are present, it judges the string invalid.
 *
 * The string can optionally be "repaired" in-place by replacing bad bytes with
 * '?' characters.  If the "repair" is successful, the function returns True.
 *
 * What counts as "bad" is of course relative to the character set that the
 * string is encoded in - so this must be specified.
 *
 * Repairing never changes the length of the string in bytes
 * (in UTF-8, *each* byte in a bad sequence is changed to a '?').
 *
 * @param s        Null-terminated string to check.
 * @param charset  CorpusCharset of the string's encoding.
 * @param repair   if True, replace invalid bytes by '?'
 * @return         Boolean: true for valid (or successfully repaired),
 *                 false for invalid. An unrecognised charset always yields
 *                 false (and an error message).
 */
int
cl_string_validate_encoding(char *s, CorpusCharset charset, int repair)
{
  /* cast as unsigned string to allow hex comparisons (but pass signed version to Glib for UTF8) */
  unsigned char *str = (unsigned char *)s;

  switch (charset) {
  case utf8:
    {
      char *cur = s;       /* start of the part of the string not yet known to be valid */
      const gchar *bad;    /* Glib out parameter: first byte that failed validation */

      while (!g_utf8_validate((const gchar *)cur, -1, &bad)) {
        if (!repair)
          return 0;
        /* Everything before the bad byte has just been validated, so overwrite the
         * bad byte and resume right after it instead of re-scanning from the start.
         * Any orphaned continuation bytes that follow are caught (and overwritten)
         * by the subsequent iterations. */
        cur += bad - cur;
        *cur++ = '?';
      }
    }
    break;

  /* all the others are of the same pattern:
     check each character in string, if in illegal zone, return false or overwrite with '?';
     we short-circuit the tests where possible. */
  case ascii:
    for (; *str ; str++) {
      if ( *str > 0x7f ) {
        if (repair)
          *str = '?';
        else
          return 0;
      }
    }
    break;

  /* character sets where anything is OK except 0x80 to 0x9f, like Latin1 */
  case latin1:
  case latin2:
  case latin4:
  case latin5:
  case latin6:
  case latin7:
  case latin8:
  case latin9:
  case cyrillic:
    for (; *str ; str++) {
      if ( (*str > 0x7f && *str < 0xa0)) {
        if (repair)
          *str = '?';
        else
          return 0;
      }
    }
    break;

  /* latin3 has extra non-allowed characters */
  case latin3:
    for (; *str ; str++) {
      if ( *str > 0x7f
              && (   *str <  0xa0
                  || *str == 0xa5
                  || *str == 0xae
                  || *str == 0xbe
                  || *str == 0xc3
                  || *str == 0xd0
                  || *str == 0xe3
                  || *str == 0xf0
                 )
         ) {
        if (repair)
          *str = '?';
        else
          return 0;
      }
    }
    break;

  /* so does Greek! */
  case greek:
    for (; *str ; str++) {
      if ( *str > 0x7f
              && (   *str <  0xa0
                  || *str == 0xae
                  || *str == 0xd2
                  || *str == 0xff
                 )
         ) {
        if (repair)
          *str = '?';
        else
          return 0;
      }
    }
    break;

  /* hebrew has a few more complexities */
  case hebrew:
    for (; *str ; str++) {
      if (*str > 0x7f
              && (    *str <  0xa0
                  ||  *str == 0xa1
                  || (*str >= 0xbf && *str <= 0xde)
                  ||  *str == 0xfb
                  ||  *str == 0xfc
                  ||  *str == 0xff
                 )
         ) {
        if (repair)
          *str = '?';
        else
          return 0;
      }
    }
    break;

  /* arabic has great swathes of non-allowed characters */
  /* (this may not be the most efficient set of tests)  */
  /* Defined code points in the upper half of ISO-8859-6 are:
   * 0xa0, 0xa4, 0xac (Arabic comma), 0xad, 0xbb, 0xbf, 0xc1-0xda and 0xe0-0xf2;
   * everything else is rejected below. */
  case arabic:
    for (; *str ; str++) {
      if (*str >  0x7f
              && (    *str <  0xa0
                  || (*str >= 0xa1 && *str <= 0xa3)
                  || (*str >= 0xa5 && *str <= 0xab)
                  || (*str >= 0xae && *str <= 0xba)
                  || (*str >= 0xbc && *str <= 0xbe)
                  ||  *str == 0xc0
                  || (*str >= 0xdb && *str <= 0xdf)
                  ||  *str >= 0xf3
                 )
         ) {
        if (repair)
          *str = '?';
        else
          return 0;
      }
    }
    break;

  default: /* unknown_charset, etc. */
    Rprintf("CL: Error, unrecognised CorpusCharset in cl_string_validate_encoding.\n");
    return 0;

  } /* end switch */

  /* if we've made it to here, the string is OK; return true. */
  return 1;
}

/**
 * Produces a newly allocated, reversed copy of a string.
 *
 * For utf8 the reversal is done by GLib's g_utf8_strreverse(), which
 * allocates the result itself; for all other charsets the string is
 * duplicated with cl_strdup() and the copy is reversed in place with
 * g_strreverse(). Either way the input string is left untouched.
 *
 * @param s        String to reverse.
 * @param charset  The character set of the string.
 * @return         Pointer to the new (reversed) string.
 */
char *
cl_string_reverse(const char *s, CorpusCharset charset)
{
  char *copy;

  if (charset == utf8)
    return (char *)g_utf8_strreverse((gchar *)s, -1);

  /* 8-bit charsets: duplicate, then reverse the duplicate in place */
  copy = cl_strdup((char *)s);
  g_strreverse((gchar *)copy);
  return copy;
}

/**
 * Removes all trailing CR and LF characters from specified string (in-place).
 *
 * The main purpose of this function is to remove trailing line breaks from input
 * lines regardless of whether a text file is in Unix (LF) or Windows (CR-LF) format.
 * All text input except for simple numeric data should be passed through cl_string_chomp().
 *
 * An empty string, or a string consisting only of CR / LF characters, ends up
 * as the empty string. Line breaks that are not at the end of the string are
 * left alone.
 *
 * @param s     String to chomp (modified in-place).
 */
void
cl_string_chomp(char *s) {
  size_t len = strlen(s);

  /* Work with a length rather than a pointer that walks backwards: this never
   * forms an address before the start of the string, even for empty input. */
  while (len > 0 && (s[len - 1] == '\r' || s[len - 1] == '\n'))
    s[--len] = '\0';
}

/*
 * Helper for cl_string_qsort_compare(): copies src into dest, which has room for
 * dest_size bytes. Over-long input is truncated rather than overflowing dest;
 * for UTF-8 the cut is moved back so that it does not split a multi-byte character.
 */
static void
cl_qsort_compare_fill_buffer(char *dest, const char *src, size_t dest_size, int is_utf8)
{
  size_t len = strlen(src);

  if (len >= dest_size) {
    len = dest_size - 1;
    if (is_utf8)
      /* src[len] would be the first byte dropped: it must not be a continuation byte */
      while (len > 0 && ((unsigned char)src[len] & 0xc0) == 0x80)
        len--;
  }
  memcpy(dest, src, len);
  dest[len] = '\0';
}

/**
 * Compares two strings in a qsort-style
 *
 * This function is designed to be suitable for use as a callback
 * with qsort(). As such, its return values are negative if s1 is "less than"
 * s2; zero if the two strings are the same; and positive if s1 is "greater
 * than" s2. But of course you can also use it on its own.
 *
 * You cannot use it directly with qsort as its parameters are wrong. It
 * needs to be wrapped in another function that (at least) provides the
 * charset, flags and reverse arguments (e.g. from global variables or by
 * calling other functions).
 *
 * The two strings must be in the same character set. Both will be made
 * canonical in accordance with the flags argument if it is set. Also, the
 * comparison can be done on reverse-order strings.
 *
 * Note that if either flags or reverse is non-zero, the strings are copied
 * into a pair of one-time-allocated static buffers of CL_MAX_LINE_LENGTH * 2
 * bytes each (never freed before the program exits). Strings that do not fit
 * are truncated to the buffer size before being compared. Because of these
 * static buffers, the function is not reentrant.
 *
 * If charset == utf8 and strings are passed in from external sources, the
 * flag REQUIRE_NFC should always be specified to obtain consistent results.
 *
 * The comparison itself is a plain byte-wise strcmp() for all character sets
 * (including UTF-8), in order to guarantee a consistent and well-defined sort
 * order that does not depend on the current locale.
 *
 * @param s1       First string to compare.
 * @param s2       Second string to compare.
 * @param charset  Character set of the two strings.
 * @param flags    IGNORE_CASE, IGNORE_DIAC, REQUIRE_NFC
 * @param reverse  Boolean: if true, strings are compared from end to beginning,
 *                 rather than beginning to end.
 * @return         0 if the strings are the same. 1 if s1 is greater.
 *                 -1 if s2 is greater.
 */
int
cl_string_qsort_compare(const char *s1,
                        const char *s2,
                        CorpusCharset charset,
                        int flags,
                        int reverse)
{
  static char *buffer1;
  static char *buffer2;
  static int   buffers_allocated = 0;

  const char *comp1;
  const char *comp2;
  int cmp;

  /* preparatory string manipulation... */
  if (!flags && !reverse) {
    comp1 = s1;
    comp2 = s2;
  }
  else {

    /* allocate the static buffers once and for all */
    if (! buffers_allocated) {
      /* a normalised string cannot possibly be longer than (CL_MAX_LINE_LENGTH * 2) */
      buffer1 = (char *) cl_malloc(CL_MAX_LINE_LENGTH * 2);
      buffer2 = (char *) cl_malloc(CL_MAX_LINE_LENGTH * 2);
      buffers_allocated = 1;
      /* alternative would be to allocate 2 * strlen(s1 or s2), and reallocate
       * whenever more is needed */
      /* note also that this memory will NEVER be freed before the program exits.*/
    }

    /* bounded copy: never write past the end of the static buffers */
    cl_qsort_compare_fill_buffer(buffer1, s1, CL_MAX_LINE_LENGTH * 2, charset == utf8);
    cl_qsort_compare_fill_buffer(buffer2, s2, CL_MAX_LINE_LENGTH * 2, charset == utf8);

    /* canonicalise BEFORE reversing (may not work as expected after reversing utf8) */
    if (flags) {
      cl_string_canonical(buffer1, charset, flags, CL_MAX_LINE_LENGTH * 2);
      cl_string_canonical(buffer2, charset, flags, CL_MAX_LINE_LENGTH * 2);
    }
    if (reverse) {
      /* cl_string_reverse() unnecessarily allocates memory for non-UTF8 to provide a consistent API,
       * so we call the GLib functions directly here
       */
      if (charset == utf8) {
        gchar *temp;
        /* the reversed string has the same length in bytes, so it fits back into the buffer;
         * the temporary copy is owned by GLib and must be released with g_free() */
        temp = g_utf8_strreverse(buffer1, -1);
        strcpy(buffer1, temp);
        g_free(temp);
        temp = g_utf8_strreverse(buffer2, -1);
        strcpy(buffer2, temp);
        g_free(temp);
      }
      else {
        g_strreverse(buffer1);
        g_strreverse(buffer2);
      }
    }
    /* in either case, we compare the buffers, not the orig string */
    comp1 = buffer1;
    comp2 = buffer2;
  }
  /* at this point, straight comparison is all we need */

  /* Versions of CWB prior to 3.4.10 used an explicit unsigned char string comparison for 8-bit
   * character sets (which should be identical to strcmp() on all known platforms) and g_utf8_collate
   * for UTF-8 (which is locale-dependent and hence unpredictable).
   * Since the most important goal is to ensure a consistent and well-defined sort order, we now
   * use plain strcmp() in both cases. While the C standard fails to specify the precise behaviour,
   * all known platforms use unsigned characters for the comparison.
   */
  cmp = strcmp(comp1, comp2);

  /* strcmp() only guarantees the sign of its result; reduce it to the documented -1 / 0 / 1 */
  return (cmp > 0) - (cmp < 0);
}

/**
 * Checks a string to see if it is a valid CWB identifier.
 *
 * The rules for these are as follows (see also the CQP lexer):
 *
 * * all characters must be ASCII, ie less than 0x80;
 * * must be at least 1 character long (of course)
 * * first character must be an uppercase or lowercase letter or underscore
 * * second and subsequent characters may also be digits, hyphen or fullstop.
 * * mixed case is allowed (just-upper and just-lower is imposed elsewhere,
 *   where necessary).
 *
 * TODO: should the CL registry lexer be amended to reflect these restrictions?
 * (ID there is rather laxer than this)
 *
 * @param s   The string to check (NULL is accepted and counts as invalid).
 * @return    A boolean. True if the string is a valid ID. Otherwise false.
 */
int
cl_id_validate(char *s)
{
  const char *p;

  if (s == NULL)
    return 0;

  for (p = s; *p != '\0'; p++) {
    const char c = *p;

    /* letters and underscore are fine in any position */
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_')
      continue;

    /* digits, fullstop and hyphen are fine anywhere except the first position */
    if (p != s && ((c >= '0' && c <= '9') || c == '.' || c == '-'))
      continue;

    return 0;
  }

  /* every character passed; the only remaining way to fail is an empty string */
  return p != s;
}

/**
 * Converts a lowercase corpus name to an equivalent uppercase form.
 *
 * The string is modified in situ. Only the ASCII letters a-z are changed;
 * every other byte is left exactly as it is.
 *
 * Note, this function doesn't check for what is and is not an allowed
 * CWB-corpus-name character.
 *
 * The old version of this code was a line in cwb-encode that used
 * the library toupper to cope with Latin1 characters. But these are no
 * longer allowed in identifiers, which must be ASCII only.
 *
 * @param s  The string to convert (modified in place).
 */
void
cl_id_toupper(char *s)
{
  char *p;

  for (p = s; *p != '\0'; p++)
    if ('a' <= *p && *p <= 'z')
      *p -= 0x20;   /* distance between the lower- and uppercase ASCII letters */
}

/**
 * Converts an uppercase corpus name to an equivalent lowercase form.
 *
 * The string is modified in situ. Only the ASCII letters A-Z are changed;
 * every other byte is left exactly as it is.
 *
 * Note, this function doesn't check for what is and is not an allowed
 * CWB-corpus-name character.
 *
 * @param s  The NUL-terminated string to convert (must not be NULL).
 */
void
cl_id_tolower(char * s)
{
  char *cursor;

  for (cursor = s; *cursor != '\0'; cursor++) {
    if (*cursor < 'A' || *cursor > 'Z')
      continue;
    /* 0x20 is the distance between the upper- and lowercase ASCII letters */
    *cursor += 0x20;
  }
}



/**
 * Converts a string to canonical form.
 *
 * The "canonical form" of a string is for use in comparisons where
 * case-insensitivity and/or diacritic insensitivity is desired.
 *
 * This function has two behaviours: inplace modification, or copy modification.
 *
 * INPLACE MODIFICATION:  the string s is modified in place, up to a maximum size of
 * inplace_bufsize-1 characters (plus NUL terminator). If the normalised string
 * doesn't fit into the buffer, the extra characters are dropped silently.
 * Ergo, for things to definitely work correctly, the buffer should have enough
 * memory to cope with any expansions made in Unicode case folding. Ideally,
 * allocate double the length of the string (since case-folding doesn't include
 * any one -> more-than-two mappings so far as we know). To use the function in this
 * mode, pass in the amount of memory available at s as the last argument.
 * In this mode, the return value is always equal to s.
 *
 * COPY MODIFICATION: the string s is not modified; instead, a newly-allocated copy is created
 * and modified, and a pointer to the copy is returned.
 * It is then the caller's responsibility to free this memory.
 * To use the function in this mode, pass a value less than 1 as the last
 * argument (or the aide memoire constant CL_STRING_CANONICAL_STRDUP).
 *
 * FLAGS: IGNORE_DIAC and/or IGNORE_CASE, for diacritic/case folding respectively.
 * In UTF8, an additional flag REQUIRE_NFC can be passed to normalize the
 * string into the canonical pre-composed form (NFC) used internally by CWB.
 * All strings that are going to be inserted into or searched for within an
 * indexed corpus should be processed in this way.
 *
 * API HISTORY: the arguments of this function were changed in v3.2.1. Now,
 * a CorpusCharset is needed. This is because string canonicalising works
 * differently in UTF8, where case folding / accent folding is done by calling
 * Unicode-aware functions. By contrast, the process for 8-bit charsets just uses a
 * straightforward mapping table for both sorts of folding.
 *
 * The arguments were changed in 3.4.12 to add a fourth argument, inplace_bufsize:
 * this prevents buffer overflow by limiting the amount of inplace-overwriting
 * that can be done.
 *
 * @see CL_STRING_CANONICAL_STRDUP
 *
 * @param s                The string.
 * @param charset          The character set in which the string is encoded.
 *                         If this is utf8, complex accent and/or case folding will be done,
 *                         as per the Unicode standard.
 *                         If it is anything else, internal byte mapping tables will be used.
 * @param flags            The flags that specify which conversions are required.
 *                         Can be IGNORE_CASE | IGNORE_DIAC | REQUIRE_NFC .
 * @param inplace_bufsize  Size of the buffer. If > 0, string s will be modified in place,
 *                         avoiding buffer overruns. If 0 or less, s will be left as is,
 *                         and a copy created, modified, and returned.
 *                         The constant CL_STRING_CANONICAL_STRDUP is provided to make this
 *                         more readable when calling (it's -1).
 * @return                 The canonical string. If inplace modification was used,
 *                         this will be the same as s. If not, it will be a newly allocated
 *                         string. If there is an error of any kind, the return will be
 *                         inplace/new as expected, but will not contain the requested
 *                         modifications (or will contain only some of them).
 */
char *
cl_string_canonical(char *s, CorpusCharset charset, int flags, int inplace_bufsize)
{
  int icase = (flags & IGNORE_CASE) != 0;
  int idiac = (flags & IGNORE_DIAC) != 0;
  int nfc   = (flags & REQUIRE_NFC) != 0;

  /* this function has two branches controlled by an if: (a) utf8, (b) everything else. */
  if (charset == utf8) {

    /* pointers for UTF8 processing */
    gchar *string = NULL;
    gchar *new_string = NULL;
    gchar *current_char;
    gchar *next_char_begins;
    gchar *write_pos;          /* where the next retained character is stored during accent stripping */
    size_t char_bytes;         /* size in bytes of the character currently being retained */

    /* GLib documentation insists that g_utf8_* functions must only be used on valid UTF-8 strings;
     * let's assume that a string passed without REQUIRE_NFC is from an internal source and hence safe */
    if (nfc && !g_utf8_validate((gchar *)s, -1, NULL)) {
      Rprintf("CL: major error, invalid UTF8 string passed to cl_string_canonical ...\n");
      return ( 0 < inplace_bufsize ? s : cl_strdup(s) );
    }

    /* UTF8 accent folding */
    if (idiac) {
      /* convert to decomposed normal form, then strip all combining characters */
      if (NULL == (string = g_utf8_normalize((gchar *)s, -1, G_NORMALIZE_NFD)) ) {
        Rprintf("CL: major error, cannot decompose string: invalid UTF8 string passed to cl_string_canonical...\n");
        return ( 0 < inplace_bufsize ? s : cl_strdup(s) );
      }

      /* Compact the decomposed string in a single pass: every non-mark character is moved
       * down to write_pos, every mark character is skipped. Writes never overtake reads
       * (write_pos <= current_char), so the part still to be read is always intact.
       * The previous implementation downcopied the whole tail with cl_strcpy() for each mark,
       * which was quadratic and silently cut the tail off at CL_MAX_LINE_LENGTH bytes. */
      write_pos = string;
      for (current_char = string; *current_char != '\0'; current_char = next_char_begins) {
        next_char_begins = g_utf8_next_char(current_char);
        if (!g_unichar_ismark(g_utf8_get_char(current_char))) {
          char_bytes = (size_t)(next_char_begins - current_char);
          if (write_pos != current_char)
            memmove(write_pos, current_char, char_bytes);
          write_pos += char_bytes;
        }
      }
      *write_pos = '\0';
    }
    /* end of accent folding */
    else
      string = (gchar *)s;

    /* UTF8 pre-composed normal form (always needed after accent folding) */
    if (nfc || idiac) {
      new_string = g_utf8_normalize(string, -1, G_NORMALIZE_NFC);
      if (string != s)
        cl_free(string); /* free temporary string allocated by accent folding above */
      string = new_string;
      if (string == NULL) {
        Rprintf("CL: major error, cannot compose string: invalid UTF8 string passed to cl_string_canonical...\n");
        return ( 0 < inplace_bufsize ? s : cl_strdup(s) );
      }
    }

    /* UTF8 case folding */
    if (icase) {
      new_string = g_utf8_casefold(string, -1);
      if (string != s)
        cl_free(string); /* free temporary string allocated by one of the steps above */
      string = new_string;
    }

    if (string != s) {
      if (0 >= inplace_bufsize)
        return string; /* changes made: return already-allocated string */
      else {
        /* changes made : copy string back into input argument */
        int len = strlen(string);

        /* limit the number of bytes returned to avoid buffer overflow.
         * As this is UTF-8, we rewind the string to avoid partial chars. */
        if (inplace_bufsize <= len) {
          if (cl_string_utf8_continuation_byte(string[inplace_bufsize-1])){
            /* the cut would fall inside a multi-byte character: cut at the start of that character instead */
            gchar *end = g_utf8_find_prev_char(string, string+inplace_bufsize-1);
            end = (NULL == end ? string : end);
            *end = '\0';
          }
          else
            string[inplace_bufsize-1] = '\0';
        }

        strcpy(s, string);
        cl_free(string);
        return s;
      }
    }
    else /* no changes made */
      return ( 0 < inplace_bufsize ? s : cl_strdup(s) ); /* no changes made: return original string or clone */

  }
  /* end chunk dealing with UTF8 normalisation */

  else {

    /* variables for non-UTF8 normalisation */
    register unsigned char *p, *maptable;
    char *duplicate;

    if (icase || idiac) { /* don't waste time if no relevant flags are specified */
      /* this function should in theory never be called with unknown_charset,
       * but if it is, treat it as ascii for current purposes. */
      if (charset == unknown_charset)
        charset = ascii;

      maptable = cl_string_maptable(charset, flags);

      if (0 < inplace_bufsize) {
        /* modify in place : cannot overflow, because all changes are one-for-one */
        for (p = (unsigned char *)s; *p; p++)
          *p = maptable[*p];
        return s;
      }
      else {
        /* modify and return a duplicate */
        duplicate = cl_strdup(s);
        for (p = (unsigned char *)duplicate; *p; p++)
          *p = maptable[*p];

        return duplicate;
      }
    }
    else
      /* return the unmodified string, or a duplicate, depending on inplace_bufsize. */
      return ( 0 < inplace_bufsize ? s : cl_strdup(s) );

  }
  /* end else for non-utf8 encodings */

  /* NOTREACHED */
  assert(0 && "Not reached");
  return NULL;
}


/**
 * Checks whether a character is alphanumeric in the given ISO-8859 character set.
 *
 * This function is exported but NOT via cl.h - it is only for the use of CWB utilities.
 * It is not part of the standard API.
 *
 * For utf8 no table lookup is attempted and the answer is always false; for every
 * other charset the answer is read from checktable_is_alphanum.
 *
 * @param c        The character (byte) to check.
 * @param charset  The character set to check against.
 * @return         Boolean.
 */
int
cl_iso_char_is_alphanumeric(unsigned char c, CorpusCharset charset)
{
  int is_alnum = 0;

  if (charset != utf8)
    is_alnum = (int) checktable_is_alphanum[charset][c];

  return is_alnum;

#if 0
  /* previous version of the function when it was part of cwb-scan-corpus */
  return
    ( (c >= 'A' && c <= 'Z') ||
      (c >= 'a' && c <= 'z') ||
      (c >= 0xC0 && c <= 0xFF ));
/*      (c >= '\xfffd' && c <= '\xfffd') );
 * this line became corrupt when imported into the SVN due to non-ASCII characters (see above)
 * I replaced it with the hex values for capital-a-grave to little-y-diaresis,
 * having looked them up in earlier version of the code... -- AH 1/7/09
 * Which means that the condition could never be satisfied on a platform where char is signed
 * (because the non-ASCII characters have negative codes in this case); I've explicitly made the
 * function argument an "unsigned char" now, which should fix the problem. -- SE 18/08/09
 * (NB: there's still a harmless warning that "comparison is always true" for "c <= 0xFF")
 */
#endif
}

/**
 * Standardises subdirectory-dividers in a string that represents a path, in an
 * OS-sensitive way.
 *
 * Every forward slash and every backslash in the path is overwritten with
 * SUBDIR_SEPARATOR. So, if the CL was compiled for Unix, backslash is changed to
 * forwardslash; if the CL was compiled for Windows, forwardslash is changed to backslash.
 *
 * Note that the path is modified in place.
 *
 * @param path     The path to modify (must be Ascii-compatible)
 */
void
cl_path_adjust_os(char *path)
{
  char *cursor = path;

  while (*cursor != '\0') {
    switch (*cursor) {
    case '/':
    case '\\':
      *cursor = SUBDIR_SEPARATOR;
      break;
    default:
      break;
    }
    cursor++;
  }
}

/**
 * Standardises subdirectory-dividers in a string that represents a path
 * into Unix-like form (ie with forward-slash), regardless of what OS
 * we are in.
 *
 * Concretely, every occurrence of SUBDIR_SEPARATOR is replaced by '/'.
 * Or, to put it another way, backslashes are changed into forward slashes
 * under Windows.
 *
 * This may be useful because of the need to move corpora between systems
 * - in which case, the paths need to be in '/' format -- Windows tolerates
 * forward slashes in paths a hell of a lot better than *nix tolerates
 * unescaped backslashes!
 *
 * Note that the path is modified in place.
 *
 * @param path     The path to modify (must be Ascii-compatible)
 */
void
cl_path_adjust_independent(char *path)
{
  char *hit = path;

  /* jump from one separator to the next, overwriting each one on the way */
  while (NULL != (hit = strchr(hit, SUBDIR_SEPARATOR)))
    *hit++ = '/';
}

/**
 * Add quotes and escape slashes to a file path if necessary.
 *
 * This is for the HOME and INFO fields of the registry file.
 *
 * If either field contains any characters that can't be
 * treated as an "ID" token by the registry parser, then we
 * make sure it is treated as a string (quoted) instead, and make
 * all appropriate substitutions.
 *
 * A path is left unquoted only if it consists entirely of ASCII letters,
 * digits, '-', '_' and '/', plus '.' and backslash anywhere except the
 * first position. Otherwise it is wrapped in double quotes, and every
 * double quote and backslash inside it is preceded by a backslash.
 *
 * For consistency, this function always returns a newly
 * allocated string, regardless of whether changes have been made.
 *
 * Note that the way the registry parser works, it is quite happy
 * with either "C:\dir\subdir" or "C:\\dir\\subdir" as a path for
 * HOME or INFO.
 *
 * @param path  String containing the path to quotify.
 * @return      The quotified string (newly allocated).
 */
char *
cl_path_registry_quote(char *path)
{
  const char *src;
  char *quoted, *dst;
  size_t n_escapes = 0;   /* double quotes and backslashes, each of which needs an extra backslash */
  int plain_id = 1;       /* stays true while every character seen is acceptable without quotes */

  for (src = path; *src != '\0'; src++) {
    char ch = *src;
    int ok_anywhere =
         (ch >= 'A' && ch <= 'Z')
      || (ch >= 'a' && ch <= 'z')
      || (ch >= '0' && ch <= '9')
      || ch == '-' || ch == '_' || ch == '/';
    int ok_after_first = (src > path) && (ch == '.' || ch == '\\');

    if (ch == '"' || ch == '\\')
      n_escapes++;
    if (!ok_anywhere && !ok_after_first)
      plain_id = 0;
  }

  if (plain_id)
    return cl_strdup(path);

  /* room for the path, one backslash per escaped character, two quotes and the terminator */
  quoted = (char *) cl_malloc(strlen(path) + n_escapes + 3);
  dst = quoted;

  *dst++ = '"';
  for (src = path; *src != '\0'; src++) {
    if (*src == '"' || *src == '\\')
      *dst++ = '\\';
    *dst++ = *src;
  }
  *dst++ = '"';
  *dst = '\0';

  return quoted;
}


/**
 * Tokenises a string into components split by PATH_SEPARATOR
 * (':', or ';' under Win32).
 *
 * The string is modified in place: the separator that ends a token is
 * overwritten with a NUL. The position to resume from is kept in a static
 * variable between calls, so only one tokenisation can be in progress at a time.
 * Runs of separators are skipped, so empty components are never returned.
 *
 * @param s  The string to tokenise; or, NULL if tokenisation has already been initialised.
 * @return   The next token from the string, or NULL when there are no more tokens.
 * @see      PATH_SEPARATOR
 */
char *
cl_path_get_component(char *s)
{
  static char *resume_at;   /* where a follow-up call (s == NULL) continues; NULL when exhausted */
  char *token;

  if (NULL == s) {
    s = resume_at;
    if (NULL == s)
      return NULL;
  }

  /* step over any separators in front of the token */
  while (*s == PATH_SEPARATOR)
    s++;

  if (*s == '\0') {
    /* nothing but separators left */
    resume_at = NULL;
    return NULL;
  }

  token = s;

  /* find the end of the token */
  while (*s != '\0' && *s != PATH_SEPARATOR)
    s++;

  if (*s == '\0')
    resume_at = NULL;       /* token runs to the end of the string */
  else {
    *s = '\0';              /* cut the token off at its separator ... */
    resume_at = s + 1;      /* ... and continue just behind it next time */
  }

  return token;
}

/**
 * Boolean switch enabling/disabling latex-style escapes.
 *
 * By default, it is false; if programs wish to allow these
 * escapes they need to offer some means of changing this
 * variable.
 *
 * While it is false, cl_string_latex2iso() copies its input to its
 * output without interpreting any escapes.
 *
 * Note that enabling this variable may cause scrambling of
 * the string for LatinX strings where X is not 1; and may cause
 * undefined errors for UTF8 strings. In short, you should
 * only activate it when you are working with a corpus whose
 * charset is Latin1.
 *
 * @see  CorpusCharset
 * @see  cl_string_latex2iso
 */
int cl_allow_latex2iso = 0;


/**
 * Looks up the ISO-8859-1 code for a latex-style accent escape.
 *
 * Private helper of cl_string_latex2iso().
 *
 * @param accent  The character that followed the backslash.
 * @param letter  The character that followed the accent character
 *                (pass NUL just to test whether accent is a known accent character).
 * @return        -1 if accent is not one of the accent characters " ' ` ^ , ~ ;
 *                0 if it is, but there is no mapping for this letter;
 *                otherwise the ISO-8859-1 code of the accented letter.
 */
static int
latex2iso_accent_code(char accent, char letter)
{
  static const struct {
    char accent;               /* character following the backslash */
    const char *letters;       /* letters this accent can be combined with */
    unsigned char codes[11];   /* Latin-1 code for each entry of letters, same order */
  } table[] = {
    /* diaresis / umlaut (plus sharp s) */
    { '"',  "AEIOUaeious", { 0xC4, 0xCB, 0xCF, 0xD6, 0xDC, 0xE4, 0xEB, 0xEF, 0xF6, 0xFC, 0xDF } },
    /* accent aigu */
    { '\'', "AEIOUaeiou",  { 0xC1, 0xC9, 0xCD, 0xD3, 0xDA, 0xE1, 0xE9, 0xED, 0xF3, 0xFA } },
    /* accent grave */
    { '`',  "AEIOUaeiou",  { 0xC0, 0xC8, 0xCC, 0xD2, 0xD9, 0xE0, 0xE8, 0xEC, 0xF2, 0xF9 } },
    /* accent circonflex */
    { '^',  "AEIOUaeiou",  { 0xC2, 0xCA, 0xCE, 0xD4, 0xDB, 0xE2, 0xEA, 0xEE, 0xF4, 0xFB } },
    /* cedille */
    { ',',  "Cc",          { 0xC7, 0xE7 } },
    /* tilde */
    { '~',  "Nn",          { 0xD1, 0xF1 } }
  };
  size_t row;
  const char *hit;

  for (row = 0; row < sizeof(table) / sizeof(table[0]); row++) {
    if (table[row].accent != accent)
      continue;
    if (letter == '\0')      /* strchr() would match the terminator of letters */
      return 0;
    hit = strchr(table[row].letters, letter);
    return (hit != NULL) ? (int) table[row].codes[hit - table[row].letters] : 0;
  }
  return -1;
}

/**
 * Converts ASCII strings with latex-style backslash escapes
 * for accented characters to ISO-8859-1 (Latin-1).
 *
 * Syntax:
 *
 * \"[AaOoUus..] --> corresponding ISO 8859-1 character
 *                   (likewise for the accent characters ' ` ^ , ~)
 *
 * \ddd          --> ISO 8859-1 character; ddd are three digits read as an
 *                   octal number (each digit is taken modulo 8, the result
 *                   modulo 256)
 *
 * An accent character followed by a letter it cannot be combined with is
 * copied together with that letter, but without the backslash. A backslash
 * followed by anything else is copied unchanged together with the following
 * character. An escape that is cut off by the end of the input is copied as
 * far as it goes; the function never reads beyond the terminating NUL of str.
 *
 * Note that if cl_allow_latex2iso is FALSE, this function will
 * simply copy the input to the output. So it is always safe to
 * call this function.
 *
 * @see               cl_allow_latex2iso
 * @param str         The string to convert.
 * @param result      The location to put the altered string (which
 *                    should be shorter, or at least no longer than,
 *                    the input string). If this parameter is NULL,
 *                    space is automatically allocated for the output.
 *                    result is allowed to be the same as str.
 * @param target_len  The maximum length of the target string, not counting
 *                    the terminating NUL (which is written in addition). If
 *                    result is NULL, then this is deduced automatically.
 *                    It is not consulted when cl_allow_latex2iso is FALSE.
 * @return            Pointer to the altered string (if result was NULL
 *                    you need to catch this and free it when no longer
 *                    needed).
 */
char *
cl_string_latex2iso(char *str, char *result, int target_len)
{
  /* the positions in the source and target strings */
  int src_pos = 0;
  int target_pos = 0;
  int i;

  char c;
  char accent;
  int val;
  int code;

  /* do not allow latex-style escapes unless they are switched on in the global variable */
  if (! cl_allow_latex2iso) {
    if (result) {
      if (result != str)
        strcpy(result, str);
    }
    else
      result = cl_strdup(str);
    return result;
  }

/** @see cl_string_latex2iso */
#define popc(s,p) ((s)[(p)++])
/** @see cl_string_latex2iso */
#define pushc(s,c,p,m) do { (s)[(p)++] = (c); if ((p) >= (m)) goto endloop; } while (0)

  if (result == NULL) {
    /* auto-allocate <result> if necessary; should be shorter than input string */
    target_len = strlen(str);
    result = (char *) cl_malloc(target_len + 1);
  }

  c = popc(str, src_pos);
  while ((c != '\0') && (target_pos < target_len)) {

    if (c != '\\') {
      /* ordinary character: copy it */
      pushc(result, c, target_pos, target_len);
      c = popc(str, src_pos);
      continue;
    }

    /* we found a backslash: get the next character */
    c = popc(str, src_pos);

    if (c == '\0') {
      /* a lone backslash at the very end of the input: keep it and stop here,
       * because popping another character would read beyond the terminator */
      pushc(result, '\\', target_pos, target_len);
      break;
    }

    /* <ctype.h> functions need an unsigned char value: plain char may be negative */
    if (   isdigit((unsigned char) c)
        && isdigit((unsigned char) str[src_pos])
        && isdigit((unsigned char) str[src_pos+1])) {
      /* three digits: numeric (octal) escape */
      val = 0;
      for (i = 0; i < 3; i++) {
        val = val * 8 + ((c - '0') % 8);
        c = popc(str, src_pos);
      }
      /* c now holds the character behind the three digits */
      pushc(result, (char) (val % 256), target_pos, target_len);
    }
    else if (latex2iso_accent_code(c, '\0') >= 0) {
      /* c is one of the accent characters: the next character is the letter to accent */
      accent = c;
      c = popc(str, src_pos);
      code = latex2iso_accent_code(accent, c);
      if (code > 0) {
        pushc(result, code, target_pos, target_len);
      }
      else {
        /* no such accented letter: copy both (the backslash is dropped) */
        pushc(result, accent, target_pos, target_len);
        if (c == '\0')
          break;    /* escape cut off by the end of the input */
        pushc(result, c, target_pos, target_len);
      }
      c = popc(str, src_pos);
    }
    else /* not an escape we know: copy both */ {
      pushc(result, '\\', target_pos, target_len);
      pushc(result, c, target_pos, target_len);
      c = popc(str, src_pos);
    }
  }

endloop:
  result[target_pos] = '\0';

  return result;
}


/**
 * Decode XML entities in a string.
 *
 * This function decodes the pre-defined XML entities in string s.
 * It overwrites the input string s and also returns s for convenience.
 *
 * (The entities are &amp;lt; &amp;gt; &amp;amp; &amp;quot; &amp;apos;).
 *
 * An ampersand that does not start one of these entities is copied unchanged.
 *
 * TODO -- numeric entities?
 *
 * If passed NULL, it will not fall over - it will just pass NULL back!
 *
 * This function is safe for strings in any encoding. The returned string
 * will be at the same memory location and will always be the same length
 * or shorter after the decoding of entities.
 *
 * @param s  A string to decode.
 * @return   The string (rewritten in situ).
 */
char *
cl_xml_entity_decode(char *s)
{
  static const struct {
    const char *text;        /* the entity as it appears in the input */
    size_t      length;      /* number of bytes in text */
    char        replacement; /* the character it stands for */
  } entities[] = {
    { "&lt;",   4, '<'  },
    { "&gt;",   4, '>'  },
    { "&amp;",  5, '&'  },
    { "&quot;", 6, '"'  },
    { "&apos;", 6, '\'' }
  };
  const size_t n_entities = sizeof(entities) / sizeof(entities[0]);
  char *in, *out;
  size_t k;

  if (NULL == s)
    return s;

  /* out never overtakes in, because no entity is shorter than its replacement */
  in = out = s;
  while (*in != '\0') {
    if (*in == '&') {
      for (k = 0; k < n_entities; k++)
        if (0 == strncmp(in, entities[k].text, entities[k].length))
          break;
      if (k < n_entities) {
        *out++ = entities[k].replacement;
        in += entities[k].length;
        continue;
      }
      /* no known entity after all: fall through and copy the ampersand */
    }
    *out++ = *in++;
  }
  *out = '\0';    /* terminate result string */

  return s;
}


/**
 * Replacement for strcpy that won't copy more than CL_MAX_LINE_LENGTH characters.
 *
 * At most CL_MAX_LINE_LENGTH bytes are written to buf. If src (including its
 * terminator) does not fit into that, the copy is cut short and the last byte
 * written is replaced by a NUL, so buf is always terminated.
 *
 * This is intended to make it easier to evade buffer overflows. But it doesn't
 * protect against the opposite danger of losing important data from the end of
 * a truncated string.
 *
 * Note, buffer overflow is still possible if buf is a pointer to the middle
 * of a buffer.
 *
 * So this function is not a panacea, it's just a bit of a help.
 *
 * It's also implemented in a way that is safe for down-strcpying, that is, if
 * we are erasing a section from the start/middle of the string - cl_strcpy(string,
 * string+3); for instance). The POSIX standard states that the normal strcpy
 * has undefined behaviour if the objects overlap. That's not the case here,
 * because the bytes are copied one at a time from the front.
 *
 * @param buf  A string buffer to copy to.
 * @param src  The string pointer to copy from.
 * @return     In classic strcpy-stylie, this function uselessly returns buf.
 */
char *
cl_strcpy(char *buf, const char *src)
{
  char *out = buf;
  int budget = CL_MAX_LINE_LENGTH;   /* bytes that may still be written */

  while (budget > 0) {
    if ('\0' == (*out = *src))
      return buf;                    /* terminator copied: all done */
    out++;
    src++;
    budget--;
  }

  /* we ran out of buffer space: make sure the string is null-terminated */
  buf[CL_MAX_LINE_LENGTH-1] = '\0';
  return buf;
}




/*
 *
 * CL AutoString -- automagically-expanding string!
 *
 * Additional features: cached length allows speedier concat.
 */

/**
 * Creates a new autostring object. The string is initialised to data (or to a zero-length string if data is NULL).
 *
 * Initially, init_bytes is allocated (and the increment step is the same size), unless the string
 * (plus its terminator) is longer... in which case the size of the string becomes the initial amount
 * of memory allocated.
 *
 * Use 0 for init_bytes to get the default initial allocation of CL_MAX_LINE_LENGTH bytes
 * (or the size of the specified string, if that is larger).
 *
 * The cached length of the new object is the length of data WITHOUT the terminating NUL,
 * in agreement with cl_autostring_copy() and cl_autostring_concat().
 *
 * @param data        The initial content; may be NULL.
 * @param init_bytes  The number of bytes to allocate initially; also used as the increment.
 * @return            The new object; dispose of it with cl_autostring_delete().
 */
ClAutoString
cl_autostring_new(const char *data, size_t init_bytes)
{
  ClAutoString s;
  size_t data_len = (data ? strlen(data) : 0);

  /* calculate initial size of data */
  if (init_bytes < 1)
    init_bytes = CL_MAX_LINE_LENGTH;
  if (data_len + 1 > init_bytes)
    init_bytes = data_len + 1;        /* the string and its terminator must fit */

  s = cl_malloc(sizeof(struct ClAutoString));

  s->data = (char *)cl_malloc(init_bytes);
  s->bytes_allocated = init_bytes;
  s->increment = init_bytes;

  /* len excludes the terminator; it used to be set one too high here, which made a
   * subsequent concat append its text behind the terminator where it was invisible */
  s->len = data_len;
  if (data)
    memcpy(s->data, data, data_len + 1);
  else
    s->data[0] = '\0';

  return s;
}

/**
 * Delete an autostring object: frees the character buffer, then the object itself.
 *
 * Passing NULL is allowed and does nothing.
 */
void
cl_autostring_delete(ClAutoString string)
{
  if (NULL != string) {
    cl_free(string->data);
    cl_free(string);
  }
}


/**
 * Changes the increment size (measured in bytes).
 *
 * Whenever memory reallocation is necessary,
 * the AutoString will request a multiple of its increment value.
 *
 * A NULL object is ignored. An increment of 0 is ignored as well (the previous
 * increment stays in force), because the reallocation arithmetic divides by
 * the increment.
 *
 * @param string         The AutoString to change.
 * @param new_increment  The new increment in bytes; must be greater than 0.
 */
void
cl_autostring_set_increment(ClAutoString string, size_t new_increment)
{
  if (NULL == string)
    return;
  if (0 == new_increment)
    return;   /* would cause a division by zero on the next reallocation */
  string->increment = new_increment;
}

/**
 * Get a pointer to the string data inside the AutoString (or NULL if the object is NULL).
 *
 * Equivalent to reading the ->data member, except this function checks for a NULL!
 * The pointer refers to the object's own buffer, not to a copy.
 */
char *
cl_autostring_ptr(ClAutoString string)
{
  return (NULL == string) ? NULL : string->data;
}

/**
 * Get the length of the currently-stored string (or 0 in case a NULL object is passed).
 *
 * Equivalent to reading the ->len member, except this function checks for a NULL!
 * Note that the return type is unsigned, so a NULL object cannot be told apart
 * from an empty string by the return value.
 */
size_t
cl_autostring_len(ClAutoString string)
{
  return (NULL == string) ? 0 : string->len;
}

/**
 * Tries to free up unused memory by making the AutoString use only as many increments of size as necessary.
 *
 * The buffer is shrunk to the smallest multiple of the increment that holds the
 * string and its terminator (computed in the same way as in cl_autostring_copy()),
 * and the recorded allocation size is updated to match. Nothing happens if that
 * would not make the buffer smaller, or if the object is NULL.
 *
 * @param string  The AutoString whose buffer is to be shrunk.
 */
void
cl_autostring_reclaim_mem(ClAutoString string)
{
  size_t bytes_needed;
  size_t new_size;

  if (NULL == string)
    return;

  bytes_needed = string->len + 1;   /* the string plus its terminator */

  /* The number of increments must be multiplied by the increment to give a size in bytes;
   * previously the bare number of increments was passed to realloc, which cut the buffer
   * down to a byte or two regardless of the string stored in it. */
  if (string->increment > 0)
    new_size = string->increment * (1 + (bytes_needed / string->increment));
  else
    new_size = bytes_needed;        /* no usable increment: fall back to an exact fit */

  if (new_size < string->bytes_allocated) {
    string->data = cl_realloc(string->data, new_size);
    string->bytes_allocated = new_size;
  }
}

/**
 * Copy the string in src into the AutoString in dst, automatically reallocating memory if necessary.
 *
 * A NULL dst is ignored. A NULL src empties the AutoString (its buffer is kept).
 * When the buffer is too small, it grows to a multiple of the increment that
 * is large enough for src and its terminator.
 */
void
cl_autostring_copy(ClAutoString dst, const char *src)
{
  size_t required;   /* bytes needed for src including its terminator */

  if (NULL == dst)
    return;

  if (NULL == src) {
    dst->len = 0;
    dst->data[0] = '\0';
    return;
  }

  required = strlen(src) + 1;

  if (dst->bytes_allocated < required) {
    dst->bytes_allocated = (required / dst->increment + 1) * dst->increment;
    dst->data = cl_realloc(dst->data, dst->bytes_allocated);
  }

  strcpy(dst->data, src);
  dst->len = required - 1;
}

/**
 * Concatenate the string src onto the end of the AutoString in dst, automatically reallocating memory if necessary.
 *
 * Nothing happens if either argument is NULL. The cached length of dst is used
 * to find the end of the stored string, so the stored string is not re-scanned.
 *
 * NOTE: src should not point into dst's own buffer: if the buffer has to be
 * reallocated, src would be left pointing at freed memory.
 *
 * @param dst  The AutoString to append to.
 * @param src  The string to append.
 */
void
cl_autostring_concat(ClAutoString dst, const char *src)
{
  size_t src_len;
  size_t bytes_needed;   /* size arithmetic is done in size_t, as in cl_autostring_copy() */

  if (NULL == dst || NULL == src)
    return;

  /* measure src once, up front, so that the number of bytes copied is fixed
   * before anything is written to the buffer */
  src_len = strlen(src);
  bytes_needed = dst->len + src_len + 1;

  if (bytes_needed > dst->bytes_allocated) {
    dst->bytes_allocated = dst->increment * (1 + (bytes_needed / dst->increment));
    dst->data = cl_realloc(dst->data, dst->bytes_allocated);
  }

  memcpy(dst->data + dst->len, src, src_len);
  dst->data[dst->len + src_len] = '\0';

  dst->len = bytes_needed - 1;
}

/**
 * Truncates the AutoString to the length specified. Note, does not respect UTF-8 encoding,
 * so if the string is UTF8 you need to ascertain in advance that the cut-off does not break
 * any UTF-8 characters into bits.
 *
 * This function should be used if the character buffer is tampered with by direct access (which
 * of course will not update the internal member of the object that tracks string length....).
 *
 * Nothing happens if the object is NULL, if new_length is negative, or if new_length
 * is greater than the length currently recorded for the string.
 *
 * @param string      The AutoString to shorten.
 * @param new_length  The length (in bytes, without terminator) the string is to have afterwards.
 */
void
cl_autostring_truncate(ClAutoString string, int new_length)
{
  if (NULL == string)
    return;

  /* reject negative lengths explicitly rather than relying on how a negative int
   * happens to compare against the recorded length */
  if (new_length < 0 || (size_t) new_length > string->len)
    return;

  string->len = new_length;
  string->data[new_length] = '\0';
}


/**
 * Debug function: dumps the contents of an AutoString via Rprintf().
 *
 * Prints the string data, the number of bytes allocated, the increment and the
 * cached length. A NULL object is reported as such instead of being dereferenced.
 *
 * @param string  The AutoString to dump; may be NULL.
 */
void
cl_autostring_dump(ClAutoString string)
{
  if (NULL == string) {
    Rprintf("CL: Autostring content: (NULL object)\n");
    return;
  }

  /* the numeric members are cast to long so that the arguments match the %ld conversions */
  Rprintf("CL: Autostring content: \n\t->data %s,"
                  "\n\t->bytes_allocated %ld,\n\t->increment, %ld\n\t->len %ld\n",
                  string->data,
                  (long) string->bytes_allocated,
                  (long) string->increment,
                  (long) string->len);
}

