/*

------------------------------------------------------------------------------

A license is hereby granted to reproduce this software source code and
to create executable versions from this source code for personal,
non-commercial use.  The copyright notice included with the software
must be maintained in all copies produced.

THIS PROGRAM IS PROVIDED "AS IS". THE AUTHOR PROVIDES NO WARRANTIES
WHATSOEVER, EXPRESSED OR IMPLIED, INCLUDING WARRANTIES OF
MERCHANTABILITY, TITLE, OR FITNESS FOR ANY PARTICULAR PURPOSE.  THE
AUTHOR DOES NOT WARRANT THAT USE OF THIS PROGRAM DOES NOT INFRINGE THE
INTELLECTUAL PROPERTY RIGHTS OF ANY THIRD PARTY IN ANY COUNTRY.

Copyright (c) 1995, 1996, John Conover, All Rights Reserved.

Comments and/or bug reports should be addressed to:

    john@johncon.com (John Conover)

------------------------------------------------------------------------------

uppercase.c, uppercase transliteration

unsigned char *make_uppercase (void);

and:

unsigned char *uppercase = (unsigned char *) 0;

    allocate a global array, of size MAX_ALPHABET_SIZE, and of type
    unsigned char, named uppercase[], constructed in such a manner
    that the implicit index of any element in the array contains the
    toupper() of the index value (ie., it is a look up table for
    uppercase characters.)

    note: bmhsearch() in bmhsearch.c requires that the '\0' character
    be reserved as an end of search sentinel in the pattern-this means
    that array element 0 can NOT contain a '\0'-a space will be used
    instead

    note: care must be exercised when using this array in systems
    where the native type of char is signed, for example:

        signed char ch;

        unsigned char cu;

        cu = uppercase[ch];

    will not give the desired results, since a negative ch indexes a
    negative section of the array, (which does not exist.)
    Particularly meticulous usage of lint is advisable.

    The objective of this technique is to provide an alternative to
    using toupper() on every character in large documents-implicit
    indexing is very fast, and once the uppercase array has been set
    up, uppercase transliteration of documents can be made very
    quickly. As a related issue, it should be, relatively, portable to
    other locale.h environments. It is probably important to note that
    the characters in the infix search criteria and any documents
    should both be transliterated using this array so that the
    character sets for both are identical.

    Other transliterations can be placed in the array, for example,
    tabs and newlines can be converted to spaces, (which is the
    current implementation,) as could punctuation such as commas and
    periods. A possible scenario would be to use ispunct(), isspace(),
    isprint(), etc. in the for loop to alter the transliteration to
    whatever is desired-be advised that portability issues may ensue
    if the scheme is not compatible with locale.h in some languages. A
    possible alternative would be to implement a command line switch
    for various languages, to avoid localization and portability
    conflicts.

The algorithm is as follows:

    allocate space for the array

    for each element in the array, store the toupper() of the index of
    the element

Usage is a single call to allocate and scan the array, for example:

    unsigned char my_array_of_uppercase[],
                  *my_ptr;

    if (make_uppercase () == (unsigned char *) 0)
    {
        (void) printf ("error installing uppercase array\n");
    }

    while (something)
    {
        my_array_of_uppercase[something] = uppercase[(int) *my_ptr];
    }

For a detailed description of using implicit addressing for character
transliteration, see "Information Retrieval: Data Structures &
Algorithms," William B. Frakes, Ricardo Baeza-Yates, Editors, Prentice
Hall, Englewood Cliffs, New Jersey, 1992, ISBN 0-13-463837-9, pp 102.

There are no arguments

On any error, return null, else return a reference to the array of
uppercase letters, uppercase[]

MAX_ALPHABET_SIZE is defined in uppercase.h

To test this module, compile the module source with -DTEST_UPPERCASE

$Revision: 1.2 $
$Date: 1996/09/13 13:47:23 $
$Id: uppercase.c,v 1.2 1996/09/13 13:47:23 john Exp $
$Log: uppercase.c,v $
Revision 1.2  1996/09/13 13:47:23  john
Added handling of circularly linked directories and subdirectories in searchpath.c
Cosmetic changes to bmhsearch.c, postfix.c, rel.c, searchfile.c, translit.c, uppercase.c, version.c.

Revision 1.1  1996/02/08 02:55:10  john
Added hyphenation, backspace, and multiple whitespace capability.
Changes to files: uppercase.c translit.c searcfile.c rel.c and version.c-required for hyphenation, backspace, and multiple whitespace capability.

 * Revision 1.0  1995/04/22  05:13:18  john
 * Initial revision
 *

*/

#include "rel.h"

#ifndef LINT /* include rcsid only if not running lint */

/*

RCS identification strings for this module and for its include file;
they are compiled in only when LINT is not defined. UPPERCASE_H_ID is
not defined in this file-presumably it comes from uppercase.h by way
of rel.h (confirm against the headers.)

*/

static char rcsid[] = "$Id: uppercase.c,v 1.2 1996/09/13 13:47:23 john Exp $"; /* module version */
static char rcsid_h[] = UPPERCASE_H_ID; /* module include version */

#endif

/*

Global reference to the uppercase transliteration table. It is null
until make_uppercase () has been called, and is null again if that
call could not allocate the table; index it only with non-negative
values less than MAX_ALPHABET_SIZE.

*/

unsigned char *uppercase = (unsigned char *) 0; /* reference to uppercase array */

/*

Note: all whitespace characters, as determined by the function
isspace(3), are considered spaces. The set of whitespace characters
is derived from locale.h and ctype.h and is locale specific. Single
or multiple whitespace characters are used to address phrase searching
issues in translit.c. In the "C" locale, the set of whitespace
characters is the space, form feed, new-line, carriage return,
horizontal tab, and vertical tab. In other locales, other
implementation-defined characters may be added to this set, provided
they do not test true for isalnum(3).

*/

#ifdef __STDC__

unsigned char *make_uppercase (void)

#else

unsigned char *make_uppercase ()

#endif

{
    int uppercase_error = URMEM_ERR, /* module error value, assume error allocating memory */
        i;

    if ((uppercase = (unsigned char *) memalloc (MAX_ALPHABET_SIZE * sizeof (unsigned char))) != (unsigned char *) 0) /* allocate */
    {
        uppercase_error = NO_ERROR; /* assume no errors */
        uppercase[0] = (unsigned char) ' '; /* the null character is reserved as an end of search sentinel */

        for (i = 1; i < MAX_ALPHABET_SIZE; i++) /* for each remaining character in the uppercase array */
        {

            if (isspace (i)) /* character whitespace? */
            {
                uppercase[i] = (unsigned char) ' '; /* yes, make sure the character is a space */
            }

            else
            {
                uppercase[i] = (unsigned char) toupper (i); /* no, convert the character to uppercase */
            }

        }

    }

    if (uppercase_error != NO_ERROR) /* pending error? */
    {
        message (uppercase_error, (char *) 0); /* yes, print the error */
    }

    return (uppercase); /* return a reference to the uppercase array, null if error */
}

#ifdef TEST_UPPERCASE

/*

simple exerciser for testing make_uppercase (); dump the array to
stdout

declared global, could be static
    uppercase           uppercase.c(xxx)
    make_uppercase      uppercase.c(yyy)

from lint

*/

/*

test driver: build the uppercase table, then print one line for each
element, giving its index and its decimal value, to stdout; exits
with 1 if the table could not be set up, else with 0

*/

#ifdef __STDC__

int main (void)

#else

int main ()

#endif

{
    int code = 0; /* index of the table element being printed */

    if (make_uppercase () == (unsigned char *) 0) /* could the table be set up? */
    {
        (void) fprintf (stderr, "error allocating uppercase array\n"); /* no, say so on stderr */
        exit (1); /* and exit with a failure status */
    }

    while (code < MAX_ALPHABET_SIZE) /* walk every element of the table, in order */
    {
        (void) printf ("uppercase[%d] = %d\n", code, (int) uppercase[code]); /* print the element's index and decimal value */
        code++; /* next element */
    }

    exit (0); /* exit with a success status */

#ifdef LINT /* include only if running lint */

    return (0); /* never reached, present only to satisfy lint */

#endif

}

#endif
