<copyright> String abstract class.
    Written by <a href="mailto:tiggr@ics.ele.tue.nl">Pieter J. Schoenmakers</a>

    Copyright &copy; 1995-1997 Pieter J. Schoenmakers.

    This file is part of TOM.  TOM is distributed under the terms of the
    TOM License, a copy of which can be found in the TOM distribution; see
    the file LICENSE.

    <id>$Id: String.t,v 1.42 1998/07/21 14:56:09 tiggr Exp $</id>  </copyright>

<c>
#include <stdio.h>
</c>

// The class object of String.  It lists Indexed and Comparable as its
// supers and declares no methods of its own here.
implementation class
String: Indexed, Comparable

end;

implementation instance
String

<doc> We'll return a {String} when frozen.  This is a redeclaration that
    states the {String} return type; no implementation is given here
    (the method stays deferred).  </doc>
redeclare deferred String
  frozen;

<doc> Unconditionally answer {TRUE}; see {dump_simple} for the method that
    prints the string.  </doc>
boolean
  dump_simple_p
{
  return TRUE;
}

<doc> Print the receiving string to the stream {s}, enclosed in a pair of
    double quotes, and return whatever the {print} on {s} returns.  </doc>
// No quoting yet!  (The contents are printed as-is, between the quotes.)
// Sat May  3 20:37:19 1997, tiggr@tricky.es.ele.tue.nl
OutputStream
  dump_simple OutputStream s
{
  return [s print ('"', self, '"')];
}

/******************** equality ********************/

<doc> Compare the receiving {String} with the other {String}.  Redeclared
    here with a {String} argument; no implementation is given (deferred).
    </doc>
redeclare deferred boolean
  equal String other;

<doc> Compare the receiving {String} with the other {ByteString}.  No
    implementation is given here (deferred).  </doc>
deferred boolean
  equalByteString ByteString other;

<doc> Compare the receiving {String} with the other {CharString}.  No
    implementation is given here (deferred).  </doc>
deferred boolean
  equalCharString CharString other;

<doc> Compare the receiving {String} with the other {UniqueString}.  No
    implementation is given here (deferred).  </doc>
deferred boolean
  equalUniqueString UniqueString other;

<doc> Answer {YES} iff the receiving {String} and the {other} {String} have
    the same length and, position by position, hold characters that are
    either identical or have the same upper-case form.  A string always
    equals itself.  </doc>
boolean
  equalModuloCase String other
{
  // The very same object needs no inspection.
  if (self == other)
    return YES;

  int n = [self length], other_n = [other length];

  // Strings of different length can never match.
  if (n != other_n)
    return NO;

  int k;

  for (k = 0; k < n; k++)
    {
      char mine = self[k], theirs = other[k];

      // Only bother with case conversion when the raw characters differ.
      // Each string converts its own character.
      if (mine != theirs)
        if ([self toUpper mine] != [other toUpper theirs])
          return NO;
    }

  return YES;
}

/******************** Comparable ********************/

<doc> Order the receiving {String} relative to the {other}, character by
    character.  The result is 1 if the receiver sorts after {other}, -1
    if it sorts before, and 0 if both hold the same characters.  When one
    is a prefix of the other, the shorter one sorts first.  </doc>
int
  compare id other
{
  int my_len = [self length];
  int its_len = [other length];
  int common = my_len;
  int k;

  // Only the part both strings have can be compared per character.
  if (my_len > its_len)
    common = its_len;

  for (k = 0; k < common; k++)
    {
      char a = self[k];
      char b = other[k];

      if (a > b)
        return 1;
      if (a != b)
        return -1;
    }

  // Equal over the common part: the lengths decide.
  if (my_len > its_len)
    return 1;
  if (my_len == its_len)
    return 0;
  return -1;
}

/******************** searching ********************/

<doc> Return the range of the first occurrence of the {string} in the
    receiving string, as a (start, length) tuple.  Return a negative
    length in case it could not be found.  The optional {start} and {len}
    can be specified to restrict the searching within the receiving
    string; they are passed through {adjustRange} first.  An empty
    {string} is found at the (adjusted) {start}, with length 0.  </doc>
(int, int)
  rangeOfString String string
	 range: (int, int) (start, len) = (0, -1)
{
  int i, j, l = [string length];

  (start, len) = [self adjustRange (start, len)];

  // An empty pattern matches immediately.
  if (!l)
    return (start, 0);

  char first = string[0];
  // Last index at which a match of length `l' still fits in the range.
  int last = start + len - l;
  for (i = start; i <= last; i++)
    if (first == self[i])
      {
	// First character matches; compare the remainder of the pattern.
	// NOTE(review): the compound expression presumably only serves to
	// make `string[j]' be read as a char -- confirm.
	for (j = 1; j < l; j++)
	  if (self[i + j] != ({char x1 = string[j];}))
	    break;
	// The inner loop ran to completion only if every character matched.
	if (j == l)
	  return (i, l);
      }

  // Not found: signalled by the negative length.
  = (start, -1);
}

/******************** retrieving (sub-) strings ********************/

<doc> Return a (mutable) {Array} of strings, taken from the receiving
    string by splitting it at characters with the indicated {char} value.
    Thus, splitting `/usr/tmp' at each `/' returns an array holding the
    empty string, `usr', and `tmp'.

    The optional argument {limit} specifies the maximum number of items in
    which the caller is interested, or -1 for all items.  For example, if
    `/usr/foo/bar' is split on `/' in 3 items, the array returned contains
    `', `usr', and `foo/bar'.  With a {limit} of 0, the array returned is
    empty.

    The optional argument {excl}, if YES specifies that zero-length
    substrings are not to be included in the result.  Thus, splitting
    `/aap/noot/mies/wim' in 3 items, ignoring empty items, returns an
    array containing `aap', `noot', and `mies/wim'.

    The optional selector {sel} specifies the method to be called to
    extract the substrings from the receiving string.  The default
    selector is {"r_substring_(ii)"}.  To retrieve mutable substrings, the
    selector {"r_mutableSubstring_(ii)"} could be used.  </doc>
MutableArray
  componentsSeparatedBy char c
		 limit: int limit = -1
	  excludeEmpty: boolean excl = NO
     substringSelector: selector sel = selector (String substring (int, int))
{
  MutableArray a = [MutableObjectArray new];
  int length = [self length];
  // `s' is the start of the current item; `i' scans for its end.
  int s, i;

  // Split off items while more than one may still be added (or while
  // there is no limit at all).
  for (s = 0; s < length && (limit < 0 || limit > 1); s = i)
    {
      // Advance `i' to the next separator, or to the end of the string.
      for (i = s; i < length && self[i] != c; i++)
	void;
      // Add the item [s, i), unless it is empty and empties are excluded.
      if (!excl || i != s)
	{
	  [a add ({All x1 = [self perform sel with (s, i - s)];})];
	  limit--;
	}
      // Step over the separator.  If that lands exactly on the end, the
      // string ended in a separator: add the trailing empty item (unless
      // empties are excluded) and stop.
      if (++i == length && limit != 0 && !excl)
	{
	  [a add ({All x2 = [self perform sel with (i, 0)];})];
	  limit = 0;
	}
    }

  // One item left to give and text remaining: the rest of the string,
  // unsplit, is the last item.
  if (limit == 1 && s < length)
    [a add ({All x3 = [self perform sel with (s, -1)];})];

  = a;
}

<doc> Return a {MutableString} holding the characters from the receiving
    {String} in the (clipped) range ({start}, {len}).  The precondition
    demands a non-negative {start} and a {len} of at least -1; callers in
    this file pass a {len} of -1 to ask for everything from {start}
    onwards.  No implementation is given here (deferred).  </doc>
deferred MutableString
  mutableSubstring (int, int) (start, len)
pre
  start >= 0 && len >= -1;

<doc> Return a constant {String} holding the characters from the receiving
    {String} in the (clipped) range ({start}, {len}).  The precondition
    demands a non-negative {start} and a {len} of at least -1; callers in
    this file pass a {len} of -1 to ask for everything from {start}
    onwards.  No implementation is given here (deferred).  </doc>
deferred String
  substring (int, int) (start, len)
pre
  start >= 0 && len >= -1;

<doc> Return a unique version of the receiving string.  Do not use this
    method to create unique strings; use {[UniqueString with my_string]}
    instead.  (This method only creates strings which think they are
    unique; the {UniqueString} class ensures they actually are.)  No
    implementation is given here (deferred).  </doc>
deferred UniqueString
  uniqueString;

/******************** case conversion ********************/

<doc> Return the receiving string with every upper-case character replaced
    by its lower-case counterpart.  If the receiver holds no upper-case
    characters, the receiver itself is returned.  Otherwise the result is
    a modified copy, which is sent {freeze} if the receiver is not
    mutable.  </doc>
id
  downcase
{
  // Stays unset until the first character that needs converting.
  MutableByteString lowered;
  int k, count = [self length];

  for (k = 0; k < count; k++)
    {
      char ch = self[k];

      if ([self isUpper ch])
        {
          // Copy lazily: only once a change turns out to be needed.
          if (!lowered)
            lowered = MutableByteString ([self mutableSubstring (0, -1)]);
          lowered[k] = [self toLower ch];
        }
    }

  // Nothing changed: the receiver is its own down-cased version.
  if (!lowered)
    return self;

  if (![self mutable])
    [lowered freeze];

  return lowered;
}

<doc> Return the receiving string with every lower-case character replaced
    by its upper-case counterpart.  If the receiver holds no lower-case
    characters, the receiver itself is returned.  Otherwise the result is
    a modified copy, which is sent {freeze} if the receiver is not
    mutable.  </doc>
id
  upcase
{
  // Stays unset until the first character that needs converting.
  MutableByteString raised;
  int k, count = [self length];

  for (k = 0; k < count; k++)
    {
      char ch = self[k];

      if ([self isLower ch])
        {
          // Copy lazily: only once a change turns out to be needed.
          if (!raised)
            raised = MutableByteString ([self mutableSubstring (0, -1)]);
          raised[k] = [self toUpper ch];
        }
    }

  // Nothing changed: the receiver is its own up-cased version.
  if (!raised)
    return self;

  if (![self mutable])
    [raised freeze];

  return raised;
}

/******************** conversions ********************/
<doc><h4>Conversions</h4></doc>

// This should be a frontend to the real method.
// Sun May  4 23:36:18 1997, tiggr@tricky.es.ele.tue.nl
<doc> Return the double value at the start of the string.  The number is
    read with single ({float}) precision.  If the string does not start
    with a number, 0 is returned.  </doc>
double (value)
  doubleValue
{
  pointer chars;
  int len;

  // byte oriented...
  // Sun May  4 23:37:00 1997, tiggr@tricky.es.ele.tue.nl
  (chars, len) = [ByteString (self) byteStringContents];

  // With gcc 2.7.2.1 Objective-C snapshot 960906 on m68k-next-nextstep3,
  // the proper format, "%lg", is changed into "%g"!
  // Tue Jun 24 21:18:13 1997, tiggr@tricky.es.ele.tue.nl
  //
  // NOTE(review): sscanf reads up to a NUL byte and `len' is not used to
  // bound it; this assumes the bytes handed out by byteStringContents
  // are NUL-terminated -- confirm.
<c>
  {
    /* Start from a defined value: sscanf leaves V untouched when it can
       not convert anything (empty or non-numeric string).  */
    float v = 0;

    if (sscanf (chars, "%g", &v) != 1)
      v = 0;
    value = v;
  }
</c>
}

<doc> Convert the number contained in the receiving string from index
    {start}, running for {len} bytes (with -1 for unlimited length).

    The value returned is a tuple (extracted value, occupied full range,
    actual length).  If the actual length is 0, the extracted number will
    be 0.

    The {base} defaults to 10, but can be any number of at least 2 (for a
    smaller {base}, (0, FALSE, 0) is returned).  If it is larger than 10,
    alpha characters encountered have the value of 10 + the offset from
    the alpha character to the start of its range.  Thus, `a' is 10, `z'
    is 35.

    Iff {signs}, a leading `+' or `-' sign is accepted.

    Iff {c_bases}, C-style base indicators may be used: a number starting
    with a `0' denotes an octal number; a number starting with `0x' or
    `0X' is a hexadecimal number.  C-style indicators are not recognized
    after a `-' sign.

    Iff the {base_separator} is not 0, a number can be prefixed with a
    base indication followed by the base separator to specify the base of
    the actual number to follow.  The base is read using a decimal
    {base}, unless {decimal_base} is {FALSE}, in which case the base is
    read in the default {base}.  Thus, `10_10', with `_' as a base
    separator, returns {base} if {decimal_base} is {FALSE}, and 10 if it
    was {TRUE}.  When the base indication is read in decimal, C-style
    base indicators are not applied to it: in `010_7' the base is 10.
    </doc>

// Bug: Overflows are not handled.
// GGG The value returned should be a long, but long comparisons are partly
// broken on m68k-next-nextstep3 by GNU CC 2.7.2 (should report this).
// Sat May 18 23:47:27 1996, tiggr@tricky.es.ele.tue.nl
// 
// There should be an accompanying method which first fills the receiving
// (mutable) string with characters read from a stream and then returns
// the number read.  Mon Feb 10 17:59:41 1997, tiggr@akebono.ics.ele.tue.nl
(int, boolean, int) (value, full_range, actual_length)
    integerValue (int, int) (start, len)
    defaultBase: int base = 10
      allowSign: boolean signs = YES
    allowCBases: boolean c_bases = YES
  baseSeparator: byte base_separator = '_'
    decimalBase: boolean decimal_base = YES
{
  int length = [self length];
  boolean neg;
  // `i' is the scan position, `e' is one past the last position, and `c'
  // is the character at `i' (while `i' is in range).
  int i, e;
  char c;

  (start, len) = [self adjustRange (start, len)];

  // Nothing to read: trivially the full (empty) range.
  if (start == length || !len)
    return (0, TRUE, 0);

  if (base < 2)
    return (0, FALSE, 0);

  i = start;
  c = self[i];
  e = start + len;

  if (signs && (c == '-' || c == '+'))
    {
      if (c == '-')
	neg = TRUE;
      if (++i != e)
	c = self[i];
    }

  // A leading `0' selects octal; `0x' or `0X' selects hexadecimal.
  if (!neg && c_bases && [self isDigit c] && [self digitValue c] == 0)
    {
      base = 8;
      if (++i != e)
	{
	  c = self[i];
	  if ([self isAlpha c] && [self alphaValue c] == 'x' - 'a')
	    {
	      base = 16;
	      if (++i == e)
		{
		  /* `0x' is a `0' followed by the garbage `x'.  The
		     accepted part runs up to, excluding, the `x'; this
		     includes a sign, if one preceded the `0'.  */
		  return (0, FALSE, i - 1 - start);
		}
	      c = self[i];
	    }
	}
    }

  while (i != e)
    {
      if (c == base_separator && base_separator != byte (0))
	{
	  /* What was read so far is a base indication, which must be
	     read in decimal: start over in base 10.  C-style bases must
	     be off for this, since a leading `0' would otherwise select
	     base 8 or 16 again and end up right here, recursing without
	     end.  */
	  if (decimal_base && base != 10)
	    return [self integerValue (start, len)
			 defaultBase: 10 allowSign: signs allowCBases: NO
			 baseSeparator: base_separator decimalBase: YES];

	  /* Consider `10_' a valid number...  */
	  if (++i == e)
	    return (0, TRUE, i - start);

	  /* The value read so far is the base of the remainder, which is
	     read without sign, C-style bases or further separators.  */
	  boolean b;
	  int j;
	  (value, b, j) = [self integerValue (i, e - i)
				defaultBase: int (value) allowSign: NO
				allowCBases: NO baseSeparator: byte (0)];

	  return (value, b, j + i - start);
	}

      int digit;

      if ([self isDigit c])
	digit = [self digitValue c];
      else if ([self isLower c] || [self isUpper c])
	digit = 10 + [self alphaValue c];
      else
	break;
      // A digit not valid in this base ends the number.
      if (digit >= base)
	break;

      value = value * base + (neg ? -digit : digit);
      if (++i != e)
	c = self[i];
    }

  = (value, i == e, i - start);
}

<doc> Return the number read from the whole of the receiving string by
    {integerValue}, with all its optional arguments at their defaults.
    The other elements of the tuple {integerValue} returns are ignored.
    </doc>
int
  intValue
{
  long parsed;

  (parsed,,) = [self integerValue (0, -1)];

  return int (parsed);
}

<doc> Like {intValue}, but with {integerValue} told not to accept a sign.
    Should the value read be negative nonetheless (which can happen since
    {integerValue} does not check for overflow), 0 is returned.  </doc>
int
  unsignedIntValue
{
  long parsed;

  (parsed,,) = [self integerValue (0, -1) allowSign: NO];

  // Only an overflow can have made this negative.
  if (parsed < 0)
    return 0;

  return int (parsed);
}

/******************** character predicates ********************/

<doc> Return {TRUE} iff the character {c} denotes a letter, as judged by
    {Unicoding}.  </doc>
boolean
  isAlpha char c
{
  return [Unicoding isAlpha c];
}

<doc> Return {TRUE} iff the character {c} is a digit, as judged by
    {Unicoding}.  </doc>
boolean
  isDigit char c
{
  return [Unicoding isDigit c];
}

<doc> Return {TRUE} iff the character {c} is in lower-case, as judged by
    {Unicoding}.  </doc>
boolean
  isLower char c
{
  return [Unicoding isLower c];
}

<doc> Return {TRUE} iff the character {c} is a punctuation character, as
    judged by {Unicoding}.  </doc>
boolean
  isPunct char c
{
  return [Unicoding isPunct c];
}

<doc> Return {TRUE} iff the character {c} is a space character, as judged
    by {Unicoding}.  </doc>
boolean
  isSpace char c
{
  return [Unicoding isSpace c];
}

<doc> Return {TRUE} iff the character {c} is in upper-case, as judged by
    {Unicoding}.  </doc>
boolean
  isUpper char c
{
  return [Unicoding isUpper c];
}

/******************** character conversion ********************/

<doc> Return what {Unicoding} gives as the lower-case version of the
    character {c}.  </doc>
char
  toLower char c
{
  return [Unicoding toLower c];
}

<doc> Return what {Unicoding} gives as the title-case version of the
    character {c}.  </doc>
char
  toTitle char c
{
  return [Unicoding toTitle c];
}

<doc> Return what {Unicoding} gives as the upper-case version of the
    character {c}.  </doc>
char
  toUpper char c
{
  return [Unicoding toUpper c];
}

<doc> Return what {Unicoding} gives as the numeric value of the digit
    character {c}.  </doc>
int
  digitValue char c
{
  return [Unicoding digitValue c];
}

<doc> Return what {Unicoding} gives as the alpha value of the character
    {c}.  As used by {integerValue}, this is the offset of a letter from
    the start of its range, so that `x' yields {'x' - 'a'}.  </doc>
int
  alphaValue char c
{
  return [Unicoding alphaValue c];
}

/******************** encodings ********************/

<doc> Return the string that results from decoding the receiving string,
    taking it to be encoded in the encoding named by {encoding_name}.
    This default implementation ignores the {encoding_name} and simply
    returns {self}.  </doc>
id
  stringByDecoding String encoding_name
{
  return self;
}

end;
