Logo Search packages:      
Sourcecode: fhist version File versions

wstr.c

/*
 *    fhist - file history and comparison tools
 *    Copyright (C) 1998-2000, 2002 Peter Miller;
 *    All rights reserved.
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
 *
 *
 * MANIFEST: wide string manipulation functions
 *
 * A literal pool is maintained.  Each string has a reference count.  The
 * string stays in the literal pool for as long as it has a positive
 * reference count.  To determine if a string is already in the literal pool,
 * linear dynamic hashing is used to guarantee an O(1) search.  Making all equal
 * strings the same item in the literal pool means that string equality is
 * a pointer test, and thus very fast.
 */

#include <ac/limits.h>
#include <ac/stdarg.h>
#include <ac/stddef.h>
#include <ac/stdio.h>
#include <ac/stdlib.h>
#include <ac/string.h>
#include <ac/wchar.h>
#include <ac/wctype.h>

#include <error.h>
#include <language.h>
#include <mem.h>
#include <str.h>
#include <trace.h> /* for assert */
#include <wstr.h>


static      wstring_ty  **hash_table;
static      wstr_hash_ty      hash_modulus;
static      wstr_hash_ty      hash_cutover;
static      wstr_hash_ty      hash_cutover_mask;
static      wstr_hash_ty      hash_cutover_split_mask;
static      wstr_hash_ty      hash_split;
static      wstr_hash_ty      hash_load;
static      int         changed;

#define MAX_HASH_LEN 20


/*
 * NAME
 *    hash_generate - hash string to number
 *
 * SYNOPSIS
 *    wstr_hash_ty hash_generate(wchar_t *s, size_t n);
 *
 * DESCRIPTION
 *    The hash_generate function is used to make a number from a string.
 *
 * RETURNS
 *    wstr_hash_ty - the magic number
 *
 * CAVEAT
 *    Only the last MAX_HASH_LEN characters are used.
 *    It is important that wstr_hash_ty be unsigned (int or long).
 */

static wstr_hash_ty
hash_generate(const wchar_t *s, size_t n)
{
      wstr_hash_ty      retval;

      if (n > MAX_HASH_LEN)
      {
            s += n - MAX_HASH_LEN;
            n = MAX_HASH_LEN;
      }

      retval = 0;
      while (n > 0)
      {
            retval = (retval + (retval << 1)) ^ *s++;
            --n;
      }
      return retval;
}


/*
 * NAME
 *    wstr_initialize - start up string table
 *
 * SYNOPSIS
 *    void wstr_initialize(void);
 *
 * DESCRIPTION
 *    The wstr_initialize function is used to create the hash table and
 *    initialize it to empty.
 *
 * RETURNS
 *    void
 *
 * CAVEAT
 *    This function must be called before any other defined in this file.
 */

static void
wstr_initialize(void)
{
      wstr_hash_ty      j;

      if (hash_modulus)
            return;
      hash_modulus = 1<<8; /* MUST be a power of 2 */
      hash_cutover = hash_modulus;
      hash_split = hash_modulus - hash_cutover;
      hash_cutover_mask = hash_cutover - 1;
      hash_cutover_split_mask = (hash_cutover * 2) - 1;
      hash_load = 0;
      hash_table = (wstring_ty **)mem_alloc(hash_modulus * sizeof(wstring_ty *));
      for (j = 0; j < hash_modulus; ++j)
            hash_table[j] = 0;
}


/*
 * NAME
 *    split - reduce table loading
 *
 * SYNOPSIS
 *    void split(void);
 *
 * DESCRIPTION
 *    The split function is used to reduce the load factor on the hash table.
 *
 * RETURNS
 *    void
 *
 * CAVEAT
 *    A load factor of about 80% is suggested.
 */

static void
split(void)
{
      wstring_ty  *p;
      wstring_ty  *p2;
      wstr_hash_ty      idx;

      /*
       * get the list to be split across buckets
       */
      p = hash_table[hash_split];
      hash_table[hash_split] = 0;

      /*
       * increase the modulus by one
       */
      hash_modulus++;
      hash_table =
            mem_change_size(hash_table, hash_modulus * sizeof(wstring_ty *));
      hash_table[hash_modulus - 1] = 0;
      hash_split = hash_modulus - hash_cutover;
      if (hash_split >= hash_cutover)
      {
            hash_cutover = hash_modulus;
            hash_split = 0;
            hash_cutover_mask = hash_cutover - 1;
            hash_cutover_split_mask = (hash_cutover * 2) - 1;
      }

      /*
       * now redistribute the list elements
       */
      while (p)
      {
            p2 = p;
            p = p->wstr_next;

            idx = p2->wstr_hash & hash_cutover_mask;
            if (idx < hash_split)
                  idx = p2->wstr_hash & hash_cutover_split_mask;
            p2->wstr_next = hash_table[idx];
            hash_table[idx] = p2;
      }
}


/*
 * NAME
 *    wstr_from_c - make string from C string
 *
 * SYNOPSIS
 *    wstring_ty *wstr_from_c(char *);
 *
 * DESCRIPTION
 *    The wstr_from_c function is used to make a string from a NUL
 *    terminated C string.  The conversion from multi-byte to wide
 *    characters is done in the current locale.
 *
 * RETURNS
 *    wstring_ty* - a pointer to a string in dynamic memory.  Use
 *    wstr_free when finished with.
 *
 * CAVEAT
 *    The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_from_c(const char *s)
{
      return wstr_n_from_c(s, strlen(s));
}


/*
 * NAME
 *    wstr_from_wc - make string from a wide C string
 *
 * SYNOPSIS
 *    wstring_ty *wstr_from_wc(wchar_t *);
 *
 * DESCRIPTION
 *    The wstr_from_c function is used to make a string from a NUL
 *    terminated wide C string.
 *
 * RETURNS
 *    wstring_ty* - a pointer to a string in dynamic memory.  Use
 *    wstr_free when finished with.
 *
 * CAVEAT
 *    The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_from_wc(const wchar_t *ws)
{
      return wstr_n_from_wc(ws, wcslen(ws));
}


/*
 * NAME
 *    wstr_n_from_c - make string
 *
 * SYNOPSIS
 *    wstring_ty *wstr_n_from_c(char *s, size_t n);
 *
 * DESCRIPTION
 *    The wstr_n_from_c function is used to make a string from an
 *    array of characters.  No NUL terminator is assumed.  The
 *    conversion from muti-byte to wide characters is done in the
 *    current locale.
 *
 * RETURNS
 *    wstring_ty* - a pointer to a string in dynamic memory.  Use
 *    wstr_free when finished with.
 *
 * CAVEAT
 *    The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_n_from_c(const char *s, size_t length)
{
#if __STDC__ >= 1
      static char escapes[] = "\aa\bb\ff\nn\rr\tt\vv";
#else
      static char escapes[] = "\bb\ff\nn\rr\tt";
#endif
      static wchar_t    *buf;
      static size_t     bufmax;
      size_t            remainder;
      const char  *ip;
      wchar_t           *op;

      /*
       * Do the conversion "long hand".  This is because some
       * implementations of the mbstowcs function barf when they see
       * invalid multi byte character sequences.  This function
       * renders them as C escape sequences and keeps going.
       */
      if (bufmax < length)
      {
            bufmax = length;
            /* the 4 is the longest escape sequence */
            buf = mem_change_size(buf, bufmax * sizeof(wchar_t) * 4);
      }

      /*
       * change the locale to the native language default
       */
      language_human();

      /*
       * Reset the mbtowc internal state.
       */
      mbtowc((wchar_t *)0, (char *)0, 0);

      /*
       * scan the string and extract the wide characters
       */
      ip = s;
      op = buf;
      remainder = length;
      while (remainder > 0)
      {
            int         n;

            n = mbtowc(op, ip, remainder);
            if (n == 0)
                  break;
            if (n < 0)
            {
                  char  *esc;

                  /*
                   * Invalid multi byte sequence, replace the
                   * first character with a C escape sequence.
                   */
                  esc = strchr(escapes, *ip);
                  if (esc)
                  {
                        *op++ = '\\';
                        *op++ = esc[1];
                  }
                  else
                  {
                        *op++ = '\\';
                        *op++ = '0' + ((*ip >> 6) & 7);
                        *op++ = '0' + ((*ip >> 3) & 7);
                        *op++ = '0' + ( *ip       & 7);
                  }
                  ++ip;
                  --remainder;

                  /*
                   * The mbtowc function's internal state will now
                   * be "error" or broken, or otherwise useless.
                   * Reset it so that we can keep going.
                   */
                  mbtowc((wchar_t *)0, (char *)0, 0);
            }
            else
            {
                  /*
                   * the one wchar_t used n chars
                   */
                  ip += n;
                  remainder -= n;
                  ++op;
            }
      }

      /*
       * change the locale back to the C locale
       */
      language_C();

      /*
       * build the result from the image in ``buf''
       */
      return wstr_n_from_wc(buf, op - buf);
}


/*
 * NAME
 *    wstr_to_mbs - wide string to multi-byte C string
 *
 * SYNOPSIS
 *    void wstr_to_mbs(wstring_ty *s, char **rslt, size_t *rslt_len);
 *
 * DESCRIPTION
 *    The wstr_to_mbs function convers a wide character string into a
 *    multi-byte C string.  The conversion is done in the current
 *    locale.  The result is NUL terminated, however the result length
 *    does not include the NUL.
 *
 * CAVEAT
 *    DO NOT free the result.  The result will change between calls,
 *    so copy it if you need to keep it.
 */

void
wstr_to_mbs(wstring_ty *s, char **result_p, size_t *result_length_p)
{
      static char *buf;
      static size_t     bufmax;
      int         n;
      wchar_t           *ip;
      size_t            remainder;
      char        *op;
      size_t            buflen;

      /*
       * Do the conversion "long hand".  This is because the wcstombs
       * function barfs when it sees an invalid wchar_t.  This
       * function treats them literally and keeps going.
       *
       * MB_LEN_MAX is used because the standard guarantees that it is
       * a non-zero constant.  MB_CUR_MAX is an expression, and
       * experience has shown (e.g. SunOS 4.1.3) this it may be zero!
       */
      buflen = (s->wstr_length + 1) * MB_LEN_MAX;
      assert(buflen > 0);
      if (buflen > bufmax)
      {
            bufmax = buflen;
            buf = mem_change_size(buf, bufmax);
      }

      /*
       * perform the conversion in the native language default
       */
      language_human();

      /*
       * The wctomb function has internal state.  It needs to be reset.
       */
      wctomb((char *)0, (wchar_t)0);

      ip = s->wstr_text;
      remainder = s->wstr_length;
      op = buf;
      while (remainder > 0)
      {
            n = wctomb(op, *ip);
            if (n <= 0)
            {
                  /*
                   * Copy the character literally.
                   * Throw away anything that will not fit.
                   */
                  *op++ = *ip++;
                  if (!op[-1])
                        op[-1] = '?';
                  --remainder;

                  /*
                   * The wctomb function's internal state will now
                   * be "error" or broken, or otherwise useless.
                   * Reset it so that we can keep going.
                   */
                  wctomb((char *)0, (wchar_t)0);
            }
            else
            {
                  op += n;
                  ++ip;
                  --remainder;
            }
      }

      /*
       * The final NUL could require shift state end characters,
       * meaning that n could be more than 1.
       */
      n = wctomb(op, (wchar_t)0);
      if (n <= 0)
            *op = 0;
      else
      {
            op += n - 1;
            assert(*op == 0);
      }

      /*
       * restore the locale to the C locale
       */
      language_C();

      /*
       * set the output side effects
       */
      *result_p = buf;
      *result_length_p = op - buf;
}


/*
 * NAME
 *    wstr_n_from_wc - make string
 *
 * SYNOPSIS
 *    wstring_ty *wstr_n_from_wc(wchar_t *s, size_t n);
 *
 * DESCRIPTION
 *    The wstr_n_from_c function is used to make a string from an
 *    array of wide characters.  No NUL terminator is assumed.
 *
 * RETURNS
 *    wstring_ty* - a pointer to a string in dynamic memory.  Use
 *    wstr_free when finished with.
 *
 * CAVEAT
 *    The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_n_from_wc(const wchar_t *s, size_t length)
{
      wstr_hash_ty      hash;
      wstr_hash_ty      idx;
      wstring_ty  *p;

      if (!hash_modulus)
            wstr_initialize();
      hash = hash_generate(s, length);

      idx = hash & hash_cutover_mask;
      if (idx < hash_split)
            idx = hash & hash_cutover_split_mask;

      for (p = hash_table[idx]; p; p = p->wstr_next)
      {
            if
            (
                  p->wstr_hash == hash
            &&
                  p->wstr_length == length
            &&
                  !memcmp(p->wstr_text, s, length * sizeof(wchar_t))
            )
            {
                  p->wstr_references++;
                  return p;
            }
      }

      p = (wstring_ty *)mem_alloc(sizeof(wstring_ty) + length * sizeof(wchar_t));
      p->wstr_hash = hash;
      p->wstr_length = length;
      p->wstr_references = 1;
      p->wstr_next = hash_table[idx];
      hash_table[idx] = p;
      memcpy(p->wstr_text, s, length * sizeof(wchar_t));
      p->wstr_text[length] = 0;

      hash_load++;
      while (hash_load * 10 > hash_modulus * 8)
            split();
      ++changed;
      return p;
}


/*
 * NAME
 *    wstr_copy - make a copy of a string
 *
 * SYNOPSIS
 *    wstring_ty *wstr_copy(wstring_ty *s);
 *
 * DESCRIPTION
 *    The wstr_copy function is used to make a copy of a string.
 *
 * RETURNS
 *    wstring_ty* - a pointer to a string in dynamic memory.  Use wstr_free when
 *    finished with.
 *
 * CAVEAT
 *    The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_copy(wstring_ty *s)
{
      s->wstr_references++;
      return s;
}


/*
 * NAME
 *    wstr_free - release a string
 *
 * SYNOPSIS
 *    void wstr_free(wstring_ty *s);
 *
 * DESCRIPTION
 *    The wstr_free function is used to indicate that a string hash been
 *    finished with.
 *
 * RETURNS
 *    void
 *
 * CAVEAT
 *    This is the only way to release strings DO NOT use the free function.
 */

void
wstr_free(wstring_ty *s)
{
      wstr_hash_ty      idx;
      wstring_ty  **spp;

      if (!s)
            return;
      if (s->wstr_references > 1)
      {
            s->wstr_references--;
            return;
      }
      ++changed;

      /*
       * find the hash bucket it was in,
       * and remove it
       */
      idx = s->wstr_hash & hash_cutover_mask;
      if (idx < hash_split)
            idx = s->wstr_hash & hash_cutover_split_mask;
      for (spp = &hash_table[idx]; *spp; spp = &(*spp)->wstr_next)
      {
            if (*spp == s)
            {
                  *spp = s->wstr_next;
                  free(s);
                  --hash_load;
                  return;
            }
      }

      /*
       * should never reach here!
       */
      fatal_raw("attempted to free non-existent wstring (bug)");
}


/*
 * NAME
 *    wstr_catenate - join two strings
 *
 * SYNOPSIS
 *    wstring_ty *wstr_catenate(wstring_ty *, wstring_ty *);
 *
 * DESCRIPTION
 *    The wstr_catenate function is used to concatenate two strings to form a
 *    new string.
 *
 * RETURNS
 *    wstring_ty* - a pointer to a string in dynamic memory.  Use wstr_free when
 *    finished with.
 *
 * CAVEAT
 *    The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_catenate(wstring_ty *s1, wstring_ty *s2)
{
      static wchar_t    *tmp;
      static size_t     tmplen;
      wstring_ty  *s;
      size_t            length;

      length = s1->wstr_length + s2->wstr_length;
      if (length > tmplen)
      {
            tmplen = length;
            tmp = mem_change_size(tmp, tmplen * sizeof(wchar_t));
      }
      memcpy(tmp, s1->wstr_text, s1->wstr_length * sizeof(wchar_t));
      memcpy(tmp + s1->wstr_length, s2->wstr_text, s2->wstr_length * sizeof(wchar_t));
      s = wstr_n_from_wc(tmp, length);
      return s;
}


/*
 * NAME
 *    wstr_cat_three - join three strings
 *
 * SYNOPSIS
 *    wstring_ty *wstr_cat_three(wstring_ty *, wstring_ty *, wstring_ty *);
 *
 * DESCRIPTION
 *    The wstr_cat_three function is used to concatenate three strings to form
 *    a new string.
 *
 * RETURNS
 *    wstring_ty* - a pointer to a string in dynamic memory.  Use wstr_free when
 *    finished with.
 *
 * CAVEAT
 *    The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_cat_three(wstring_ty *s1, wstring_ty *s2, wstring_ty *s3)
{
      static wchar_t    *tmp;
      static size_t     tmplen;
      wstring_ty  *s;
      size_t            length;

      length = s1->wstr_length + s2->wstr_length + s3->wstr_length;
      if (tmplen < length)
      {
            tmplen = length;
            tmp = mem_change_size(tmp, tmplen * sizeof(wchar_t));
      }
      memcpy(tmp, s1->wstr_text, s1->wstr_length * sizeof(wchar_t));
      memcpy
      (
            tmp + s1->wstr_length,
            s2->wstr_text,
            s2->wstr_length * sizeof(wchar_t)
      );
      memcpy
      (
            tmp + s1->wstr_length + s2->wstr_length,
            s3->wstr_text,
            s3->wstr_length * sizeof(wchar_t)
      );
      s = wstr_n_from_wc(tmp, length);
      return s;
}


wstring_ty *
wstr_to_upper(const wstring_ty *ws)
{
      static wchar_t    *buffer;
      static size_t     buflen;
      size_t            j;

      if (ws->wstr_length > buflen)
      {
            buflen = ws->wstr_length;
            buffer = mem_change_size(buffer, buflen * sizeof(wchar_t));
      }
      language_human();
      for (j = 0; j < ws->wstr_length; ++j)
      {
            wchar_t           c;

            c = ws->wstr_text[j];
            if (iswlower(c))
                  c = towupper(c);
            buffer[j] = c;
      }
      language_C();
      return wstr_n_from_wc(buffer, ws->wstr_length);
}


wstring_ty *
wstr_to_lower(const wstring_ty *ws)
{
      static wchar_t    *buffer;
      static size_t     buflen;
      size_t            j;

      if (ws->wstr_length > buflen)
      {
            buflen = ws->wstr_length;
            buffer = mem_change_size(buffer, buflen * sizeof(wchar_t));
      }
      language_human();
      for (j = 0; j < ws->wstr_length; ++j)
      {
            wchar_t           c;

            c = ws->wstr_text[j];
            if (iswupper(c))
                  c = towlower(c);
            buffer[j] = c;
      }
      language_C();
      return wstr_n_from_wc(buffer, ws->wstr_length);
}


wstring_ty *
wstr_to_ident(const wstring_ty *ws)
{
      static wchar_t    *buffer;
      static size_t     buflen;
      size_t            j;

      if (ws->wstr_length == 0)
            return wstr_from_c("_");
      if (ws->wstr_length > buflen)
      {
            buflen = ws->wstr_length;
            buffer = mem_change_size(buffer, buflen * sizeof(wchar_t));
      }
      language_human();
      for (j = 0; j < ws->wstr_length; ++j)
      {
            wchar_t           c;

            c = ws->wstr_text[j];
            if (!iswalnum(c))
                  c = '_';
            buffer[j] = c;
      }
      if (iswdigit(buffer[0]))
            buffer[0] = '_';
      language_C();
      return wstr_n_from_wc(buffer, ws->wstr_length);
}


wstring_ty *
str_to_wstr(string_ty *s)
{
      return wstr_n_from_c(s->str_text, s->str_length);
}


string_ty *
wstr_to_str(wstring_ty *ws)
{
      char        *text;
      size_t            length;

      wstr_to_mbs(ws, &text, &length);
      return str_n_from_c(text, length);
      trace(("to silence warnings\n"));
}


/*
 * NAME
 *    wstr_equal - test equality of strings
 *
 * SYNOPSIS
 *    int wstr_equal(wstring_ty *, wstring_ty *);
 *
 * DESCRIPTION
 *    The wstr_equal function is used to test if two strings are equal.
 *
 * RETURNS
 *    int; zero if the strings are not equal, nonzero if the strings are
 *    equal.
 *
 * CAVEAT
 *    This function is implemented as a macro in strings.h
 */

#ifdef wstr_equal
#undef wstr_equal
#endif

int
wstr_equal(wstring_ty *s1, wstring_ty *s2)
{
      return (s1 == s2);
}

Generated by  Doxygen 1.6.0   Back to index