Header And Logo

PostgreSQL
| The world's most advanced open source database.

Data Structures | Typedefs | Functions | Variables

unaccent.c File Reference

#include "postgres.h"
#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/builtins.h"
Include dependency graph for unaccent.c:

Go to the source code of this file.

Data Structures

struct  SuffixChar

Typedefs

typedef struct SuffixChar SuffixChar

Functions

static SuffixChar * placeChar (SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
static SuffixChar * initSuffixTree (char *filename)
static SuffixChar * findReplaceTo (SuffixChar *node, unsigned char *src, int srclen)
 PG_FUNCTION_INFO_V1 (unaccent_init)
Datum unaccent_init (PG_FUNCTION_ARGS)
 PG_FUNCTION_INFO_V1 (unaccent_lexize)
Datum unaccent_lexize (PG_FUNCTION_ARGS)
 PG_FUNCTION_INFO_V1 (unaccent_dict)
Datum unaccent_dict (PG_FUNCTION_ARGS)

Variables

 PG_MODULE_MAGIC

Typedef Documentation

typedef struct SuffixChar SuffixChar

Function Documentation

static SuffixChar* findReplaceTo ( SuffixChar *  node,
unsigned char *  src,
int  srclen 
) [static]

Definition at line 202 of file unaccent.c.

References SuffixChar::nextChar.

Referenced by unaccent_lexize().

{
    /*
     * Walk the tree one byte of src at a time.  Each level is an array
     * indexed by the byte value; the entry reached when exactly one byte
     * remains is returned.  If a level is missing (NULL) before the bytes
     * run out, there is no match and NULL is returned.
     */
    SuffixChar *cursor;

    for (cursor = node; cursor != NULL; cursor = cursor->nextChar, src++, srclen--)
    {
        /* select the entry for the current byte within this level */
        cursor += *src;

        if (srclen == 1)
            return cursor;
    }

    return NULL;
}

static SuffixChar* initSuffixTree ( char *  filename  )  [static]

Definition at line 78 of file unaccent.c.

References CopyErrorData(), CurrentMemoryContext, ereport, errcode(), errmsg(), ERROR, FlushErrorState(), get_tsearch_config_filename(), MemoryContextSwitchTo(), NULL, pfree(), PG_CATCH, PG_END_TRY, pg_mblen(), PG_RE_THROW, PG_TRY, placeChar(), ErrorData::sqlerrcode, t_isspace, tsearch_readline(), tsearch_readline_begin(), and tsearch_readline_end().

Referenced by unaccent_init().

{
    /*
     * Build the replacement tree from a rules file.
     *
     * The file name is resolved through get_tsearch_config_filename() with
     * "rules" as its second argument, then read line by line.  Every
     * well-formed "src trg" line is inserted with placeChar().  Returns the
     * root of the tree, which stays NULL if no line was inserted.
     *
     * rootSuffixTree and skip are declared volatile; both are written inside
     * the PG_TRY block and read after it.
     */
    SuffixChar *volatile rootSuffixTree = NULL;
    MemoryContext ccxt = CurrentMemoryContext;  /* context on entry; restored in PG_CATCH */
    tsearch_readline_state trst;
    volatile bool skip;

    filename = get_tsearch_config_filename(filename, "rules");
    if (!tsearch_readline_begin(&trst, filename))
        ereport(ERROR,
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
                 errmsg("could not open unaccent file \"%s\": %m",
                        filename)));

    /*
     * Outer loop: re-enter the PG_TRY block for as long as the previous
     * pass ended in a caught error (skip still true).  It ends once the
     * inner read loop runs to end of file and sets skip to false.
     */
    do
    {
        /*
         * pg_do_encoding_conversion() (called by tsearch_readline()) will
         * emit exception if it finds untranslatable characters in current
         * locale. We just skip such lines, continuing with the next.
         */
        skip = true;

        PG_TRY();
        {
            char       *line;

            while ((line = tsearch_readline(&trst)) != NULL)
            {
                /*
                 * The format of each line must be "src trg" where src and trg
                 * are sequences of one or more non-whitespace characters,
                 * separated by whitespace.  Whitespace at start or end of
                 * line is ignored.
                 */
                int         state;
                char       *ptr;
                char       *src = NULL;
                char       *trg = NULL;
                int         ptrlen;
                int         srclen = 0;
                int         trglen = 0;

                /*
                 * Parser states:
                 *   0  nothing seen yet (leading whitespace)
                 *   1  inside src
                 *   2  whitespace after src
                 *   3  inside trg
                 *   4  whitespace after trg
                 *  -1  bogus line (non-whitespace found after trg)
                 */
                state = 0;
                for (ptr = line; *ptr; ptr += ptrlen)
                {
                    /* advance by whole (possibly multibyte) characters */
                    ptrlen = pg_mblen(ptr);
                    /* ignore whitespace, but end src or trg */
                    if (t_isspace(ptr))
                    {
                        if (state == 1)
                            state = 2;
                        else if (state == 3)
                            state = 4;
                        continue;
                    }
                    switch (state)
                    {
                        case 0:
                            /* start of src */
                            src = ptr;
                            srclen = ptrlen;
                            state = 1;
                            break;
                        case 1:
                            /* continue src */
                            srclen += ptrlen;
                            break;
                        case 2:
                            /* start of trg */
                            trg = ptr;
                            trglen = ptrlen;
                            state = 3;
                            break;
                        case 3:
                            /* continue trg */
                            trglen += ptrlen;
                            break;
                        default:
                            /* bogus line format */
                            state = -1;
                            break;
                    }
                }

                /*
                 * Only states 3 and 4 mean both src and trg were seen; any
                 * other final state (including -1) silently drops the line.
                 */
                if (state >= 3)
                    rootSuffixTree = placeChar(rootSuffixTree,
                                               (unsigned char *) src, srclen,
                                               trg, trglen);

                pfree(line);
            }
            /* reached end of file without an error: stop the outer loop */
            skip = false;
        }
        PG_CATCH();
        {
            ErrorData  *errdata;
            MemoryContext ecxt;

            /* go back to the entry context before copying the error data */
            ecxt = MemoryContextSwitchTo(ccxt);
            errdata = CopyErrorData();
            if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
            {
                /*
                 * Swallow the error; skip is still true, so the outer loop
                 * runs the PG_TRY block again.
                 *
                 * NOTE(review): errdata was copied into ccxt and is not
                 * released on this path -- confirm whether a FreeErrorData()
                 * call is wanted here.
                 */
                FlushErrorState();
            }
            else
            {
                /* any other error: restore the previous context and propagate */
                MemoryContextSwitchTo(ecxt);
                PG_RE_THROW();
            }
        }
        PG_END_TRY();
    }
    while (skip);

    tsearch_readline_end(&trst);

    return rootSuffixTree;
}

PG_FUNCTION_INFO_V1 ( unaccent_init   ) 
PG_FUNCTION_INFO_V1 ( unaccent_dict   ) 
PG_FUNCTION_INFO_V1 ( unaccent_lexize   ) 
static SuffixChar* placeChar ( SuffixChar *  node,
unsigned char *  str,
int  lenstr,
char *  replaceTo,
int  replacelen 
) [static]

Definition at line 42 of file unaccent.c.

References elog, SuffixChar::nextChar, palloc(), SuffixChar::replacelen, SuffixChar::replaceTo, and WARNING.

Referenced by initSuffixTree().

{
    /*
     * Insert the byte string str (lenstr bytes) into the tree rooted at
     * node, attaching a private copy of replaceTo (replacelen bytes) to the
     * entry for its last byte.  A missing level is allocated on demand as a
     * zeroed array of 256 entries, one per possible byte value.  Returns the
     * (possibly newly allocated) level so the caller can link it in.
     */
    SuffixChar *slot;

    if (node == NULL)
    {
        node = palloc(sizeof(SuffixChar) * 256);
        memset(node, 0, sizeof(SuffixChar) * 256);
    }

    /* entry for the first byte of str within this level */
    slot = &node[*str];

    if (lenstr != 1)
    {
        /* more bytes to go: descend, creating the next level if needed */
        slot->nextChar = placeChar(slot->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
        return node;
    }

    if (slot->replaceTo != NULL)
    {
        /* an earlier rule already owns this source string; keep it */
        elog(WARNING, "duplicate TO argument, use first one");
        return node;
    }

    /* last byte: store a copy of the replacement (not NUL-terminated) */
    slot->replacelen = replacelen;
    slot->replaceTo = palloc(replacelen);
    memcpy(slot->replaceTo, replaceTo, replacelen);

    return node;
}

Datum unaccent_dict ( PG_FUNCTION_ARGS   ) 
Datum unaccent_init ( PG_FUNCTION_ARGS   ) 

Definition at line 221 of file unaccent.c.

References defGetString(), DefElem::defname, ereport, errcode(), errmsg(), ERROR, initSuffixTree(), lfirst, PG_GETARG_POINTER, PG_RETURN_POINTER, and pg_strcasecmp().

{
    /*
     * Dictionary init function.  Argument 0 is the list of dictionary
     * options.  The only accepted option is "Rules" (matched
     * case-insensitively), which must appear exactly once; its value is
     * handed to initSuffixTree().  The resulting tree root is returned as a
     * pointer.
     */
    List       *dictoptions = (List *) PG_GETARG_POINTER(0);
    SuffixChar *tree = NULL;
    bool        haveRules = false;
    ListCell   *cell;

    foreach(cell, dictoptions)
    {
        DefElem    *option = (DefElem *) lfirst(cell);

        /* anything other than Rules is rejected outright */
        if (pg_strcasecmp("Rules", option->defname) != 0)
        {
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                     errmsg("unrecognized Unaccent parameter: \"%s\"",
                            option->defname)));
        }

        /* a second Rules option is an error */
        if (haveRules)
        {
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                     errmsg("multiple Rules parameters")));
        }

        tree = initSuffixTree(defGetString(option));
        haveRules = true;
    }

    if (haveRules)
        PG_RETURN_POINTER(tree);

    ereport(ERROR,
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
             errmsg("missing Rules parameter")));

    PG_RETURN_POINTER(tree);
}

Datum unaccent_lexize ( PG_FUNCTION_ARGS   ) 

Definition at line 263 of file unaccent.c.

References findReplaceTo(), TSLexeme::flags, TSLexeme::lexeme, palloc(), palloc0(), pg_database_encoding_max_length(), PG_GETARG_INT32, PG_GETARG_POINTER, pg_mblen(), PG_RETURN_POINTER, SuffixChar::replacelen, and SuffixChar::replaceTo.

{
    /*
     * Dictionary lexize function.
     *
     * Argument 0 is the tree root produced by unaccent_init(), argument 1
     * the input string and argument 2 its length in bytes.  The input is
     * scanned one (possibly multibyte) character at a time; every character
     * with a rule in the tree is replaced by the rule's target.
     *
     * Returns NULL when no character was replaced.  Otherwise returns a
     * two-element TSLexeme array (second element zeroed) whose first
     * element carries the rewritten, NUL-terminated string and the
     * TSL_FILTER flag.
     *
     * The rules file allows a target of any number of characters, so the
     * output can be longer than len * pg_database_encoding_max_length().
     * That figure is therefore only the initial buffer size; the buffer is
     * enlarged with repalloc() before any append that would not fit.
     */
    SuffixChar *rootSuffixTree = (SuffixChar *) PG_GETARG_POINTER(0);
    char       *srcchar = (char *) PG_GETARG_POINTER(1);
    int32       len = PG_GETARG_INT32(2);
    char       *srcstart = srcchar;
    char       *trgbuf = NULL;      /* output buffer, allocated on first hit */
    Size        trgused = 0;        /* bytes written so far */
    Size        trgalloc = 0;       /* bytes allocated for trgbuf */
    TSLexeme   *res = NULL;

    while (srcchar - srcstart < len)
    {
        int         charlen = pg_mblen(srcchar);
        SuffixChar *node;
        const char *piece = NULL;   /* bytes to append for this character */
        int         piecelen = 0;

        node = findReplaceTo(rootSuffixTree, (unsigned char *) srcchar, charlen);
        if (node && node->replaceTo)
        {
            if (!res)
            {
                /* allocate res only if it's needed */
                Size        prefixlen = srcchar - srcstart;

                res = palloc0(sizeof(TSLexeme) * 2);
                res->flags = TSL_FILTER;

                trgalloc = (Size) len * pg_database_encoding_max_length() + 1 /* \0 */ ;
                trgbuf = palloc(trgalloc);

                /* copy the unchanged characters that preceded the first hit */
                if (prefixlen > 0)
                {
                    memcpy(trgbuf, srcstart, prefixlen);
                    trgused = prefixlen;
                }
            }
            piece = node->replaceTo;
            piecelen = node->replacelen;
        }
        else if (res)
        {
            /* no rule for this character, but output has started: copy it */
            piece = srcchar;
            piecelen = charlen;
        }

        if (piece != NULL)
        {
            /* keep room for this piece plus the trailing NUL */
            if (trgused + piecelen + 1 > trgalloc)
            {
                do
                {
                    trgalloc *= 2;
                } while (trgused + piecelen + 1 > trgalloc);
                trgbuf = repalloc(trgbuf, trgalloc);
            }
            memcpy(trgbuf + trgused, piece, piecelen);
            trgused += piecelen;
        }

        srcchar += charlen;
    }

    if (res)
    {
        trgbuf[trgused] = '\0';
        /* set last: repalloc() may have moved the buffer */
        res->lexeme = trgbuf;
    }

    PG_RETURN_POINTER(res);
}


Variable Documentation

Definition at line 23 of file unaccent.c.