Code Search for Developers
 
 
  

splitter.c from TextIndexNG at Krugle


Show splitter.c syntax highlighted

/*
 TextIndexNG V 3                
 The next generation TextIndex for Zope

 This software is governed by a license. See
 LICENSE.txt for the terms of this license.
*/

#include "Python.h"
#include <ctype.h>
#include "dict.h"

#ifndef min
#define min(a,b) ((a)<(b)?(a):(b))
#endif

/*
** struct for splitter
*/

typedef struct
{
    PyObject_HEAD
    PyObject *list;                    // Python list holding the words produced by the most recent split() call
    hashtable *cache;                  // classification cache for character codes outside 0..255 (see inode_get/inode_set)
    unsigned char small_cache[256];    // classification cache for character codes 0..255; UNSET until classified
    int max_len;                       // emitted words are truncated to at most this many characters
    int single_chars;                  // non-zero: keep one-character words; zero: drop them
    int casefolding;                   // non-zero: lowercase the characters of the text being split
}
Splitter;


/*
** Node handling for hashtable.
** 
** We maintain a hashtable to store (key,value) pairs
** where the keys are the int values of characters (including unicode
** characters). The value is a flag indicating whether the character
** is an alphanumeric character, a valid word separator or neither.
** For characters < 256 we maintain a simple array for performance
** reasons.
** 
** The purpose of the hashtable is to cache the results of calls
** like Py_UNICODE_ISALNUM().
*/


#define MISS 0                       // returned by inode_get() when a character has no cached classification

#define IS_TRASH 1                   // neither alphanumeric nor a configured separator; ends a word
#define IS_ALNUM 2                   // alphanumeric character
#define IS_SEPARATOR 3               // separator character configured through the constructor
#define UNSET 255                    // initial content of a small_cache slot (not yet classified)

#define HASHTABLESIZE 4096           // size passed to hashtable_new_txng(); also the modulus used by inode_hash()


// A node wraps a single int. The same node type is used for both the key
// (character code) and the value (classification flag) of a cache entry.

typedef struct
{
    int value;
}
INODE ;


// Allocate a new node holding v.
//
// Returns the node, or NULL if the memory could not be allocated.
// The node is obtained from malloc() and is released with free()
// (see del_inode).

INODE * new_inode(int v)
{
    INODE * n = malloc(sizeof *n);

    if (n == NULL)
        return NULL;

    n->value = v;
    return n;
}

// Release a node that was obtained from malloc().
// Always reports success (1).

int del_inode(void * node)
{
    const int released = 1;

    free(node);
    return released;
}

// Three-way comparison of two nodes by their int value.
// Returns -1, 0 or 1 when n1 is less than, equal to or greater than n2.

int inode_cmp(void *n1, void *n2)
{
    const int lhs = ((INODE *) n1)->value;
    const int rhs = ((INODE *) n2)->value;

    return (lhs > rhs) - (lhs < rhs);
}

// Look up the cached classification of character code v.
//
// Codes 0..255 are answered from self->small_cache; every other code is
// looked up in the self->cache hashtable. Returns the cached flag, or MISS
// if the code has not been classified yet.

int inode_get(Splitter * self, int v)
{
    INODE * result;
    INODE key;

    // The lower bound matters: a plain char holding a byte >= 0x80 converts
    // to a negative int where char is signed, and a negative value must not
    // be used as an index into small_cache.
    if (v >= 0 && v < 256) {
        unsigned char flag = self->small_cache[v];

        return flag == UNSET ? MISS : flag;
    }

    // Only the key's int value is needed for the search, so a stack node
    // is sufficient. The result is read as the value node that inode_set()
    // stored for this key.
    key.value = v;
    result = hashtable_search(self->cache, &key);

    return result ? result->value : MISS;
}

// Remember classification flag v for character code k.
//
// Codes 0..255 are stored in self->small_cache; every other code is stored
// as a freshly allocated (key node, value node) pair in the self->cache
// hashtable. Returns 1 on success and 0 if a node could not be allocated
// (nothing is stored in that case).

int inode_set(Splitter * self, int k, int v)
{
    INODE * key;
    INODE * value;

    // A negative k (e.g. a sign-extended char) must not index small_cache.
    if (k >= 0 && k < 256) {
        self->small_cache[k] = v;
        return 1;
    }

    key = new_inode(k);
    value = new_inode(v);

    if (key == NULL || value == NULL) {
        free(key);
        free(value);
        return 0;
    }

    // NOTE(review): the result of hashtable_insert_txng() is not inspected,
    // as before; its failure semantics are not visible from this file.
    hashtable_insert_txng(self->cache, key, value, 0);

    return 1;
}

// calculate the hash value for a node 

unsigned inode_hash(void *node)
{
    int v;

    v = ((INODE *) node)->value;
    return  v % HASHTABLESIZE;
}

/* 
** Here starts the splitter dance 
*/

/*
** tp_dealloc: release everything a Splitter owns, then the object itself.
*/
static void
Splitter_dealloc(Splitter *self)
{
    /* cache can be NULL if creating the hashtable failed in the constructor.
       NOTE(review): the second argument (1) presumably asks the hashtable to
       delete its entries as well -- confirm against hashtable.h. */
    if (self->cache != NULL)
        hashtable_destroy(self->cache, 1);

    Py_XDECREF(self->list);

    /* The object was allocated with PyObject_NEW, so it has to be released
       with the matching PyObject_Del rather than a PyMem_* call. */
    PyObject_Del(self);
}


/*
** Dummy functions 
*/

/* sq_concat slot: concatenation is not supported; always raises TypeError. */
static PyObject *
Splitter_concat(Splitter *self, PyObject *other)
{
    PyObject *no_result = NULL;

    PyErr_SetString(PyExc_TypeError, "Cannot concatenate Splitters.");
    return no_result;
}

/*
** sq_repeat slot: repetition is not supported; always raises TypeError.
**
** The count is declared as int because the function is installed in the
** sequence table through an intargfunc cast; being called through a
** pointer of a different parameter type would be undefined behaviour.
*/
static PyObject *
Splitter_repeat(Splitter *self, int n)
{
    PyErr_SetString(PyExc_TypeError, "Cannot repeat Splitters.");
    return NULL;
}

/* sq_slice slot: slicing is not supported; always raises TypeError. */
static PyObject *
Splitter_slice(Splitter *self, int i, int j)
{
    PyObject *no_result = NULL;

    PyErr_SetString(PyExc_TypeError, "Cannot slice Splitters.");
    return no_result;
}

/*
** sq_item slot (__getitem__ support): return the i-th word of the most
** recent split as a new reference, or NULL with the exception raised by
** PyList_GetItem.
*/

static PyObject *
Splitter_item(Splitter *self, int i)
{
    PyObject *borrowed = PyList_GetItem(self->list, i);

    if (borrowed == NULL)
        return NULL;            /* exception already set by PyList_GetItem */

    Py_INCREF(borrowed);        /* the list only lends us its reference */
    return borrowed;
}

/*
** Split functions for strings and unicode strings
*/

int splitUnicodeString(Splitter *self,PyObject *doc);
int splitString(Splitter *self,PyObject *doc);

/*
** Return a new str object with the same bytes as src whose buffer belongs
** to no other object, so that it may be modified in place. Creating the
** string from a NULL buffer is what guarantees this; creating it from the
** data directly may hand back a shared object.
*/
static PyObject *
Splitter_private_string(PyObject *src)
{
    int n = (int) PyString_GET_SIZE(src);
    PyObject *copy = PyString_FromStringAndSize(NULL, n);

    if (copy != NULL && n > 0)
        memcpy(PyString_AS_STRING(copy), PyString_AS_STRING(src), n);
    return copy;
}

/*
** Unicode counterpart of Splitter_private_string().
*/
static PyObject *
Splitter_private_unicode(PyObject *src)
{
    int n = (int) PyUnicode_GET_SIZE(src);
    PyObject *copy = PyUnicode_FromUnicode(NULL, n);

    if (copy != NULL && n > 0)
        memcpy(PyUnicode_AS_UNICODE(copy), PyUnicode_AS_UNICODE(src),
               n * sizeof(Py_UNICODE));
    return copy;
}

/*
** split(doc [, encoding]) -- split doc into words.
**
** doc must be a str or unicode object. A str is decoded with `encoding`
** (default "iso-8859-15") and split as unicode, unless encoding is "" or
** "ascii", in which case the bytes are split directly. The words replace
** the previous content of self->list.
**
** Returns a new reference to self->list, or NULL with an exception set.
*/
static PyObject *
Splitter_split(Splitter *self, PyObject *args)
{
    PyObject *doc;
    PyObject *work = NULL;      /* private, writable copy of the text */
    char *encoding = "iso-8859-15";

    Py_XDECREF(self->list);
    self->list = PyList_New(0);
    if (self->list == NULL)
        return NULL;

    if (! (PyArg_ParseTuple(args,"O|s",&doc, &encoding))) return NULL;

    /* The split routines may lowercase their input in place, so they are
       always handed a private copy -- never the caller's (immutable)
       object and never an object the interpreter may share. */
    if (PyString_Check(doc)) {
        if (strlen(encoding) == 0 || !strcmp(encoding,"ascii")) {
            if (! (work = Splitter_private_string(doc)))
                return NULL;
            splitString(self, work);
        } else {
            PyObject *decoded = PyUnicode_FromEncodedObject(doc, encoding, "strict");

            if (! decoded) {
                PyErr_SetString(PyExc_UnicodeError,"unicode conversion failed (maybe wrong encoding parameter)");
                return NULL;
            }

            work = Splitter_private_unicode(decoded);
            Py_DECREF(decoded);
            if (! work)
                return NULL;
            splitUnicodeString(self, work);
        }
    } else if (PyUnicode_Check(doc)) {
        if (! (work = Splitter_private_unicode(doc)))
            return NULL;
        splitUnicodeString(self, work);
    } else {
        PyErr_SetString(PyExc_TypeError, "first argument must be  string or unicode");
        return NULL;
    }

    Py_DECREF(work);

    /* Building or appending a word may have failed while splitting; do not
       return a result with an exception still pending. */
    if (PyErr_Occurred())
        return NULL;

    Py_INCREF(self->list);
    return self->list;
}


/* 
** indexes(word) -- return a new list with the positions at which `word`
** occurs in the list produced by the most recent split.
**
** Words are compared with PyUnicode_Compare(). Returns NULL with an
** exception set if a comparison or an allocation fails.
*/

static PyObject *
Splitter_indexes(Splitter *self, PyObject *args)
{
    int i, size;
    PyObject *word=NULL, *r=NULL;

    if (! (PyArg_ParseTuple(args,"O",&word)))
        return NULL;
    if (! (r=PyList_New(0)))
        return NULL;

    /* No word list at all: nothing can match. */
    if (self->list == NULL)
        return r;

    size = PyList_Size(self->list);
    if (size < 0)
        goto fail;

    for (i=0;i<size;i++) {
        PyObject *item = PyList_GET_ITEM(self->list,i);
        int cmp = PyUnicode_Compare(word,item);

        /* -1 means either "less than" or "error"; only the pending
           exception tells the two apart. */
        if (cmp == -1 && PyErr_Occurred())
            goto fail;

        if (cmp == 0) {
            PyObject *index = PyInt_FromLong(i);
            int rc;

            if (!index)
                goto fail;

            /* PyList_Append takes its own reference, so ours is dropped. */
            rc = PyList_Append(r,index);
            Py_DECREF(index);
            if (rc < 0)
                goto fail;
        }
    }

    return r;

fail:
    Py_DECREF(r);
    return NULL;
}

/*
** sq_length slot: number of words in the list produced by the most
** recent split (whatever PyList_Size reports for self->list).
*/

static int
Splitter_length(Splitter *self)
{
    PyObject *words = self->list;

    return PyList_Size(words);
}

/*
** class definitions for splitters object
*/

/* Sequence protocol table for Splitter objects. Length and item access are
   backed by self->list; concat, repeat and slice are stubs that raise
   TypeError; item and slice assignment are not provided. */
static PySequenceMethods Splitter_as_sequence = {
            (inquiry)Splitter_length,        /*sq_length*/
            (binaryfunc)Splitter_concat,     /*sq_concat*/
            (intargfunc)Splitter_repeat,     /*sq_repeat*/
            (intargfunc)Splitter_item,       /*sq_item*/
            (intintargfunc)Splitter_slice,   /*sq_slice*/
            (intobjargproc)0,                    /*sq_ass_item*/
            (intintobjargproc)0,                 /*sq_ass_slice*/
        };


/* Methods exposed on Splitter instances; looked up by Splitter_getattr().
   Note that split() also accepts an optional second argument, the encoding
   (its argument format is "O|s"), which the docstring does not mention. */
static struct PyMethodDef Splitter_methods[] =
    {
        { "split", (PyCFunction) Splitter_split, METH_VARARGS,
            "split(doc) -- Split string in one run"
        },
        { "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
          "indexes(word) -- Return a list of the indexes of word in the sequence",
        },
        { NULL, NULL }		/* sentinel */
    };


/* tp_getattr slot: resolve `name` against the Splitter_methods table. */
static PyObject *
Splitter_getattr(Splitter *self, char *name)
{
    PyObject *owner = (PyObject *) self;

    return Py_FindMethod(Splitter_methods, owner, name);
}

/* Docstring of the Splitter type. */
static char SplitterType__doc__[] = "splitter instance for strings or unicode strings";

/* Type object for Splitter instances. Only deallocation, attribute lookup
   (tp_getattr) and the sequence protocol are provided; every other slot is
   left empty. PyObject_HEAD_INIT(NULL) leaves the type's own ob_type unset
   in this static initializer. */
static PyTypeObject SplitterType = {
                                       PyObject_HEAD_INIT(NULL)
                                       0,                                 /*ob_size*/
                                       "Splitter",                    /*tp_name*/
                                       sizeof(Splitter),              /*tp_basicsize*/
                                       0,                                 /*tp_itemsize*/
                                       /* methods */
                                       (destructor)Splitter_dealloc,  /*tp_dealloc*/
                                       (printfunc)0,                      /*tp_print*/
                                       (getattrfunc)Splitter_getattr, /*tp_getattr*/
                                       (setattrfunc)0,                    /*tp_setattr*/
                                       (cmpfunc)0,                        /*tp_compare*/
                                       (reprfunc)0,                       /*tp_repr*/
                                       0,                                 /*tp_as_number*/
                                       &Splitter_as_sequence,         /*tp_as_sequence*/
                                       0,                                 /*tp_as_mapping*/
                                       (hashfunc)0,                       /*tp_hash*/
                                       (ternaryfunc)0,                    /*tp_call*/
                                       (reprfunc)0,                       /*tp_str*/

                                       /* Space for future expansion */
                                       0L,0L,0L,0L,
                                       SplitterType__doc__ /* Documentation string */
                                   };

/*
** Return the classification flag (IS_ALNUM, IS_SEPARATOR or IS_TRASH) of a
** byte, consulting the splitter's cache first and filling it on a miss.
** The byte is taken as unsigned char so that it is a valid argument for
** isalnum() and a non-negative cache key.
*/
static int
classify_byte(Splitter *self, unsigned char ch)
{
    int flag = inode_get(self, ch);

    if (flag == MISS) {
        flag = isalnum(ch) ? IS_ALNUM : IS_TRASH;
        inode_set(self, ch, flag);
    }
    return flag;
}

/*
** Append the word made of the n bytes at `text` to self->list, truncated
** to self->max_len and lowercased when casefolding is enabled.
** Returns 1 on success, 0 with a Python exception set on failure.
*/
static int
append_string_word(Splitter *self, const char *text, int n)
{
    PyObject *word;
    char *out;
    int k, rc;

    n = min(n, self->max_len);

    /* A string created from a NULL buffer is the one kind of string the
       C API allows to be filled in after creation. */
    word = PyString_FromStringAndSize(NULL, n);
    if (word == NULL)
        return 0;

    out = PyString_AS_STRING(word);
    for (k = 0; k < n; k++) {
        unsigned char ch = (unsigned char) text[k];

        out[k] = self->casefolding ? (char) tolower(ch) : (char) ch;
    }

    rc = PyList_Append(self->list, word);   /* the list takes its own reference */
    Py_DECREF(word);
    return rc == 0;
}

/*
** split a non-unicode string
**
** Scans the bytes of the str object `doc` and appends every word found to
** self->list. A word starts at a byte that is neither trash nor a
** separator. It ends at a trash byte, or at a separator byte that is
** directly followed by a trash byte; a separator followed by anything else
** stays part of the word. Words shorter than two bytes are dropped unless
** single_chars is set. `doc` itself is not modified.
**
** Returns 1 on success, 0 with a Python exception set if a word could not
** be created or appended.
*/

int splitString(Splitter *self,PyObject *doc) 
{
    const char *text = PyString_AS_STRING(doc);
    int len = (int) PyString_GET_SIZE(doc);
    int i, inside_word = 0, start = 0;

    for (i = 0; i < len; i++) {
        int value = classify_byte(self, (unsigned char) text[i]);
        int word_ends = 0;

        if (!inside_word) {
            if (value != IS_TRASH && value != IS_SEPARATOR) {
                start = i;
                inside_word = 1;
            }
            continue;
        }

        if (value == IS_SEPARATOR) {
            /* Look one byte ahead. text[i+1] is readable even for the last
               byte because Python strings carry a terminating NUL. */
            word_ends = classify_byte(self, (unsigned char) text[i+1]) == IS_TRASH;
        } else if (value == IS_TRASH) {
            word_ends = 1;
        }

        if (word_ends) {
            if (i - start >= 2 || self->single_chars) {
                if (!append_string_word(self, text + start, i - start))
                    return 0;
            }
            start = i;
            inside_word = 0;
        }
    }

    /* A word may run up to the very end of the text. */
    if (inside_word && (len - start >= 2 || self->single_chars)) {
        if (!append_string_word(self, text + start, len - start))
            return 0;
    }

    return 1;
}

/*
** split a unicode string
*/

int splitUnicodeString(Splitter *self,PyObject *doc)
{
    PyObject *word ;
    Py_UNICODE *s;
    int i, inside_word=0, start=0, len;
    register int value, next_value;

    s = ((PyUnicodeObject *) doc)->str;       // start of unicode string
    len = ((PyUnicodeObject *)doc)->length;


    for (i=0; i<len; i++,s++) {
        register Py_UNICODE c;

        c = *s;

        if (self->casefolding)
            *s = Py_UNICODE_TOLOWER(c);

        value = inode_get(self, c);

        if (value == MISS ) {
            // cache miss

            value = Py_UNICODE_ISALNUM(c) ? IS_ALNUM : IS_TRASH;
            inode_set(self, c, value);
        }

        if (!inside_word) {
            if (value != IS_TRASH ) {
                start = i;
                inside_word = 1;
            }
        } else {

            if (value == IS_SEPARATOR) {
                register Py_UNICODE next_c = *(s+1);

                next_value = inode_get(self, next_c);

                if (next_value == MISS ) {
                    // cache miss

                    next_value = Py_UNICODE_ISALNUM(next_c) ? IS_ALNUM : IS_TRASH;
                    inode_set(self, next_c, next_value);
                }
                
                if (next_value == IS_TRASH) { 
                    if (! (i-start<2 && ! self->single_chars)) {
                        word = Py_BuildValue("u#", s-(i-start), min(i-start, self->max_len));
                        PyList_Append(self->list, word);
                        Py_XDECREF(word);
                    }
                    start = i;
                    inside_word = 0;
                }
                
            }

            else if (value==IS_TRASH) {
                if (! (i-start<2 && ! self->single_chars)) {
                    word = Py_BuildValue("u#", s-(i-start), min(i-start, self->max_len));
                    PyList_Append(self->list, word);
                    Py_XDECREF(word);
                }
                start = i;
                inside_word = 0;
            }
        }
    }

    if (inside_word) {
        if (! (i-start<2 && ! self->single_chars)) {
            word = Py_BuildValue("u#", s-(i-start), min(i-start, self->max_len));
            PyList_Append(self->list, word);
            Py_XDECREF(word);
        }
    }

    return 1;
}


/* 
** constructor stuff
*/

/* Keyword names accepted by Splitter(), in the order of the "|siii" format. */
static char *splitter_args[] = {"separator","singlechar","maxlen","casefolding",NULL};


/*
** Splitter([separator][, singlechar][, maxlen][, casefolding])
**
** Create a new splitter.
**   separator   -- characters treated as word separators (default "")
**   singlechar  -- 0 or 1: keep one-character words (default 0)
**   maxlen      -- 1..128: words are truncated to this length (default 64)
**   casefolding -- 0 or 1: lowercase the text while splitting (default 1)
**
** Returns the new object, or NULL with an exception set (ValueError for
** an out-of-range argument).
*/
static PyObject *
newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
{
    int i, max_len=64, single_char=0, casefolding=1;
    Splitter *self=NULL;
    PyObject *list=NULL;
    hashtable *cache=NULL;
    char *separator = "";
    const unsigned char *sep;

    if (! (PyArg_ParseTupleAndKeywords(
                    args,
                    keywds,
                    "|siii",
                    splitter_args,
                    &separator,
                    &single_char,
                    &max_len,
                    &casefolding))) return NULL;

    if (casefolding<0 || casefolding>1) {
        PyErr_SetString(PyExc_ValueError,"casefolding must be 0 or 1");
        return NULL;
    }

    if (single_char<0 || single_char>1) {
        PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
        return NULL;
    }

    if (max_len<1 || max_len>128) {
        PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
        return NULL;
    }

    /* Acquire everything the object owns before creating the object, so
       that a half-initialised Splitter never exists. */
    if (! (list = PyList_New(0)))
        return NULL;

    cache = hashtable_new_txng(
                (dict_cmp_func) inode_cmp,
                (dict_hsh_func) inode_hash,
                (dict_del_func) del_inode,
                NULL,
                HASHTABLESIZE);

    /* NOTE(review): assumes hashtable_new_txng() reports failure by
       returning NULL -- confirm against hashtable.h. */
    if (cache == NULL) {
        Py_DECREF(list);
        return PyErr_NoMemory();
    }

    if (! (self = PyObject_NEW(Splitter, &SplitterType))) {
        hashtable_destroy(cache, 1);
        Py_DECREF(list);
        return NULL;
    }

    self->max_len            = max_len;
    self->single_chars       = single_char;
    self->casefolding        = casefolding;
    self->list               = list;
    self->cache              = cache;

    // mark every slot of the small cache as not yet classified
    for (i=0;i<256;i++)
        self->small_cache[i] = UNSET ;

    // Pre-classify the separator characters. They are read as unsigned
    // bytes so that bytes >= 0x80 index small_cache correctly.
    for (sep = (const unsigned char *) separator; *sep != '\0'; sep++)
        self->small_cache[*sep] = IS_SEPARATOR;


    return (PyObject*) self;
}

/* Module-level functions: only the Splitter() constructor, which accepts
   positional and keyword arguments. */
static struct PyMethodDef Splitter_module_methods[] =
    {
        { "Splitter", (PyCFunction)newSplitter, METH_VARARGS|METH_KEYWORDS,
            "Splitter([,maxlen][,singlechar][,casefolding][,separator]) "
            "-- Return a word splitter"
        },
        { NULL, NULL }
    };

/* Module docstring handed to Py_InitModule3() in initsplitter(). */
static char Splitter_module_documentation[] =
    "Split a string or a unicode string into a sequence of unicode strings"
    ;


void
initsplitter(void)
{
    Py_InitModule3("splitter", Splitter_module_methods,
                       Splitter_module_documentation);
}




See more files for this project here

TextIndexNG

The next generation fulltext index for the Zope Catalog.

For details see http://www.zope.org/Members/ajung/TextIndexNG/wiki/TextIndexNG

Project homepage: http://sourceforge.net/projects/textindexng
Programming language(s): C,Python
License: other

  tests/
    __init__.py
    testTXNGSplitter.py
  LICENSE
  __init__.py
  dict.c
  dict.h
  dict_private.h
  hashtable.c
  hashtable.h
  splitter.c