bmf

bmf (Bayesian Mail Filter) 0.9.4 fork + patches
git clone git://git.codemadness.org/bmf
Log | Files | Refs | README | LICENSE

commit 60b437c6d0bc19fc9f67ca8cfaf6cbfc50d47423
parent 4c3c79f49125ef555fba1df7f6cbab2c7b26ea00
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sat, 27 Oct 2018 19:31:30 +0200

merge dbh and dbtext (WIP)

Diffstat:
Makefile | 2--
bmf.c | 3+--
dbh.c | 474+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
dbh.h | 45+++++++++++++++++++++++++++++++++++++--------
dbtext.c | 490-------------------------------------------------------------------------------
dbtext.h | 49-------------------------------------------------
filt.h | 14+++++++++-----
7 files changed, 511 insertions(+), 566 deletions(-)

diff --git a/Makefile b/Makefile @@ -9,7 +9,6 @@ SRC = \ bmf.c \ dbg.c \ dbh.c \ - dbtext.c \ filt.c \ lex.c \ str.c \ @@ -21,7 +20,6 @@ HDR = \ config.h \ dbg.h \ dbh.h \ - dbtext.h \ filt.h \ lex.h \ str.h \ diff --git a/bmf.c b/bmf.c @@ -75,7 +75,6 @@ version(void) int main(int argc, char **argv) { - dbfmt_t dbfmt = db_text; char *dbname = NULL; bool_t rdonly; runmode_t mode = mode_normal; @@ -152,7 +151,7 @@ main(int argc, char **argv) } stats.extrema = (discrim_t *) malloc(stats.keepers * sizeof(discrim_t)); - pdb = dbh_open(dbfmt, "localhost", dbname, "", ""); + pdb = dbh_open(dbname); if (pdb == NULL) { fprintf(stderr, "%s: cannot open database\n", argv[0]); exit(2); diff --git a/dbh.c b/dbh.c @@ -16,7 +16,6 @@ #include "vec.h" #include "dbh.h" -#include "dbtext.h" /* * get count for new (incoming) word. there may be duplicate entries for the @@ -50,17 +49,472 @@ db_getnewcount(veciter_t * piter) } dbh_t * -dbh_open(dbfmt_t dbfmt, cpchar dbhost, cpchar dbname, cpchar dbuser, cpchar dbpass) +dbh_open(cpchar dbname) { - dbh_t *pdb; + dbhtext_t *pthis = NULL; + uint dirlen; + cpchar phome; + struct stat st; - switch (dbfmt) { - case db_text: - pdb = (dbh_t *) dbtext_db_open(dbhost, dbname, dbuser, dbpass); - break; - default: - break; + if ((pthis = malloc(sizeof(dbhtext_t))) == NULL) { + perror("malloc()"); + goto bail; } - return pdb; + pthis->close = dbtext_db_close; + pthis->opentable = dbtext_db_opentable; + + if (dbname != NULL && *dbname != '\0') { + dirlen = strlen(dbname); + if ((pthis->dir = strdup(dbname)) == NULL) { + perror("strdup()"); + goto bail; + } + if (dirlen && pthis->dir[dirlen - 1] == '/') + pthis->dir[--dirlen] = '\0'; + } else { + phome = getenv("HOME"); + if (phome == NULL || *phome == '\0') { + phome = "."; + } + dirlen = strlen(phome) + 5 + 1; + if ((pthis->dir = malloc(dirlen)) == NULL) + goto bail; + + /* NOTE: no truncation possible */ + snprintf(pthis->dir, dirlen, "%s/.bmf", phome); + } + + /* make sure config directory exists */ + if (stat(pthis->dir, &st) != 0) { + if (errno != ENOENT || + mkdir(pthis->dir, S_IRUSR | S_IWUSR | S_IXUSR) != 0) + goto bail; + } else { + if (!S_ISDIR(st.st_mode)) + goto bail; + } + + /* unveil(2), TODO: rework later */ + /* TODO: permission depending on mode */ + char listpath[PATH_MAX]; + snprintf(listpath, sizeof(listpath), "%s/%s", pthis->dir, "goodlist"); + if (unveil(listpath, "rw") == -1) { + perror("unveil()"); + exit(2); + } + snprintf(listpath, sizeof(listpath), "%s/%s", pthis->dir, "spamlist"); + if (unveil(listpath, "rw") == -1) { + perror("unveil()"); + exit(2); + } + if (unveil(NULL, NULL) == -1) { + perror("unveil()"); + exit(2); + } + + return (dbh_t *)pthis; + +bail: + if (pthis) { + if (pthis->dir) + free(pthis->dir); + free(pthis); + } + + return NULL; +} + +static void +dbtext_table_setsize(dbttext_t * pthis, uint nsize) +{ + uint nnewalloc; + rec_t *pnewitems; + uint n; + + if (nsize <= pthis->nalloc) + return; + + nnewalloc = pthis->nalloc * 2; + if (nnewalloc < nsize) + nnewalloc = nsize; + pnewitems = (rec_t *) realloc(pthis->pitems, nnewalloc * sizeof(rec_t)); + if (pnewitems == NULL) { + exit(2); + } + for (n = pthis->nitems; n < nsize; n++) { + str_create(&pnewitems[n].w); + pnewitems[n].n = 0; + } + pthis->pitems = pnewitems; + pthis->nalloc = nnewalloc; +} + +bool_t +dbtext_db_close(dbhtext_t * pthis) +{ + free(pthis->dir); + pthis->dir = NULL; + return true; +} + +dbt_t * +dbtext_db_opentable(dbhtext_t * pthis, cpchar table, bool_t rdonly) +{ + dbttext_t *ptable = NULL; + +#ifndef NOLOCK + struct flock lock; + +#endif /* ndef NOLOCK */ + char szpath[PATH_MAX]; + int flags, ret; + struct stat st; + char *pbegin; + char *pend; + rec_t r; + uint pos; + + if (pthis->dir == NULL) + goto bail; + + if ((ptable = malloc(sizeof(dbttext_t))) == NULL) { + perror("malloc()"); + goto bail; + } + ptable->close = dbtext_table_close; + ptable->mergeclose = dbtext_table_mergeclose; + ptable->unmergeclose = dbtext_table_unmergeclose; + ptable->getmsgcount = dbtext_table_getmsgcount; + ptable->getcount = dbtext_table_getcount; + ptable->fd = -1; + ptable->pbuf = NULL; + ptable->nmsgs = 0; + ptable->nalloc = 0; + ptable->nitems = 0; + ptable->pitems = NULL; + + ret = snprintf(szpath, sizeof(szpath), "%s/%s.txt", pthis->dir, table); + if (ret == -1 || (size_t)ret >= sizeof(szpath)) { + fprintf(stderr, "path truncation: %s/%s.txt", pthis->dir, table); + goto bail; + } + + flags = O_CREAT | (rdonly ? O_RDONLY : O_RDWR); + if ((ptable->fd = open(szpath, flags, 0644)) == -1) { + perror("open()"); + goto bail; + } + +#ifndef NOLOCK + memset(&lock, 0, sizeof(lock)); + lock.l_type = rdonly ? F_RDLCK : F_WRLCK; + lock.l_start = 0; + lock.l_whence = SEEK_SET; + lock.l_len = 0; + fcntl(ptable->fd, F_SETLKW, &lock); +#endif /* ndef NOLOCK */ + + if (fstat(ptable->fd, &st) != 0) { + perror("fstat()"); + goto bail_uc; + } + if (st.st_size == 0) { + return (dbt_t *) ptable; + } + ptable->pbuf = (char *) malloc(st.st_size); + if (ptable->pbuf == NULL) { + perror("malloc()"); + goto bail_uc; + } + if (read(ptable->fd, ptable->pbuf, st.st_size) != st.st_size) { + perror("read()"); + goto bail_fuc; + } + /* XXX: bogofilter compatibility */ + if (sscanf(ptable->pbuf, BOGOFILTER_HEADER, &ptable->nmsgs) != 1) { + goto bail_fuc; + } + pbegin = ptable->pbuf; + while (*pbegin != '\n') + pbegin++; + pbegin++; + + pos = 0; + while (pbegin < ptable->pbuf + st.st_size) { + pend = pbegin; + r.w.p = pbegin; + r.w.len = 0; + r.n = 0; + + while (*pend != '\n') { + if (pend >= ptable->pbuf + st.st_size) { + goto bail_fuc; + } + *pend = tolower(*pend); + if (*pend == ' ') { + r.w.len = (pend - pbegin); + r.n = strtol(pend + 1, NULL, 10); + } + pend++; + } + if (pend > pbegin && *pbegin != '#' && *pbegin != ';') { + if (r.w.len == 0 || r.w.len > MAXWORDLEN) { + fprintf(stderr, "dbh_loadfile: bad file format\n"); + goto bail_fuc; + } + dbtext_table_setsize(ptable, pos + 1); + ptable->pitems[pos++] = r; + ptable->nitems = pos; + } + pbegin = pend + 1; + } + + if (rdonly) { +#ifndef NOLOCK + lock.l_type = F_UNLCK; + fcntl(ptable->fd, F_SETLKW, &lock); +#endif /* ndef NOLOCK */ + close(ptable->fd); + ptable->fd = -1; + } + return (dbt_t *) ptable; + +bail_fuc: + free(ptable->pbuf); + +bail_uc: +#ifndef NOLOCK + lock.l_type = F_UNLCK; + fcntl(ptable->fd, F_SETLKW, &lock); +#endif /* ndef NOLOCK */ + + close(ptable->fd); + ptable->fd = -1; + +bail: + free(ptable); + return NULL; +} + +bool_t +dbtext_table_close(dbttext_t * pthis) +{ + struct flock lockall; + + free(pthis->pbuf); + pthis->pbuf = NULL; + free(pthis->pitems); + pthis->pitems = NULL; + + if (pthis->fd != -1) { +#ifndef NOLOCK + memset(&lockall, 0, sizeof(lockall)); + lockall.l_type = F_UNLCK; + lockall.l_start = 0; + lockall.l_whence = SEEK_SET; + lockall.l_len = 0; + fcntl(pthis->fd, F_SETLKW, &lockall); +#endif /* ndef NOLOCK */ + close(pthis->fd); + pthis->fd = -1; + } + return true; +} + +bool_t +dbtext_table_mergeclose(dbttext_t * pthis, vec_t * pmsg) +{ + /* note that we require both vectors to be sorted */ + + uint pos; + rec_t *prec; + veciter_t msgiter; + str_t *pmsgstr; + uint count; + char iobuf[IOBUFSIZE]; + char *p; + + if (pthis->fd == -1) { + return false; + } + ftruncate(pthis->fd, 0); + lseek(pthis->fd, 0, SEEK_SET); + + pthis->nmsgs++; + + p = iobuf; + p += sprintf(p, BOGOFILTER_HEADER, pthis->nmsgs); + + vec_first(pmsg, &msgiter); + pmsgstr = veciter_get(&msgiter); + + pos = 0; + while (pos < pthis->nitems || pmsgstr != NULL) { + int cmp = 0; + + prec = &pthis->pitems[pos]; + if (pmsgstr != NULL && pos < pthis->nitems) { + cmp = str_casecmp(&prec->w, pmsgstr); + } else { + /* we exhausted one list or the other (but not both) */ + cmp = (pos < pthis->nitems) ? -1 : 1; + } + if (cmp < 0) { + /* write existing str */ + count = prec->n; + strncpylwr(p, prec->w.p, prec->w.len); + p += prec->w.len; + *p++ = ' '; + p += sprintf(p, "%u\n", count); + + pos++; + } else if (cmp == 0) { + /* same str, merge and write sum */ + count = db_getnewcount(&msgiter); + count += prec->n; + strncpylwr(p, prec->w.p, prec->w.len); + p += prec->w.len; + *p++ = ' '; + p += sprintf(p, "%u\n", count); + + pos++; + veciter_next(&msgiter); + pmsgstr = veciter_get(&msgiter); + } else { /* cmp > 0 */ + /* write new str */ + count = db_getnewcount(&msgiter); + strncpylwr(p, pmsgstr->p, pmsgstr->len); + p += pmsgstr->len; + *p++ = ' '; + p += sprintf(p, "%u\n", count); + + veciter_next(&msgiter); + pmsgstr = veciter_get(&msgiter); + } + + if (p + TEXTDB_MAXLINELEN > (iobuf + 1)) { + write(pthis->fd, iobuf, p - iobuf); + p = iobuf; + } + } + if (p != iobuf) { + write(pthis->fd, iobuf, p - iobuf); + } + veciter_destroy(&msgiter); + return dbtext_table_close(pthis); +} + +bool_t +dbtext_table_unmergeclose(dbttext_t * pthis, vec_t * pmsg) +{ + /* note that we require both vectors to be sorted */ + + uint pos; + rec_t *prec; + veciter_t msgiter; + str_t *pmsgstr; + uint count; + char iobuf[IOBUFSIZE]; + char *p; + + if (pthis->fd == -1) { + return false; + } + ftruncate(pthis->fd, 0); + lseek(pthis->fd, 0, SEEK_SET); + + pthis->nmsgs--; + + p = iobuf; + p += sprintf(p, BOGOFILTER_HEADER, pthis->nmsgs); + + vec_first(pmsg, &msgiter); + pmsgstr = veciter_get(&msgiter); + + pos = 0; + while (pos < pthis->nitems || pmsgstr != NULL) { + int cmp = 0; + + prec = &pthis->pitems[pos]; + if (pmsgstr != NULL && pos < pthis->nitems) { + cmp = str_casecmp(&prec->w, pmsgstr); + } else { + /* we exhausted one list or the other (but not both) */ + cmp = (pos < pthis->nitems) ? -1 : 1; + } + if (cmp < 0) { + /* write existing str */ + count = prec->n; + strncpylwr(p, prec->w.p, prec->w.len); + p += prec->w.len; + *p++ = ' '; + p += sprintf(p, "%u\n", count); + + pos++; + } else if (cmp == 0) { + /* same str, merge and write difference */ + count = db_getnewcount(&msgiter); + count = (prec->n > count) ? (prec->n - count) : 0; + strncpylwr(p, prec->w.p, prec->w.len); + p += prec->w.len; + *p++ = ' '; + p += sprintf(p, "%u\n", count); + + pos++; + veciter_next(&msgiter); + pmsgstr = veciter_get(&msgiter); + } else { /* cmp > 0 */ + /* this should not happen, so write with count=0 */ + db_getnewcount(&msgiter); + count = 0; + strncpylwr(p, pmsgstr->p, pmsgstr->len); + p += pmsgstr->len; + *p++ = ' '; + p += sprintf(p, "%u\n", count); + + veciter_next(&msgiter); + pmsgstr = veciter_get(&msgiter); + } + + if (p + TEXTDB_MAXLINELEN > (iobuf + 1)) { + write(pthis->fd, iobuf, p - iobuf); + p = iobuf; + } + } + if (p != iobuf) { + write(pthis->fd, iobuf, p - iobuf); + } + veciter_destroy(&msgiter); + return dbtext_table_close(pthis); +} + +uint +dbtext_table_getmsgcount(dbttext_t * pthis) +{ + return pthis->nmsgs; +} + +uint +dbtext_table_getcount(dbttext_t * pthis, str_t * pword) +{ + int lo, hi, mid; + + if (pthis->nitems == 0) { + return 0; + } + hi = pthis->nitems - 1; + lo = -1; + while (hi - lo > 1) { + mid = (hi + lo) / 2; + if (str_casecmp(pword, &pthis->pitems[mid].w) <= 0) + hi = mid; + else + lo = mid; + } + + if (str_casecmp(pword, &pthis->pitems[hi].w) != 0) { + return 0; + } + return pthis->pitems[hi].n; } diff --git a/dbh.h b/dbh.h @@ -10,16 +10,14 @@ #ifndef _DBH_H #define _DBH_H -/* database formats */ -typedef enum { - db_text /* flat text */ -} dbfmt_t; +#define BOGOFILTER_HEADER "# bogofilter wordlist (format version A): %u\n" +#define TEXTDB_MAXLINELEN (MAXWORDLEN+32) /* record/field structure */ typedef struct _rec { str_t w; uint n; -} rec_t; +} rec_t; /* database table */ typedef struct _dbt dbt_t; @@ -38,11 +36,42 @@ struct _dbh { dbt_t *(*opentable) (dbh_t *, cpchar, bool_t); }; -dbh_t *dbh_open(dbfmt_t dbfmt, cpchar dbhost, cpchar dbname, cpchar dbuser, cpchar dbpass); +typedef struct _dbttext dbttext_t; +struct _dbttext +{ + bool_t (*close)(dbttext_t*); + bool_t (*mergeclose)(dbttext_t*,vec_t*); + bool_t (*unmergeclose)(dbttext_t*,vec_t*); + uint (*getmsgcount)(dbttext_t*); + uint (*getcount)(dbttext_t*,str_t*); -#define BOGOFILTER_HEADER "# bogofilter wordlist (format version A): %u\n" -#define TEXTDB_MAXLINELEN (MAXWORDLEN+32) + int fd; /* file descriptor, if currently open */ + char* pbuf; /* data buffer, if currently open */ + uint nmsgs; /* number of messages represented in list */ + uint nalloc; /* items alloced in pitems */ + uint nitems; /* items available */ + rec_t* pitems; /* growing vector of items */ +}; + +typedef struct _dbhtext dbhtext_t; +struct _dbhtext +{ + bool_t (*close)(dbhtext_t*); + dbt_t* (*opentable)(dbhtext_t*,cpchar,bool_t); + + char* dir; +}; uint db_getnewcount(veciter_t * piter); +dbh_t* dbtext_db_open(cpchar dbname); +bool_t dbtext_db_close( dbhtext_t* pthis ); +dbt_t* dbtext_db_opentable( dbhtext_t* pthis, cpchar table, bool_t rdonly ); + +bool_t dbtext_table_close( dbttext_t* pthis ); +bool_t dbtext_table_mergeclose( dbttext_t* pthis, vec_t* pmsg ); +bool_t dbtext_table_unmergeclose( dbttext_t* pthis, vec_t* pmsg ); +uint dbtext_table_getmsgcount( dbttext_t* pthis ); +uint dbtext_table_getcount( dbttext_t* pthis, str_t* pword ); + #endif /* ndef _DBH_H */ diff --git a/dbtext.c b/dbtext.c @@ -1,490 +0,0 @@ -/* $Id: dbtext.c,v 1.12 2002/10/19 09:59:35 tommy Exp $ */ - -/* - * Copyright (c) 2002 Tom Marshall <tommy@tig-grr.com> - * - * This program is free software. It may be distributed under the terms - * in the file LICENSE, found in the top level of the distribution. - * - * dbtext.c: flatfile database handler - */ - -#include "config.h" -#include "dbg.h" -#include "str.h" -#include "lex.h" -#include "vec.h" - -#include "dbh.h" -#include "dbtext.h" - -static void -dbtext_table_setsize(dbttext_t * pthis, uint nsize) -{ - uint nnewalloc; - rec_t *pnewitems; - uint n; - - if (nsize <= pthis->nalloc) - return; - - nnewalloc = pthis->nalloc * 2; - if (nnewalloc < nsize) - nnewalloc = nsize; - pnewitems = (rec_t *) realloc(pthis->pitems, nnewalloc * sizeof(rec_t)); - if (pnewitems == NULL) { - exit(2); - } - for (n = pthis->nitems; n < nsize; n++) { - str_create(&pnewitems[n].w); - pnewitems[n].n = 0; - } - pthis->pitems = pnewitems; - pthis->nalloc = nnewalloc; -} - -dbh_t * -dbtext_db_open(cpchar dbhost, cpchar dbname, cpchar dbuser, cpchar dbpass) -{ - dbhtext_t *pthis = NULL; - uint dirlen; - cpchar phome; - struct stat st; - - if ((pthis = malloc(sizeof(dbhtext_t))) == NULL) { - perror("malloc()"); - goto bail; - } - - pthis->close = dbtext_db_close; - pthis->opentable = dbtext_db_opentable; - - if (dbname != NULL && *dbname != '\0') { - dirlen = strlen(dbname); - if ((pthis->dir = strdup(dbname)) == NULL) { - perror("strdup()"); - goto bail; - } - if (dirlen && pthis->dir[dirlen - 1] == '/') - pthis->dir[--dirlen] = '\0'; - } else { - phome = getenv("HOME"); - if (phome == NULL || *phome == '\0') { - phome = "."; - } - dirlen = strlen(phome) + 5 + 1; - if ((pthis->dir = malloc(dirlen)) == NULL) - goto bail; - - /* NOTE: no truncation possible */ - snprintf(pthis->dir, dirlen, "%s/.bmf", phome); - } - - /* make sure config directory exists */ - if (stat(pthis->dir, &st) != 0) { - if (errno != ENOENT || - mkdir(pthis->dir, S_IRUSR | S_IWUSR | S_IXUSR) != 0) - goto bail; - } else { - if (!S_ISDIR(st.st_mode)) - goto bail; - } - - /* unveil(2), TODO: rework later */ - /* TODO: permission depending on mode */ - char listpath[PATH_MAX]; - snprintf(listpath, sizeof(listpath), "%s/%s", pthis->dir, "goodlist"); - if (unveil(listpath, "rw") == -1) { - perror("unveil()"); - exit(2); - } - snprintf(listpath, sizeof(listpath), "%s/%s", pthis->dir, "spamlist"); - if (unveil(listpath, "rw") == -1) { - perror("unveil()"); - exit(2); - } - if (unveil(NULL, NULL) == -1) { - perror("unveil()"); - exit(2); - } - - return (dbh_t *)pthis; - -bail: - if (pthis) { - if (pthis->dir) - free(pthis->dir); - free(pthis); - } - - return NULL; -} - -bool_t -dbtext_db_close(dbhtext_t * pthis) -{ - free(pthis->dir); - pthis->dir = NULL; - return true; -} - -dbt_t * -dbtext_db_opentable(dbhtext_t * pthis, cpchar table, bool_t rdonly) -{ - dbttext_t *ptable = NULL; - -#ifndef NOLOCK - struct flock lock; - -#endif /* ndef NOLOCK */ - char szpath[PATH_MAX]; - int flags, ret; - struct stat st; - char *pbegin; - char *pend; - rec_t r; - uint pos; - - if (pthis->dir == NULL) - goto bail; - - if ((ptable = malloc(sizeof(dbttext_t))) == NULL) { - perror("malloc()"); - goto bail; - } - ptable->close = dbtext_table_close; - ptable->mergeclose = dbtext_table_mergeclose; - ptable->unmergeclose = dbtext_table_unmergeclose; - ptable->getmsgcount = dbtext_table_getmsgcount; - ptable->getcount = dbtext_table_getcount; - ptable->fd = -1; - ptable->pbuf = NULL; - ptable->nmsgs = 0; - ptable->nalloc = 0; - ptable->nitems = 0; - ptable->pitems = NULL; - - ret = snprintf(szpath, sizeof(szpath), "%s/%s.txt", pthis->dir, table); - if (ret == -1 || (size_t)ret >= sizeof(szpath)) { - fprintf(stderr, "path truncation: %s/%s.txt", pthis->dir, table); - goto bail; - } - - flags = O_CREAT | (rdonly ? O_RDONLY : O_RDWR); - if ((ptable->fd = open(szpath, flags, 0644)) == -1) { - perror("open()"); - goto bail; - } - -#ifndef NOLOCK - memset(&lock, 0, sizeof(lock)); - lock.l_type = rdonly ? F_RDLCK : F_WRLCK; - lock.l_start = 0; - lock.l_whence = SEEK_SET; - lock.l_len = 0; - fcntl(ptable->fd, F_SETLKW, &lock); -#endif /* ndef NOLOCK */ - - if (fstat(ptable->fd, &st) != 0) { - perror("fstat()"); - goto bail_uc; - } - if (st.st_size == 0) { - return (dbt_t *) ptable; - } - ptable->pbuf = (char *) malloc(st.st_size); - if (ptable->pbuf == NULL) { - perror("malloc()"); - goto bail_uc; - } - if (read(ptable->fd, ptable->pbuf, st.st_size) != st.st_size) { - perror("read()"); - goto bail_fuc; - } - /* XXX: bogofilter compatibility */ - if (sscanf(ptable->pbuf, BOGOFILTER_HEADER, &ptable->nmsgs) != 1) { - goto bail_fuc; - } - pbegin = ptable->pbuf; - while (*pbegin != '\n') - pbegin++; - pbegin++; - - pos = 0; - while (pbegin < ptable->pbuf + st.st_size) { - pend = pbegin; - r.w.p = pbegin; - r.w.len = 0; - r.n = 0; - - while (*pend != '\n') { - if (pend >= ptable->pbuf + st.st_size) { - goto bail_fuc; - } - *pend = tolower(*pend); - if (*pend == ' ') { - r.w.len = (pend - pbegin); - r.n = strtol(pend + 1, NULL, 10); - } - pend++; - } - if (pend > pbegin && *pbegin != '#' && *pbegin != ';') { - if (r.w.len == 0 || r.w.len > MAXWORDLEN) { - fprintf(stderr, "dbh_loadfile: bad file format\n"); - goto bail_fuc; - } - dbtext_table_setsize(ptable, pos + 1); - ptable->pitems[pos++] = r; - ptable->nitems = pos; - } - pbegin = pend + 1; - } - - if (rdonly) { -#ifndef NOLOCK - lock.l_type = F_UNLCK; - fcntl(ptable->fd, F_SETLKW, &lock); -#endif /* ndef NOLOCK */ - close(ptable->fd); - ptable->fd = -1; - } - return (dbt_t *) ptable; - -bail_fuc: - free(ptable->pbuf); - -bail_uc: -#ifndef NOLOCK - lock.l_type = F_UNLCK; - fcntl(ptable->fd, F_SETLKW, &lock); -#endif /* ndef NOLOCK */ - - close(ptable->fd); - ptable->fd = -1; - -bail: - free(ptable); - return NULL; -} - -bool_t -dbtext_table_close(dbttext_t * pthis) -{ - struct flock lockall; - - free(pthis->pbuf); - pthis->pbuf = NULL; - free(pthis->pitems); - pthis->pitems = NULL; - - if (pthis->fd != -1) { -#ifndef NOLOCK - memset(&lockall, 0, sizeof(lockall)); - lockall.l_type = F_UNLCK; - lockall.l_start = 0; - lockall.l_whence = SEEK_SET; - lockall.l_len = 0; - fcntl(pthis->fd, F_SETLKW, &lockall); -#endif /* ndef NOLOCK */ - close(pthis->fd); - pthis->fd = -1; - } - return true; -} - -bool_t -dbtext_table_mergeclose(dbttext_t * pthis, vec_t * pmsg) -{ - /* note that we require both vectors to be sorted */ - - uint pos; - rec_t *prec; - veciter_t msgiter; - str_t *pmsgstr; - uint count; - char iobuf[IOBUFSIZE]; - char *p; - - if (pthis->fd == -1) { - return false; - } - ftruncate(pthis->fd, 0); - lseek(pthis->fd, 0, SEEK_SET); - - pthis->nmsgs++; - - p = iobuf; - p += sprintf(p, BOGOFILTER_HEADER, pthis->nmsgs); - - vec_first(pmsg, &msgiter); - pmsgstr = veciter_get(&msgiter); - - pos = 0; - while (pos < pthis->nitems || pmsgstr != NULL) { - int cmp = 0; - - prec = &pthis->pitems[pos]; - if (pmsgstr != NULL && pos < pthis->nitems) { - cmp = str_casecmp(&prec->w, pmsgstr); - } else { - /* we exhausted one list or the other (but not both) */ - cmp = (pos < pthis->nitems) ? -1 : 1; - } - if (cmp < 0) { - /* write existing str */ - count = prec->n; - strncpylwr(p, prec->w.p, prec->w.len); - p += prec->w.len; - *p++ = ' '; - p += sprintf(p, "%u\n", count); - - pos++; - } else if (cmp == 0) { - /* same str, merge and write sum */ - count = db_getnewcount(&msgiter); - count += prec->n; - strncpylwr(p, prec->w.p, prec->w.len); - p += prec->w.len; - *p++ = ' '; - p += sprintf(p, "%u\n", count); - - pos++; - veciter_next(&msgiter); - pmsgstr = veciter_get(&msgiter); - } else { /* cmp > 0 */ - /* write new str */ - count = db_getnewcount(&msgiter); - strncpylwr(p, pmsgstr->p, pmsgstr->len); - p += pmsgstr->len; - *p++ = ' '; - p += sprintf(p, "%u\n", count); - - veciter_next(&msgiter); - pmsgstr = veciter_get(&msgiter); - } - - if (p + TEXTDB_MAXLINELEN > (iobuf + 1)) { - write(pthis->fd, iobuf, p - iobuf); - p = iobuf; - } - } - if (p != iobuf) { - write(pthis->fd, iobuf, p - iobuf); - } - veciter_destroy(&msgiter); - return dbtext_table_close(pthis); -} - -bool_t -dbtext_table_unmergeclose(dbttext_t * pthis, vec_t * pmsg) -{ - /* note that we require both vectors to be sorted */ - - uint pos; - rec_t *prec; - veciter_t msgiter; - str_t *pmsgstr; - uint count; - char iobuf[IOBUFSIZE]; - char *p; - - if (pthis->fd == -1) { - return false; - } - ftruncate(pthis->fd, 0); - lseek(pthis->fd, 0, SEEK_SET); - - pthis->nmsgs--; - - p = iobuf; - p += sprintf(p, BOGOFILTER_HEADER, pthis->nmsgs); - - vec_first(pmsg, &msgiter); - pmsgstr = veciter_get(&msgiter); - - pos = 0; - while (pos < pthis->nitems || pmsgstr != NULL) { - int cmp = 0; - - prec = &pthis->pitems[pos]; - if (pmsgstr != NULL && pos < pthis->nitems) { - cmp = str_casecmp(&prec->w, pmsgstr); - } else { - /* we exhausted one list or the other (but not both) */ - cmp = (pos < pthis->nitems) ? -1 : 1; - } - if (cmp < 0) { - /* write existing str */ - count = prec->n; - strncpylwr(p, prec->w.p, prec->w.len); - p += prec->w.len; - *p++ = ' '; - p += sprintf(p, "%u\n", count); - - pos++; - } else if (cmp == 0) { - /* same str, merge and write difference */ - count = db_getnewcount(&msgiter); - count = (prec->n > count) ? (prec->n - count) : 0; - strncpylwr(p, prec->w.p, prec->w.len); - p += prec->w.len; - *p++ = ' '; - p += sprintf(p, "%u\n", count); - - pos++; - veciter_next(&msgiter); - pmsgstr = veciter_get(&msgiter); - } else { /* cmp > 0 */ - /* this should not happen, so write with count=0 */ - db_getnewcount(&msgiter); - count = 0; - strncpylwr(p, pmsgstr->p, pmsgstr->len); - p += pmsgstr->len; - *p++ = ' '; - p += sprintf(p, "%u\n", count); - - veciter_next(&msgiter); - pmsgstr = veciter_get(&msgiter); - } - - if (p + TEXTDB_MAXLINELEN > (iobuf + 1)) { - write(pthis->fd, iobuf, p - iobuf); - p = iobuf; - } - } - if (p != iobuf) { - write(pthis->fd, iobuf, p - iobuf); - } - veciter_destroy(&msgiter); - return dbtext_table_close(pthis); -} - -uint -dbtext_table_getmsgcount(dbttext_t * pthis) -{ - return pthis->nmsgs; -} - -uint -dbtext_table_getcount(dbttext_t * pthis, str_t * pword) -{ - int lo, hi, mid; - - if (pthis->nitems == 0) { - return 0; - } - hi = pthis->nitems - 1; - lo = -1; - while (hi - lo > 1) { - mid = (hi + lo) / 2; - if (str_casecmp(pword, &pthis->pitems[mid].w) <= 0) - hi = mid; - else - lo = mid; - } - - if (str_casecmp(pword, &pthis->pitems[hi].w) != 0) { - return 0; - } - return pthis->pitems[hi].n; -} diff --git a/dbtext.h b/dbtext.h @@ -1,49 +0,0 @@ -/* $Id: dbtext.h,v 1.3 2002/10/02 04:45:40 tommy Exp $ */ - -/* - * Copyright (c) 2002 Tom Marshall <tommy@tig-grr.com> - * - * This program is free software. It may be distributed under the terms - * in the file LICENSE, found in the top level of the distribution. - */ - -#ifndef _DBTEXT_H -#define _DBTEXT_H - -typedef struct _dbttext dbttext_t; -struct _dbttext -{ - bool_t (*close)(dbttext_t*); - bool_t (*mergeclose)(dbttext_t*,vec_t*); - bool_t (*unmergeclose)(dbttext_t*,vec_t*); - uint (*getmsgcount)(dbttext_t*); - uint (*getcount)(dbttext_t*,str_t*); - - int fd; /* file descriptor, if currently open */ - char* pbuf; /* data buffer, if currently open */ - uint nmsgs; /* number of messages represented in list */ - uint nalloc; /* items alloced in pitems */ - uint nitems; /* items available */ - rec_t* pitems; /* growing vector of items */ -}; - -typedef struct _dbhtext dbhtext_t; -struct _dbhtext -{ - bool_t (*close)(dbhtext_t*); - dbt_t* (*opentable)(dbhtext_t*,cpchar,bool_t); - - char* dir; -}; - -dbh_t* dbtext_db_open( cpchar dbhost, cpchar dbname, cpchar dbuser, cpchar dbpass ); -bool_t dbtext_db_close( dbhtext_t* pthis ); -dbt_t* dbtext_db_opentable( dbhtext_t* pthis, cpchar table, bool_t rdonly ); - -bool_t dbtext_table_close( dbttext_t* pthis ); -bool_t dbtext_table_mergeclose( dbttext_t* pthis, vec_t* pmsg ); -bool_t dbtext_table_unmergeclose( dbttext_t* pthis, vec_t* pmsg ); -uint dbtext_table_getmsgcount( dbttext_t* pthis ); -uint dbtext_table_getcount( dbttext_t* pthis, str_t* pword ); - -#endif /* ndef _DBTEXT_H */ diff --git a/filt.h b/filt.h @@ -10,17 +10,21 @@ #ifndef _FILT_H #define _FILT_H +#include "lex.h" +#include "str.h" +#include "vec.h" + typedef struct { - str_t key; - double prob; + str_t key; + double prob; } discrim_t; typedef struct { - double spamicity; - uint keepers; - discrim_t* extrema; + double spamicity; + uint keepers; + discrim_t* extrema; } stats_t; void statdump( stats_t* pstat, int fd );