Main Page | Class Hierarchy | Data Structures | Directories | File List | Data Fields | Related Pages

load.c

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 2005
00005  *      Sleepycat Software.  All rights reserved.
00006  *
00007  * $Id: load.c,v 1.7 2005/10/14 12:50:37 bostic Exp $
00008  */
00009 
00010 #include "csv.h"
00011 #include "csv_local.h"
00012 #include "csv_extern.h"
00013 
00014 typedef enum { GL_OK, GL_EOF, GL_FAIL } getline_status;
00015 
00016 static int input_field_count(const char *, size_t, u_int32_t *);
00017 static getline_status
00018            input_getline(char **, size_t *, size_t *);
00019 static int input_put_alloc(u_int32_t **, size_t *, size_t, u_int32_t);
00020 static int input_set_offset(u_int32_t *, char *, size_t, u_int32_t);
00021 
00022 static input_fmt ifmt;                  /* Input format. */
00023 static u_long    record_count = 0;      /* Input record count for errors. */
00024 static u_long    version;               /* Version we're loading. */
00025 
00026 /*
00027  * input_load --
00028  *      Read the input file and load new records into the database.
00029  */
00030 int
00031 input_load(input_fmt ifmt_arg, u_long version_arg)
00032 {
00033         getline_status gtl_status;
00034         DBT key, data;
00035         DBC *cursor;
00036         u_int32_t field_count, primary_key, *put_line;
00037         size_t input_len, len, put_len;
00038         int is_first, ret;
00039         char *input_line;
00040 
00041         field_count = 0;                        /* Shut the compiler up. */
00042 
00043         /* ifmt and version are global to this file. */
00044         ifmt = ifmt_arg;
00045         version = version_arg;
00046 
00047         /*
00048          * The primary key for the database is a unique number.  Find out the
00049          * last unique number allocated in this database by opening a cursor
00050          * and fetching the last record.
00051          */
00052         if ((ret = db->cursor(db, NULL, &cursor, 0)) != 0) {
00053                 dbenv->err(dbenv, ret, "DB->cursor");
00054                 return (1);
00055         }
00056         memset(&key, 0, sizeof(key));
00057         memset(&data, 0, sizeof(data));
00058         if ((ret = cursor->c_get(cursor, &key, &data, DB_LAST)) != 0)
00059                 if (ret == DB_NOTFOUND)
00060                         primary_key = 0;
00061                 else {
00062                         dbenv->err(dbenv, ret, "DB->cursor: DB_LAST");
00063                         return (1);
00064                 }
00065         else
00066                 memcpy(&primary_key, key.data, sizeof(primary_key));
00067         if ((ret = cursor->c_close(cursor)) != 0) {
00068                 dbenv->err(dbenv, ret, "DBC->close");
00069                 return (1);
00070         }
00071         if (verbose)
00072                 dbenv->errx(dbenv,
00073                     "maximum existing record in the database is %lu",
00074                     (u_long)primary_key);
00075 
00076         key.data = &primary_key;
00077         key.size = sizeof(primary_key);
00078         input_line = NULL;
00079         put_line = NULL;
00080         input_len = put_len = 0;
00081 
00082         /*
00083          * See the README file for a description of the file input format.
00084          */
00085         for (is_first = 1; (gtl_status =
00086             input_getline(&input_line, &input_len, &len)) == GL_OK;) {
00087                 ++record_count;
00088                 if (verbose > 1)
00089                         dbenv->errx(dbenv, "reading %lu", (u_long)record_count);
00090 
00091                 /* The first non-blank line of the input is a column map. */
00092                 if (is_first) {
00093                         is_first = 0;
00094 
00095                         /* Count the fields we're expecting in the input. */
00096                         if (input_field_count(
00097                             input_line, len, &field_count) != 0)
00098                                 return (1);
00099 
00100                 }
00101 
00102                 /* Allocate room for the table of offsets. */
00103                 if (input_put_alloc(
00104                     &put_line, &put_len, len, field_count) != 0)
00105                         return (1);
00106 
00107                 /*
00108                  * Build the offset table and create the record we're
00109                  * going to store.
00110                  */
00111                 if (input_set_offset(put_line,
00112                     input_line, len, field_count) != 0)
00113                         return (1);
00114 
00115                 ++primary_key;
00116 
00117                 memcpy(put_line + (field_count + 2), input_line, len);
00118                 data.data = put_line;
00119                 data.size = (field_count + 2) * sizeof(u_int32_t) + len;
00120 
00121                 if (verbose > 1)
00122                         (void)entry_print(
00123                             data.data, data.size, field_count);
00124 
00125                 /* Load the key/data pair into the database. */
00126                 if ((ret = db->put(db, NULL, &key, &data, 0)) != 0) {
00127                         dbenv->err(dbenv, ret,
00128                             "DB->put: %lu", (u_long)primary_key);
00129                         return (1);
00130                 }
00131         }
00132 
00133         if (gtl_status != GL_EOF)
00134                 return (1);
00135 
00136         if (verbose)
00137                 dbenv->errx(dbenv,
00138                     "%lu records read from the input file into the database",
00139                     record_count);
00140 
00141         /*
00142          * This program isn't transactional, limit the window for corruption.
00143          */
00144         if ((ret = db->sync(db, 0)) != 0) {
00145                 dbenv->err(dbenv, ret, "DB->sync");
00146                 return (1);
00147         }
00148 
00149         return (0);
00150 }
00151 
00152 /*
00153  * input_getline --
00154  *      Read in a line of input into a buffer.
00155  */
00156 static getline_status
00157 input_getline(char **input_linep, size_t *input_lenp, size_t *lenp)
00158 {
00159         size_t input_len, len;
00160         int ch;
00161         char *input_line, *p, *endp;
00162 
00163         input_line = *input_linep;
00164         input_len = *input_lenp;
00165 
00166         p = input_line;
00167         endp = input_line + input_len;
00168 
00169         for (len = 0; (ch = getchar()) != EOF;) {
00170                 if (ch == '\0')         /* Strip <nul> (\000) bytes. */
00171                         continue;
00172                 switch (ifmt) {
00173                 case FORMAT_NL:
00174                         if (ch == '\n')
00175                                 goto end;
00176                         break;
00177                 case FORMAT_EXCEL:
00178                         /* Strip <nl> (\012) bytes. */
00179                         if (ch == '\n')
00180                                 continue;
00181                         /*
00182                          * <cr> (\015) bytes terminate lines.
00183                          * Skip blank lines.
00184                          */
00185                         if (ch == '\015') {
00186                                 if (len == 0)
00187                                         continue;
00188                                 goto end;
00189                         }
00190                 }
00191                 if (input_line == endp) {
00192                         input_len += 256;
00193                         input_len *= 2;
00194                         if ((input_line =
00195                             realloc(input_line, input_len)) == NULL) {
00196                                 dbenv->err(dbenv, errno,
00197                                     "unable to allocate %lu bytes for record",
00198                                     (u_long)input_len);
00199                                 return (GL_FAIL);
00200                         }
00201                         p = input_line;
00202                         endp = p + input_len;
00203                 }
00204 
00205                 if (isprint(ch)) {      /* Strip unprintables. */
00206                         *p++ = (char)ch;
00207                         ++len;
00208                 }
00209         }
00210 
00211 end:    if (len == 0)
00212                 return (GL_EOF);
00213 
00214         *lenp = len;
00215         *input_linep = input_line;
00216         *input_lenp = input_len;
00217 
00218         return (GL_OK);
00219 }
00220 
00221 /*
00222  * input_field_count --
00223  *      Count the fields in the line.
00224  */
00225 static int
00226 input_field_count(const char *line, size_t len, u_int32_t *field_countp)
00227 {
00228         u_int32_t field_count;
00229         int quoted;
00230 
00231         field_count = 1;
00232 
00233         /*
00234          * There are N-1 separators for N fields, that is, "a,b,c" is three
00235          * fields, with two comma separators.
00236          */
00237         switch (ifmt) {
00238         case FORMAT_EXCEL:
00239                 quoted = 0;
00240                 for (field_count = 1; len > 0; ++line, --len)
00241                         if (*line == '"')
00242                                 quoted = !quoted;
00243                         else if (*line == ',' && !quoted)
00244                                 ++field_count;
00245                 break;
00246         case FORMAT_NL:
00247                 for (field_count = 1; len > 0; ++line, --len)
00248                         if (*line == ',')
00249                                 ++field_count;
00250                 break;
00251         }
00252         *field_countp = field_count;
00253 
00254         if (verbose)
00255                 dbenv->errx(dbenv,
00256                     "input file made up of %lu fields", (u_int)field_count);
00257 
00258         return (0);
00259 }
00260 
00261 /*
00262  * input_put_alloc --
00263  *      Allocate room for the offset table plus the input.
00264  */
00265 static int
00266 input_put_alloc(u_int32_t **put_linep,
00267     size_t *put_lenp, size_t len, u_int32_t field_count)
00268 {
00269         size_t total;
00270 
00271         total = (field_count + 2) * sizeof(u_int32_t) + len;
00272         if (total > *put_lenp &&
00273             (*put_linep = realloc(*put_linep, *put_lenp += total)) == NULL) {
00274                 dbenv->err(dbenv, errno,
00275                     "unable to allocate %lu bytes for record",
00276                     (u_long)*put_lenp);
00277                 return (1);
00278         }
00279         return (0);
00280 }
00281 
00282 /*
00283  * input_set_offset --
00284  *      Build an offset table and record combination.
00285  */
00286 static int
00287 input_set_offset(u_int32_t *put_line,
00288     char *input_line, size_t len, u_int32_t field_count)
00289 {
00290         u_int32_t *op;
00291         int quoted;
00292         char *p, *endp;
00293 
00294         op = put_line;
00295 
00296         /* The first field is the version number. */
00297         *op++ = version;
00298 
00299         /*
00300          * Walk the input line, looking for comma separators.  It's an error
00301          * to have too many or too few fields.
00302          */
00303         *op++ = 0;
00304         quoted = 0;
00305         for (p = input_line, endp = input_line + len;; ++p) {
00306                 if (ifmt == FORMAT_EXCEL && p < endp) {
00307                         if (*p == '"')
00308                                 quoted = !quoted;
00309                         if (quoted)
00310                                 continue;
00311                 }
00312                 if (*p == ',' || p == endp) {
00313                         if (field_count == 0) {
00314                                 dbenv->errx(dbenv,
00315                                     "record %lu: too many fields in the record",
00316                                     record_count);
00317                                 return (1);
00318                         }
00319                         --field_count;
00320 
00321                         *op++ = (u_int32_t)(p - input_line) + 1;
00322 
00323                         if (verbose > 1)
00324                                 dbenv->errx(dbenv,
00325                                     "offset %lu: {%.*s}", op[-1],
00326                                     OFFSET_LEN(op, -2), input_line + op[-2]);
00327 
00328                         /*
00329                          * Don't insert a new field if the input lines ends
00330                          * in a comma.
00331                          */
00332                         if (p == endp || p + 1 == endp)
00333                                 break;
00334                 }
00335         }
00336         *op++ = (u_int32_t)(p - input_line);
00337 
00338         if (field_count != 0) {
00339                 dbenv->errx(dbenv,
00340                     "record %lu: not enough fields in the record",
00341                     record_count);
00342                 return (1);
00343         }
00344         memcpy(op, input_line, len);
00345 
00346         return (0);
00347 }

Generated on Sun Dec 25 12:14:25 2005 for Berkeley DB 4.4.16 by  doxygen 1.4.2