GnuCash  2.6.99
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
import-parse.c
1 /*
2  * import-parse.c -- a generic "parser" API for importers.. Allows importers
3  * to parse dates and numbers, and provides a UI to ask for users to
4  * resolve ambiguities.
5  *
6  * Created by: Derek Atkins <[email protected]>
7  * Copyright (c) 2003 Derek Atkins <[email protected]>
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License as
11  * published by the Free Software Foundation; either version 2 of
12  * the License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, contact:
21  *
22  * Free Software Foundation Voice: +1-617-542-5942
23  * 51 Franklin Street, Fifth Floor Fax: +1-617-542-2652
24  * Boston, MA 02110-1301, USA [email protected]
25  */
26 
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <glib.h>
#include <limits.h>
#include <string.h>

/* For regex */
#include <sys/types.h>
#include <regex.h>

#include "gnc-engine.h"
#include "gnc-ui-util.h"

#include "import-parse.h"
42 
43 static QofLogModule log_module = GNC_MOD_IMPORT;
44 
45 /* numeric regular expressions */
46 static regex_t decimal_radix_regex;
47 static regex_t comma_radix_regex;
48 
49 /* date regular expressions */
50 static regex_t date_regex;
51 static regex_t date_mdy_regex;
52 static regex_t date_ymd_regex;
53 
54 static gboolean regex_compiled = FALSE;
55 
56 static void
57 compile_regex(void)
58 {
59  int flags = REG_EXTENDED;
60 
61  /* compile the numeric regular expressions */
62  regcomp(&decimal_radix_regex,
63  "^ *\\$?[+-]?\\$?[0-9]+ *$|^ *\\$?[+-]?\\$?[0-9]?[0-9]?[0-9]?(,[0-9][0-9][0-9])*(\\.[0-9]*)? *$|^ *\\$?[+-]?\\$?[0-9]+\\.[0-9]* *$", flags);
64  regcomp(&comma_radix_regex,
65  "^ *\\$?[+-]?\\$?[0-9]+ *$|^ *\\$?[+-]?\\$?[0-9]?[0-9]?[0-9]?(\\.[0-9][0-9][0-9])*(,[0-9]*)? *$|^ *\\$?[+-]?\\$?[0-9]+,[0-9]* *$", flags);
66 
67  /* compile the date-parsing regular expressions */
68  regcomp(&date_regex,
69  "^ *([0-9]+) *[-/.'] *([0-9]+) *[-/.'] *([0-9]+).*$|^ *([0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]).*$", flags);
70  regcomp(&date_mdy_regex, "([0-9][0-9])([0-9][0-9])([0-9][0-9][0-9][0-9])", flags);
71  regcomp(&date_ymd_regex, "([0-9][0-9][0-9][0-9])([0-9][0-9])([0-9][0-9])", flags);
72 
73  regex_compiled = TRUE;
74 }
75 
76 static gint
77 my_strntol(const char *str, int len)
78 {
79  gint res = 0;
80 
81  g_return_val_if_fail(str, 0);
82  g_return_val_if_fail(len, 0);
83 
84  while (len--)
85  {
86 
87  if (*str < '0' || *str > '9')
88  {
89  str++;
90  continue;
91  }
92 
93  res *= 10;
94  res += *(str++) - '0';
95  }
96  return res;
97 }
98 
99 /*
100  * based on a trio match (matches in spaces 1, 2, and 3), and a list
101  * of possible date formats, return the list of formats that this string
102  * could actually be.
103  */
104 static GncImportFormat
105 check_date_format(const char * str, regmatch_t *match, GncImportFormat fmts)
106 {
107  GncImportFormat res = 0;
108  int len0 = 0, len1 = 0, len2 = 0;
109  int val0 = 0, val1 = 0, val2 = 0;
110 
111  g_return_val_if_fail(match, res);
112  g_return_val_if_fail(fmts, res);
113 
114  /* Compute the lengths */
115  len0 = match[1].rm_eo - match[1].rm_so;
116  len1 = match[2].rm_eo - match[2].rm_so;
117  len2 = match[3].rm_eo - match[3].rm_so;
118 
119  /* compute the numeric values */
120  val0 = my_strntol(str + match[1].rm_so, len0);
121  val1 = my_strntol(str + match[2].rm_so, len1);
122  val2 = my_strntol(str + match[3].rm_so, len2);
123 
124  /* Filter out the possibilities. Hopefully only one will remain */
125 
126  if (val0 > 12) import_clear_flag(fmts, GNCIF_DATE_MDY);
127  if (val0 > 31) import_clear_flag(fmts, GNCIF_DATE_DMY);
128  if (val0 < 1)
129  {
130  import_clear_flag(fmts, GNCIF_DATE_DMY);
131  import_clear_flag(fmts, GNCIF_DATE_MDY);
132  }
133 
134  if (val1 > 12)
135  {
136  import_clear_flag(fmts, GNCIF_DATE_DMY);
137  import_clear_flag(fmts, GNCIF_DATE_YMD);
138  }
139  if (val1 > 31)
140  {
141  import_clear_flag(fmts, GNCIF_DATE_MDY);
142  import_clear_flag(fmts, GNCIF_DATE_YDM);
143  }
144 
145  if (val2 > 12) import_clear_flag(fmts, GNCIF_DATE_YDM);
146  if (val2 > 31) import_clear_flag(fmts, GNCIF_DATE_YMD);
147  if (val2 < 1)
148  {
149  import_clear_flag(fmts, GNCIF_DATE_YMD);
150  import_clear_flag(fmts, GNCIF_DATE_YDM);
151  }
152 
153  /* if we've got a 4-character year, make sure the value is greater
154  * than 1930 and less than 2100. XXX: be sure to fix this by 2100!
155  */
156  if (len0 == 4 && (val0 < 1930 || val0 > 2100))
157  {
158  import_clear_flag(fmts, GNCIF_DATE_YMD);
159  import_clear_flag(fmts, GNCIF_DATE_YDM);
160  }
161  if (len2 == 4 && (val2 < 1930 || val2 > 2100))
162  {
163  import_clear_flag(fmts, GNCIF_DATE_MDY);
164  import_clear_flag(fmts, GNCIF_DATE_DMY);
165  }
166 
167  /* If the first string has a length of only 1, then it is definitely
168  * not a year (although it could be a month or day).
169  */
170  if (len0 == 1)
171  {
172  import_clear_flag(fmts, GNCIF_DATE_YMD);
173  import_clear_flag(fmts, GNCIF_DATE_YDM);
174  }
175 
176  return fmts;
177 }
178 
179 GncImportFormat
180 gnc_import_test_numeric(const char* str, GncImportFormat fmts)
181 {
182  GncImportFormat res = 0;
183 
184  g_return_val_if_fail(str, fmts);
185 
186  if (!regex_compiled)
187  compile_regex();
188 
189  if ((fmts & GNCIF_NUM_PERIOD) && !regexec(&decimal_radix_regex, str, 0, NULL, 0))
190  res |= GNCIF_NUM_PERIOD;
191 
192  if ((fmts & GNCIF_NUM_COMMA) && !regexec(&comma_radix_regex, str, 0, NULL, 0))
193  res |= GNCIF_NUM_COMMA;
194 
195  return res;
196 }
197 
198 
199 GncImportFormat
200 gnc_import_test_date(const char* str, GncImportFormat fmts)
201 {
202  regmatch_t match[5];
203  GncImportFormat res = 0;
204 
205  g_return_val_if_fail(str, fmts);
206  g_return_val_if_fail(strlen(str) > 1, fmts);
207 
208  if (!regex_compiled)
209  compile_regex();
210 
211  if (!regexec(&date_regex, str, 5, match, 0))
212  {
213  if (match[1].rm_so != -1)
214  res = check_date_format(str, match, fmts);
215  else
216  {
217  /* Hmm, it matches XXXXXXXX, but is this YYYYxxxx or xxxxYYYY?
218  * let's try both ways and let the parser check that YYYY is
219  * valid.
220  */
221  char temp[9];
222 
223  g_return_val_if_fail(match[4].rm_so != -1, fmts);
224  g_return_val_if_fail(match[4].rm_eo - match[4].rm_so == 8, fmts);
225 
226  /* make a temp copy of the XXXXXXXX string */
227  strncpy(temp, str + match[4].rm_so, 8);
228  temp[8] = '\0';
229 
230  /* then check it against the ymd or mdy formats, as necessary */
231  if (((fmts & GNCIF_DATE_YDM) || (fmts & GNCIF_DATE_YMD)) &&
232  !regexec(&date_ymd_regex, temp, 4, match, 0))
233  res |= check_date_format(temp, match, fmts);
234 
235  if (((fmts & GNCIF_DATE_DMY) || (fmts & GNCIF_DATE_MDY)) &&
236  !regexec(&date_mdy_regex, temp, 4, match, 0))
237  res |= check_date_format(temp, match, fmts);
238  }
239  }
240 
241  return res;
242 }
243 
244 gboolean
245 gnc_import_parse_numeric(const char* str, GncImportFormat fmt, gnc_numeric *val)
246 {
247  g_return_val_if_fail(str, FALSE);
248  g_return_val_if_fail(val, FALSE);
249  g_return_val_if_fail(fmt, FALSE);
250  g_return_val_if_fail(!(fmt & (fmt - 1)), FALSE);
251 
252  switch (fmt)
253  {
254  case GNCIF_NUM_PERIOD:
255  return xaccParseAmountExtended(str, TRUE, '-', '.', ',', NULL, "$+",
256  val, NULL);
257  case GNCIF_NUM_COMMA:
258  return xaccParseAmountExtended(str, TRUE, '-', ',', '.', NULL, "$+",
259  val, NULL);
260  default:
261  PERR("invalid format: %d", fmt);
262  return FALSE;
263  }
264 }
265 
/* Normalise a year value read from an import file.
 * The inputs "00", "2000", and "19100" all describe the same year.
 * The result is an integer year in the Common Era.
 */
static int
fix_year(int y)
{
    int year;

    if (y < 70)
    {
        /* two-digit numbers below "70" are interpreted as post-2000 */
        year = y + 2000;
    }
    else if (y > 19000)
    {
        /* undo the common bug of printing post-2000 dates as 19100, etc. */
        year = (y - 19000) + 1900;
    }
    else if (y < 1902)
    {
        /* Treat the value as a 'unix year' (years since 1900).  It could
         * instead be a full year, and the two become indistinguishable
         * far in the future, but nobody will mind this failing in 3802.
         */
        year = y + 1900;
    }
    else
    {
        /* already a full year */
        year = y;
    }

    return year;
}
293 
294 gboolean
295 gnc_import_parse_date(const char *str, GncImportFormat fmt, Timespec *val)
296 {
297  regmatch_t match[5];
298  char temp[9];
299  const char *datestr;
300 
301  int v0 = 0, v1 = 0, v2 = 0;
302  int m = 0, d = 0, y = 0;
303 
304  g_return_val_if_fail(str, FALSE);
305  g_return_val_if_fail(val, FALSE);
306  g_return_val_if_fail(fmt, FALSE);
307  g_return_val_if_fail(!(fmt & (fmt - 1)), FALSE);
308 
309  if (!regexec(&date_regex, str, 5, match, 0))
310  {
311  if (match[1].rm_so != -1)
312  datestr = str;
313  else
314  {
315  /* date is of the form XXXXXXX; save it to a temp string and
316  * split it based on the format, either YYYYaabb or aabbYYYY
317  */
318  g_return_val_if_fail(match[4].rm_so != -1, FALSE);
319  g_return_val_if_fail(match[4].rm_eo - match[4].rm_so == 8, FALSE);
320 
321  strncpy(temp, str + match[4].rm_so, 8);
322  temp[8] = '\0';
323 
324  switch (fmt)
325  {
326  case GNCIF_DATE_DMY:
327  case GNCIF_DATE_MDY:
328  g_return_val_if_fail(!regexec(&date_mdy_regex, temp, 4, match, 0), FALSE);
329  break;
330  case GNCIF_DATE_YMD:
331  case GNCIF_DATE_YDM:
332  g_return_val_if_fail(!regexec(&date_ymd_regex, temp, 4, match, 0), FALSE);
333  break;
334  default:
335  PERR("Invalid date format provided: %d", fmt);
336  return FALSE;
337  }
338  datestr = temp;
339  }
340 
341  /* datestr points to the date string, and match[123] contains the matches. */
342 
343  if (match[1].rm_so == -1 || match[2].rm_so == -1 || match[3].rm_so == -1)
344  {
345  PERR("can't interpret date %s", str);
346  return FALSE;
347  }
348 
349  /* grab the numerics */
350  v0 = my_strntol(datestr + match[1].rm_so, match[1].rm_eo - match[1].rm_so);
351  v1 = my_strntol(datestr + match[2].rm_so, match[2].rm_eo - match[2].rm_so);
352  v2 = my_strntol(datestr + match[3].rm_so, match[3].rm_eo - match[3].rm_so);
353 
354  switch (fmt)
355  {
356  case GNCIF_DATE_DMY:
357  if (v0 > 0 && v0 <= 31 && v1 > 0 && v1 <= 12 && v2 > 0)
358  {
359  d = v0;
360  m = v1;
361  y = v2;
362  }
363  else
364  PERR("format is d/m/y but date is %s", str);
365  break;
366 
367  case GNCIF_DATE_MDY:
368  if (v0 > 0 && v0 <= 12 && v1 > 0 && v1 <= 31 && v2 > 0)
369  {
370  m = v0;
371  d = v1;
372  y = v2;
373  }
374  else
375  PERR("format is m/d/y but date is %s", str);
376  break;
377 
378  case GNCIF_DATE_YMD:
379  if (v0 > 0 && v1 > 0 && v1 <= 12 && v2 > 0 && v2 <= 31)
380  {
381  y = v0;
382  m = v1;
383  d = v2;
384  }
385  else
386  PERR("format is y/m/d but date is %s", str);
387  break;
388 
389  case GNCIF_DATE_YDM:
390  if (v0 > 0 && v1 > 0 && v1 <= 31 && v2 > 0 && v2 <= 12)
391  {
392  y = v0;
393  d = v1;
394  m = v2;
395  }
396  else
397  PERR("format is y/d/m but date is %s", str);
398  break;
399 
400  default:
401  PERR("invalid date format: %d", fmt);
402  }
403 
404  if (!m || !d || !y)
405  return FALSE;
406 
407  y = fix_year(y);
408  *val = gnc_dmy2timespec(d, m, y);
409  return TRUE;
410  }
411 
412  return FALSE;
413 }
utility functions for the GnuCash UI
Use a 64-bit unsigned int timespec.
Definition: gnc-date.h:299
Timespec gnc_dmy2timespec(gint day, gint month, gint year)
#define PERR(format, args...)
Definition: qoflog.h:237
All type declarations for the whole Gnucash engine.
const gchar * QofLogModule
Definition: qofid.h:89