You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

unaccent.c 9.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. /*-------------------------------------------------------------------------
  2. *
  3. * unaccent.c
  4. * Text search unaccent dictionary
  5. *
  6. * Copyright (c) 2009-2019, PostgreSQL Global Development Group
  7. *
  8. * IDENTIFICATION
  9. * contrib/unaccent/unaccent.c
  10. *
  11. *-------------------------------------------------------------------------
  12. */
  13. #include "postgres.h"
  14. #include "catalog/namespace.h"
  15. #include "catalog/pg_ts_dict.h"
  16. #include "commands/defrem.h"
  17. #include "lib/stringinfo.h"
  18. #include "tsearch/ts_cache.h"
  19. #include "tsearch/ts_locale.h"
  20. #include "tsearch/ts_public.h"
  21. #include "utils/builtins.h"
  22. #include "utils/lsyscache.h"
  23. #include "utils/regproc.h"
  24. #include "utils/syscache.h"
  25. PG_MODULE_MAGIC;
  26. /*
  27. * An unaccent dictionary uses a trie to find a string to replace. Each node
  28. * of the trie is an array of 256 TrieChar structs; the N-th element of the
  29. * array corresponds to next byte value N. That element can contain both a
  30. * replacement string (to be used if the source string ends with this byte)
  31. * and a link to another trie node (to be followed if there are more bytes).
  32. *
  33. * Note that the trie search logic pays no attention to multibyte character
  34. * boundaries. This is OK as long as both the data entered into the trie and
  35. * the data we're trying to look up are validly encoded; no partial-character
  36. * matches will occur.
  37. */
  38. typedef struct TrieChar
  39. {
  40. struct TrieChar *nextChar;
  41. char *replaceTo;
  42. int replacelen;
  43. } TrieChar;
  44. /*
  45. * placeChar - put str into trie's structure, byte by byte.
  46. *
  47. * If node is NULL, we need to make a new node, which will be returned;
  48. * otherwise the return value is the same as node.
  49. */
  50. static TrieChar *
  51. placeChar(TrieChar *node, const unsigned char *str, int lenstr,
  52. const char *replaceTo, int replacelen)
  53. {
  54. TrieChar *curnode;
  55. if (!node)
  56. node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
  57. Assert(lenstr > 0); /* else str[0] doesn't exist */
  58. curnode = node + *str;
  59. if (lenstr <= 1)
  60. {
  61. if (curnode->replaceTo)
  62. ereport(WARNING,
  63. (errcode(ERRCODE_CONFIG_FILE_ERROR),
  64. errmsg("duplicate source strings, first one will be used")));
  65. else
  66. {
  67. curnode->replacelen = replacelen;
  68. curnode->replaceTo = (char *) palloc(replacelen);
  69. memcpy(curnode->replaceTo, replaceTo, replacelen);
  70. }
  71. }
  72. else
  73. {
  74. curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
  75. replaceTo, replacelen);
  76. }
  77. return node;
  78. }
  79. /*
  80. * initTrie - create trie from file.
  81. *
  82. * Function converts UTF8-encoded file into current encoding.
  83. */
  84. static TrieChar *
  85. initTrie(const char *filename)
  86. {
  87. TrieChar *volatile rootTrie = NULL;
  88. MemoryContext ccxt = CurrentMemoryContext;
  89. tsearch_readline_state trst;
  90. volatile bool skip;
  91. filename = get_tsearch_config_filename(filename, "rules");
  92. if (!tsearch_readline_begin(&trst, filename))
  93. ereport(ERROR,
  94. (errcode(ERRCODE_CONFIG_FILE_ERROR),
  95. errmsg("could not open unaccent file \"%s\": %m",
  96. filename)));
  97. do
  98. {
  99. /*
  100. * pg_do_encoding_conversion() (called by tsearch_readline()) will
  101. * emit exception if it finds untranslatable characters in current
  102. * locale. We just skip such lines, continuing with the next.
  103. */
  104. skip = true;
  105. PG_TRY();
  106. {
  107. char *line;
  108. while ((line = tsearch_readline(&trst)) != NULL)
  109. {
  110. /*----------
  111. * The format of each line must be "src" or "src trg", where
  112. * src and trg are sequences of one or more non-whitespace
  113. * characters, separated by whitespace. Whitespace at start
  114. * or end of line is ignored. If trg is omitted, an empty
  115. * string is used as the replacement.
  116. *
  117. * We use a simple state machine, with states
  118. * 0 initial (before src)
  119. * 1 in src
  120. * 2 in whitespace after src
  121. * 3 in trg
  122. * 4 in whitespace after trg
  123. * -1 syntax error detected
  124. *----------
  125. */
  126. int state;
  127. char *ptr;
  128. char *src = NULL;
  129. char *trg = NULL;
  130. int ptrlen;
  131. int srclen = 0;
  132. int trglen = 0;
  133. state = 0;
  134. for (ptr = line; *ptr; ptr += ptrlen)
  135. {
  136. ptrlen = pg_mblen(ptr);
  137. /* ignore whitespace, but end src or trg */
  138. if (t_isspace(ptr))
  139. {
  140. if (state == 1)
  141. state = 2;
  142. else if (state == 3)
  143. state = 4;
  144. continue;
  145. }
  146. switch (state)
  147. {
  148. case 0:
  149. /* start of src */
  150. src = ptr;
  151. srclen = ptrlen;
  152. state = 1;
  153. break;
  154. case 1:
  155. /* continue src */
  156. srclen += ptrlen;
  157. break;
  158. case 2:
  159. /* start of trg */
  160. trg = ptr;
  161. trglen = ptrlen;
  162. state = 3;
  163. break;
  164. case 3:
  165. /* continue trg */
  166. trglen += ptrlen;
  167. break;
  168. default:
  169. /* bogus line format */
  170. state = -1;
  171. break;
  172. }
  173. }
  174. if (state == 1 || state == 2)
  175. {
  176. /* trg was omitted, so use "" */
  177. trg = "";
  178. trglen = 0;
  179. }
  180. if (state > 0)
  181. rootTrie = placeChar(rootTrie,
  182. (unsigned char *) src, srclen,
  183. trg, trglen);
  184. else if (state < 0)
  185. ereport(WARNING,
  186. (errcode(ERRCODE_CONFIG_FILE_ERROR),
  187. errmsg("invalid syntax: more than two strings in unaccent rule")));
  188. pfree(line);
  189. }
  190. skip = false;
  191. }
  192. PG_CATCH();
  193. {
  194. ErrorData *errdata;
  195. MemoryContext ecxt;
  196. ecxt = MemoryContextSwitchTo(ccxt);
  197. errdata = CopyErrorData();
  198. if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
  199. {
  200. FlushErrorState();
  201. }
  202. else
  203. {
  204. MemoryContextSwitchTo(ecxt);
  205. PG_RE_THROW();
  206. }
  207. }
  208. PG_END_TRY();
  209. }
  210. while (skip);
  211. tsearch_readline_end(&trst);
  212. return rootTrie;
  213. }
  214. /*
  215. * findReplaceTo - find longest possible match in trie
  216. *
  217. * On success, returns pointer to ending subnode, plus length of matched
  218. * source string in *p_matchlen. On failure, returns NULL.
  219. */
  220. static TrieChar *
  221. findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
  222. int *p_matchlen)
  223. {
  224. TrieChar *result = NULL;
  225. int matchlen = 0;
  226. *p_matchlen = 0; /* prevent uninitialized-variable warnings */
  227. while (node && matchlen < srclen)
  228. {
  229. node = node + src[matchlen];
  230. matchlen++;
  231. if (node->replaceTo)
  232. {
  233. result = node;
  234. *p_matchlen = matchlen;
  235. }
  236. node = node->nextChar;
  237. }
  238. return result;
  239. }
  240. PG_FUNCTION_INFO_V1(unaccent_init);
  241. Datum
  242. unaccent_init(PG_FUNCTION_ARGS)
  243. {
  244. List *dictoptions = (List *) PG_GETARG_POINTER(0);
  245. TrieChar *rootTrie = NULL;
  246. bool fileloaded = false;
  247. ListCell *l;
  248. foreach(l, dictoptions)
  249. {
  250. DefElem *defel = (DefElem *) lfirst(l);
  251. if (strcmp(defel->defname, "rules") == 0)
  252. {
  253. if (fileloaded)
  254. ereport(ERROR,
  255. (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  256. errmsg("multiple Rules parameters")));
  257. rootTrie = initTrie(defGetString(defel));
  258. fileloaded = true;
  259. }
  260. else
  261. {
  262. ereport(ERROR,
  263. (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  264. errmsg("unrecognized Unaccent parameter: \"%s\"",
  265. defel->defname)));
  266. }
  267. }
  268. if (!fileloaded)
  269. {
  270. ereport(ERROR,
  271. (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  272. errmsg("missing Rules parameter")));
  273. }
  274. PG_RETURN_POINTER(rootTrie);
  275. }
  276. PG_FUNCTION_INFO_V1(unaccent_lexize);
  277. Datum
  278. unaccent_lexize(PG_FUNCTION_ARGS)
  279. {
  280. TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
  281. char *srcchar = (char *) PG_GETARG_POINTER(1);
  282. int32 len = PG_GETARG_INT32(2);
  283. char *srcstart = srcchar;
  284. TSLexeme *res;
  285. StringInfoData buf;
  286. /* we allocate storage for the buffer only if needed */
  287. buf.data = NULL;
  288. while (len > 0)
  289. {
  290. TrieChar *node;
  291. int matchlen;
  292. node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
  293. &matchlen);
  294. if (node && node->replaceTo)
  295. {
  296. if (buf.data == NULL)
  297. {
  298. /* initialize buffer */
  299. initStringInfo(&buf);
  300. /* insert any data we already skipped over */
  301. if (srcchar != srcstart)
  302. appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
  303. }
  304. appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
  305. }
  306. else
  307. {
  308. matchlen = pg_mblen(srcchar);
  309. if (buf.data != NULL)
  310. appendBinaryStringInfo(&buf, srcchar, matchlen);
  311. }
  312. srcchar += matchlen;
  313. len -= matchlen;
  314. }
  315. /* return a result only if we made at least one substitution */
  316. if (buf.data != NULL)
  317. {
  318. res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
  319. res->lexeme = buf.data;
  320. res->flags = TSL_FILTER;
  321. }
  322. else
  323. res = NULL;
  324. PG_RETURN_POINTER(res);
  325. }
  326. /*
  327. * Function-like wrapper for dictionary
  328. */
  329. PG_FUNCTION_INFO_V1(unaccent_dict);
  330. Datum
  331. unaccent_dict(PG_FUNCTION_ARGS)
  332. {
  333. text *str;
  334. int strArg;
  335. Oid dictOid;
  336. TSDictionaryCacheEntry *dict;
  337. TSLexeme *res;
  338. if (PG_NARGS() == 1)
  339. {
  340. /*
  341. * Use the "unaccent" dictionary that is in the same schema that this
  342. * function is in.
  343. */
  344. Oid procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
  345. const char *dictname = "unaccent";
  346. dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
  347. PointerGetDatum(dictname),
  348. ObjectIdGetDatum(procnspid));
  349. if (!OidIsValid(dictOid))
  350. ereport(ERROR,
  351. (errcode(ERRCODE_UNDEFINED_OBJECT),
  352. errmsg("text search dictionary \"%s.%s\" does not exist",
  353. get_namespace_name(procnspid), dictname)));
  354. strArg = 0;
  355. }
  356. else
  357. {
  358. dictOid = PG_GETARG_OID(0);
  359. strArg = 1;
  360. }
  361. str = PG_GETARG_TEXT_PP(strArg);
  362. dict = lookup_ts_dictionary_cache(dictOid);
  363. res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
  364. PointerGetDatum(dict->dictData),
  365. PointerGetDatum(VARDATA_ANY(str)),
  366. Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
  367. PointerGetDatum(NULL)));
  368. PG_FREE_IF_COPY(str, strArg);
  369. if (res == NULL)
  370. {
  371. PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
  372. }
  373. else if (res->lexeme == NULL)
  374. {
  375. pfree(res);
  376. PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
  377. }
  378. else
  379. {
  380. text *txt = cstring_to_text(res->lexeme);
  381. pfree(res->lexeme);
  382. pfree(res);
  383. PG_RETURN_TEXT_P(txt);
  384. }
  385. }