You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

fuzzystrmatch.c 18KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794
  1. /*
  2. * fuzzystrmatch.c
  3. *
  4. * Functions for "fuzzy" comparison of strings
  5. *
  6. * Joe Conway <mail@joeconway.com>
  7. *
  8. * contrib/fuzzystrmatch/fuzzystrmatch.c
  9. * Copyright (c) 2001-2019, PostgreSQL Global Development Group
  10. * ALL RIGHTS RESERVED;
  11. *
  12. * metaphone()
  13. * -----------
  14. * Modified for PostgreSQL by Joe Conway.
  15. * Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
  16. * Code slightly modified for use as PostgreSQL function (palloc, elog, etc).
  17. * Metaphone was originally created by Lawrence Philips and presented in article
  18. * in "Computer Language" December 1990 issue.
  19. *
  20. * Permission to use, copy, modify, and distribute this software and its
  21. * documentation for any purpose, without fee, and without a written agreement
  22. * is hereby granted, provided that the above copyright notice and this
  23. * paragraph and the following two paragraphs appear in all copies.
  24. *
  25. * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
  26. * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
  27. * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
  28. * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE
  29. * POSSIBILITY OF SUCH DAMAGE.
  30. *
  31. * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
  32. * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
  33. * AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
  34. * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO
  35. * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
  36. *
  37. */
  38. #include "postgres.h"
  39. #include <ctype.h>
  40. #include "mb/pg_wchar.h"
  41. #include "utils/builtins.h"
  42. #include "utils/varlena.h"
  43. PG_MODULE_MAGIC;
  44. /*
  45. * Soundex
  46. */
  static void _soundex(const char *instr, char *outstr);

  #define SOUNDEX_LEN 4

  /*
   * Soundex digit for each ASCII letter, indexed by (letter - 'A'):
   *                                  ABCDEFGHIJKLMNOPQRSTUVWXYZ
   */
  static const char *soundex_table = "01230120022455012623010202";

  /*
   * Map a single character to its Soundex code.
   *
   * The character is upper-cased first.  ASCII letters A-Z yield the digit
   * stored in soundex_table; any other character is returned upper-cased but
   * otherwise unchanged.
   */
  static char
  soundex_code(char letter)
  {
      const char  upper = toupper((unsigned char) letter);

      /* Defend against non-ASCII letters: only A-Z may index the table */
      if (upper < 'A' || upper > 'Z')
          return upper;

      return soundex_table[upper - 'A'];
  }
  60. /*
  61. * Metaphone
  62. */
  63. #define MAX_METAPHONE_STRLEN 255
  64. /*
  65. * Original code by Michael G Schwern starts here.
  66. * Code slightly modified for use as PostgreSQL function.
  67. */
  /**************************************************************************
      metaphone -- Breaks English phrases down into their phonemes.

      Input
          word         -- An English word to be phonized.
          max_phonemes -- How many phonemes to calculate.  Must be greater
                          than zero.  (The original code treated 0 as
                          "phonize the entire phrase"; this version rejects
                          it with an error.)
          phoned_word  -- The final phonized word.  (We'll allocate the
                          memory.)
      Output
          None; the function returns void.  Invalid arguments are reported
          with elog(ERROR) rather than through an error flag.

      NOTES: ALL non-alpha characters are ignored, this includes whitespace,
      although non-alpha characters will break up phonemes.
  ****************************************************************************/
  81. /* I add modifications to the traditional metaphone algorithm that you
  82. might find in books. Define this if you want metaphone to behave
  83. traditionally */
  #undef USE_TRADITIONAL_METAPHONE

  /* Special encodings: output characters used for the 'sh' and 'th' sounds */
  #define SH  'X'
  #define TH  '0'

  static char Lookahead(char *word, int how_far);
  static void _metaphone(char *word, int max_phonemes, char **phoned_word);

  /* Metachar.h ... little bits about characters for metaphone */

  /*
   * Character class table, indexed by (upper-case letter - 'A').  Each entry
   * is a set of bit flags that the macros below test:
   *
   *    1  vowel                       (A E I O U)
   *    2  passed through unchanged    (F J L M N R)
   *    4  forms a diphthong before H  (C G P S T)
   *    8  makes C and G soft          (E I Y)
   *   16  prevents GH becoming F      (B D H)
   */
  static const char _codes[26] = {
      /* A   B   C   D   E   F   G   H   I   J   K   L   M */
         1, 16,  4, 16,  9,  2,  4, 16,  9,  2,  0,  2,  2,
      /* N   O   P   Q   R   S   T   U   V   W   X   Y   Z */
         2,  1,  4,  0,  2,  4,  4,  1,  0,  0,  0,  8,  0
  };

  /*
   * Return the class flags for c, or 0 when c is not an ASCII letter.
   * Lower-case letters are folded to upper case before the lookup.
   */
  static int
  getcode(char c)
  {
      if (!isalpha((unsigned char) c))
          return 0;

      c = toupper((unsigned char) c);

      /* Defend against non-ASCII letters that isalpha() may accept */
      if (c < 'A' || c > 'Z')
          return 0;

      return _codes[c - 'A'];
  }

  /* Vowels */
  #define isvowel(c)  (getcode(c) & 1)    /* AEIOU */

  /* These letters are passed through unchanged */
  #define NOCHANGE(c) (getcode(c) & 2)    /* FJLMNR */

  /* These form diphthongs when preceding H */
  #define AFFECTH(c)  (getcode(c) & 4)    /* CGPST */

  /* These make C and G soft */
  #define MAKESOFT(c) (getcode(c) & 8)    /* EIY */

  /* These prevent GH from becoming F */
  #define NOGHTOF(c)  (getcode(c) & 16)   /* BDH */
  118. PG_FUNCTION_INFO_V1(levenshtein_with_costs);
  119. Datum
  120. levenshtein_with_costs(PG_FUNCTION_ARGS)
  121. {
  122. text *src = PG_GETARG_TEXT_PP(0);
  123. text *dst = PG_GETARG_TEXT_PP(1);
  124. int ins_c = PG_GETARG_INT32(2);
  125. int del_c = PG_GETARG_INT32(3);
  126. int sub_c = PG_GETARG_INT32(4);
  127. const char *s_data;
  128. const char *t_data;
  129. int s_bytes,
  130. t_bytes;
  131. /* Extract a pointer to the actual character data */
  132. s_data = VARDATA_ANY(src);
  133. t_data = VARDATA_ANY(dst);
  134. /* Determine length of each string in bytes */
  135. s_bytes = VARSIZE_ANY_EXHDR(src);
  136. t_bytes = VARSIZE_ANY_EXHDR(dst);
  137. PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
  138. ins_c, del_c, sub_c, false));
  139. }
  140. PG_FUNCTION_INFO_V1(levenshtein);
  141. Datum
  142. levenshtein(PG_FUNCTION_ARGS)
  143. {
  144. text *src = PG_GETARG_TEXT_PP(0);
  145. text *dst = PG_GETARG_TEXT_PP(1);
  146. const char *s_data;
  147. const char *t_data;
  148. int s_bytes,
  149. t_bytes;
  150. /* Extract a pointer to the actual character data */
  151. s_data = VARDATA_ANY(src);
  152. t_data = VARDATA_ANY(dst);
  153. /* Determine length of each string in bytes */
  154. s_bytes = VARSIZE_ANY_EXHDR(src);
  155. t_bytes = VARSIZE_ANY_EXHDR(dst);
  156. PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
  157. 1, 1, 1, false));
  158. }
  159. PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs);
  160. Datum
  161. levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
  162. {
  163. text *src = PG_GETARG_TEXT_PP(0);
  164. text *dst = PG_GETARG_TEXT_PP(1);
  165. int ins_c = PG_GETARG_INT32(2);
  166. int del_c = PG_GETARG_INT32(3);
  167. int sub_c = PG_GETARG_INT32(4);
  168. int max_d = PG_GETARG_INT32(5);
  169. const char *s_data;
  170. const char *t_data;
  171. int s_bytes,
  172. t_bytes;
  173. /* Extract a pointer to the actual character data */
  174. s_data = VARDATA_ANY(src);
  175. t_data = VARDATA_ANY(dst);
  176. /* Determine length of each string in bytes */
  177. s_bytes = VARSIZE_ANY_EXHDR(src);
  178. t_bytes = VARSIZE_ANY_EXHDR(dst);
  179. PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
  180. t_data, t_bytes,
  181. ins_c, del_c, sub_c,
  182. max_d, false));
  183. }
  184. PG_FUNCTION_INFO_V1(levenshtein_less_equal);
  185. Datum
  186. levenshtein_less_equal(PG_FUNCTION_ARGS)
  187. {
  188. text *src = PG_GETARG_TEXT_PP(0);
  189. text *dst = PG_GETARG_TEXT_PP(1);
  190. int max_d = PG_GETARG_INT32(2);
  191. const char *s_data;
  192. const char *t_data;
  193. int s_bytes,
  194. t_bytes;
  195. /* Extract a pointer to the actual character data */
  196. s_data = VARDATA_ANY(src);
  197. t_data = VARDATA_ANY(dst);
  198. /* Determine length of each string in bytes */
  199. s_bytes = VARSIZE_ANY_EXHDR(src);
  200. t_bytes = VARSIZE_ANY_EXHDR(dst);
  201. PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
  202. t_data, t_bytes,
  203. 1, 1, 1,
  204. max_d, false));
  205. }
  206. /*
  207. * Calculates the metaphone of an input string.
  208. * Returns number of characters requested
  209. * (suggested value is 4)
  210. */
  211. PG_FUNCTION_INFO_V1(metaphone);
  212. Datum
  213. metaphone(PG_FUNCTION_ARGS)
  214. {
  215. char *str_i = TextDatumGetCString(PG_GETARG_DATUM(0));
  216. size_t str_i_len = strlen(str_i);
  217. int reqlen;
  218. char *metaph;
  219. /* return an empty string if we receive one */
  220. if (!(str_i_len > 0))
  221. PG_RETURN_TEXT_P(cstring_to_text(""));
  222. if (str_i_len > MAX_METAPHONE_STRLEN)
  223. ereport(ERROR,
  224. (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  225. errmsg("argument exceeds the maximum length of %d bytes",
  226. MAX_METAPHONE_STRLEN)));
  227. reqlen = PG_GETARG_INT32(1);
  228. if (reqlen > MAX_METAPHONE_STRLEN)
  229. ereport(ERROR,
  230. (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  231. errmsg("output exceeds the maximum length of %d bytes",
  232. MAX_METAPHONE_STRLEN)));
  233. if (!(reqlen > 0))
  234. ereport(ERROR,
  235. (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
  236. errmsg("output cannot be empty string")));
  237. _metaphone(str_i, reqlen, &metaph);
  238. PG_RETURN_TEXT_P(cstring_to_text(metaph));
  239. }
  240. /*
  241. * Original code by Michael G Schwern starts here.
  242. * Code slightly modified for use as PostgreSQL
  243. * function (palloc, etc).
  244. */
  245. /* I suppose I could have been using a character pointer instead of
  246. * accessing the array directly... */
  /*
   * Letter-access helpers.  Every macro here expects a string named "word"
   * and an int index named "w_idx" to be in scope where it is expanded, and
   * yields the letter folded to upper case.
   */

  /* The letter right after the current one */
  #define Next_Letter (toupper((unsigned char) word[w_idx+1]))

  /* The letter at the current position */
  #define Curr_Letter (toupper((unsigned char) word[w_idx]))

  /* The letter n positions back, or '\0' if that would precede the string */
  #define Look_Back_Letter(n) \
      (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')

  /* The previous letter, or '\0' when already at the start of the string */
  #define Prev_Letter (Look_Back_Letter(1))

  /* Two letters ahead; checks Next_Letter first so we never walk off the end */
  #define After_Next_Letter \
      (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')

  /* The letter n positions ahead, fetched through Lookahead() */
  #define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))
  /*
   * Safely look ahead an arbitrary number of characters.
   *
   * Returns the character how_far positions past the start of word, or the
   * terminating '\0' if the string ends before that.  A how_far of zero or
   * less returns the first character.
   */
  static char
  Lookahead(char *word, int how_far)
  {
      int         pos = 0;

      /* Edge forward one character at a time, stopping at the terminator */
      while (pos < how_far && word[pos] != '\0')
          pos++;

      /* pos is now either equal to how_far or at the end of the string */
      return word[pos];
  }
  273. /* phonize one letter */
  274. #define Phonize(c) do {(*phoned_word)[p_idx++] = c;} while (0)
  275. /* Slap a null character on the end of the phoned word */
  276. #define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0)
  277. /* How long is the phoned word? */
  278. #define Phone_Len (p_idx)
  279. /* Note is a letter is a 'break' in the word */
  280. #define Isbreak(c) (!isalpha((unsigned char) (c)))
  281. static void
  282. _metaphone(char *word, /* IN */
  283. int max_phonemes,
  284. char **phoned_word) /* OUT */
  285. {
  286. int w_idx = 0; /* point in the phonization we're at. */
  287. int p_idx = 0; /* end of the phoned phrase */
  288. /*-- Parameter checks --*/
  289. /*
  290. * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001
  291. */
  292. /* Negative phoneme length is meaningless */
  293. if (!(max_phonemes > 0))
  294. /* internal error */
  295. elog(ERROR, "metaphone: Requested output length must be > 0");
  296. /* Empty/null string is meaningless */
  297. if ((word == NULL) || !(strlen(word) > 0))
  298. /* internal error */
  299. elog(ERROR, "metaphone: Input string length must be > 0");
  300. /*-- Allocate memory for our phoned_phrase --*/
  301. if (max_phonemes == 0)
  302. { /* Assume largest possible */
  303. *phoned_word = palloc(sizeof(char) * strlen(word) + 1);
  304. }
  305. else
  306. {
  307. *phoned_word = palloc(sizeof(char) * max_phonemes + 1);
  308. }
  309. /*-- The first phoneme has to be processed specially. --*/
  310. /* Find our first letter */
  311. for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
  312. {
  313. /* On the off chance we were given nothing but crap... */
  314. if (Curr_Letter == '\0')
  315. {
  316. End_Phoned_Word;
  317. return;
  318. }
  319. }
  320. switch (Curr_Letter)
  321. {
  322. /* AE becomes E */
  323. case 'A':
  324. if (Next_Letter == 'E')
  325. {
  326. Phonize('E');
  327. w_idx += 2;
  328. }
  329. /* Remember, preserve vowels at the beginning */
  330. else
  331. {
  332. Phonize('A');
  333. w_idx++;
  334. }
  335. break;
  336. /* [GKP]N becomes N */
  337. case 'G':
  338. case 'K':
  339. case 'P':
  340. if (Next_Letter == 'N')
  341. {
  342. Phonize('N');
  343. w_idx += 2;
  344. }
  345. break;
  346. /*
  347. * WH becomes H, WR becomes R W if followed by a vowel
  348. */
  349. case 'W':
  350. if (Next_Letter == 'H' ||
  351. Next_Letter == 'R')
  352. {
  353. Phonize(Next_Letter);
  354. w_idx += 2;
  355. }
  356. else if (isvowel(Next_Letter))
  357. {
  358. Phonize('W');
  359. w_idx += 2;
  360. }
  361. /* else ignore */
  362. break;
  363. /* X becomes S */
  364. case 'X':
  365. Phonize('S');
  366. w_idx++;
  367. break;
  368. /* Vowels are kept */
  369. /*
  370. * We did A already case 'A': case 'a':
  371. */
  372. case 'E':
  373. case 'I':
  374. case 'O':
  375. case 'U':
  376. Phonize(Curr_Letter);
  377. w_idx++;
  378. break;
  379. default:
  380. /* do nothing */
  381. break;
  382. }
  383. /* On to the metaphoning */
  384. for (; Curr_Letter != '\0' &&
  385. (max_phonemes == 0 || Phone_Len < max_phonemes);
  386. w_idx++)
  387. {
  388. /*
  389. * How many letters to skip because an earlier encoding handled
  390. * multiple letters
  391. */
  392. unsigned short int skip_letter = 0;
  393. /*
  394. * THOUGHT: It would be nice if, rather than having things like...
  395. * well, SCI. For SCI you encode the S, then have to remember to skip
  396. * the C. So the phonome SCI invades both S and C. It would be
  397. * better, IMHO, to skip the C from the S part of the encoding. Hell,
  398. * I'm trying it.
  399. */
  400. /* Ignore non-alphas */
  401. if (!isalpha((unsigned char) (Curr_Letter)))
  402. continue;
  403. /* Drop duplicates, except CC */
  404. if (Curr_Letter == Prev_Letter &&
  405. Curr_Letter != 'C')
  406. continue;
  407. switch (Curr_Letter)
  408. {
  409. /* B -> B unless in MB */
  410. case 'B':
  411. if (Prev_Letter != 'M')
  412. Phonize('B');
  413. break;
  414. /*
  415. * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
  416. * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
  417. * SCE-, -SCY- (handed in S) else K
  418. */
  419. case 'C':
  420. if (MAKESOFT(Next_Letter))
  421. { /* C[IEY] */
  422. if (After_Next_Letter == 'A' &&
  423. Next_Letter == 'I')
  424. { /* CIA */
  425. Phonize(SH);
  426. }
  427. /* SC[IEY] */
  428. else if (Prev_Letter == 'S')
  429. {
  430. /* Dropped */
  431. }
  432. else
  433. Phonize('S');
  434. }
  435. else if (Next_Letter == 'H')
  436. {
  437. #ifndef USE_TRADITIONAL_METAPHONE
  438. if (After_Next_Letter == 'R' ||
  439. Prev_Letter == 'S')
  440. { /* Christ, School */
  441. Phonize('K');
  442. }
  443. else
  444. Phonize(SH);
  445. #else
  446. Phonize(SH);
  447. #endif
  448. skip_letter++;
  449. }
  450. else
  451. Phonize('K');
  452. break;
  453. /*
  454. * J if in -DGE-, -DGI- or -DGY- else T
  455. */
  456. case 'D':
  457. if (Next_Letter == 'G' &&
  458. MAKESOFT(After_Next_Letter))
  459. {
  460. Phonize('J');
  461. skip_letter++;
  462. }
  463. else
  464. Phonize('T');
  465. break;
  466. /*
  467. * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
  468. * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
  469. * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
  470. * else K
  471. */
  472. case 'G':
  473. if (Next_Letter == 'H')
  474. {
  475. if (!(NOGHTOF(Look_Back_Letter(3)) ||
  476. Look_Back_Letter(4) == 'H'))
  477. {
  478. Phonize('F');
  479. skip_letter++;
  480. }
  481. else
  482. {
  483. /* silent */
  484. }
  485. }
  486. else if (Next_Letter == 'N')
  487. {
  488. if (Isbreak(After_Next_Letter) ||
  489. (After_Next_Letter == 'E' &&
  490. Look_Ahead_Letter(3) == 'D'))
  491. {
  492. /* dropped */
  493. }
  494. else
  495. Phonize('K');
  496. }
  497. else if (MAKESOFT(Next_Letter) &&
  498. Prev_Letter != 'G')
  499. Phonize('J');
  500. else
  501. Phonize('K');
  502. break;
  503. /* H if before a vowel and not after C,G,P,S,T */
  504. case 'H':
  505. if (isvowel(Next_Letter) &&
  506. !AFFECTH(Prev_Letter))
  507. Phonize('H');
  508. break;
  509. /*
  510. * dropped if after C else K
  511. */
  512. case 'K':
  513. if (Prev_Letter != 'C')
  514. Phonize('K');
  515. break;
  516. /*
  517. * F if before H else P
  518. */
  519. case 'P':
  520. if (Next_Letter == 'H')
  521. Phonize('F');
  522. else
  523. Phonize('P');
  524. break;
  525. /*
  526. * K
  527. */
  528. case 'Q':
  529. Phonize('K');
  530. break;
  531. /*
  532. * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
  533. */
  534. case 'S':
  535. if (Next_Letter == 'I' &&
  536. (After_Next_Letter == 'O' ||
  537. After_Next_Letter == 'A'))
  538. Phonize(SH);
  539. else if (Next_Letter == 'H')
  540. {
  541. Phonize(SH);
  542. skip_letter++;
  543. }
  544. #ifndef USE_TRADITIONAL_METAPHONE
  545. else if (Next_Letter == 'C' &&
  546. Look_Ahead_Letter(2) == 'H' &&
  547. Look_Ahead_Letter(3) == 'W')
  548. {
  549. Phonize(SH);
  550. skip_letter += 2;
  551. }
  552. #endif
  553. else
  554. Phonize('S');
  555. break;
  556. /*
  557. * 'sh' in -TIA- or -TIO- else 'th' before H else T
  558. */
  559. case 'T':
  560. if (Next_Letter == 'I' &&
  561. (After_Next_Letter == 'O' ||
  562. After_Next_Letter == 'A'))
  563. Phonize(SH);
  564. else if (Next_Letter == 'H')
  565. {
  566. Phonize(TH);
  567. skip_letter++;
  568. }
  569. else
  570. Phonize('T');
  571. break;
  572. /* F */
  573. case 'V':
  574. Phonize('F');
  575. break;
  576. /* W before a vowel, else dropped */
  577. case 'W':
  578. if (isvowel(Next_Letter))
  579. Phonize('W');
  580. break;
  581. /* KS */
  582. case 'X':
  583. Phonize('K');
  584. if (max_phonemes == 0 || Phone_Len < max_phonemes)
  585. Phonize('S');
  586. break;
  587. /* Y if followed by a vowel */
  588. case 'Y':
  589. if (isvowel(Next_Letter))
  590. Phonize('Y');
  591. break;
  592. /* S */
  593. case 'Z':
  594. Phonize('S');
  595. break;
  596. /* No transformation */
  597. case 'F':
  598. case 'J':
  599. case 'L':
  600. case 'M':
  601. case 'N':
  602. case 'R':
  603. Phonize(Curr_Letter);
  604. break;
  605. default:
  606. /* nothing */
  607. break;
  608. } /* END SWITCH */
  609. w_idx += skip_letter;
  610. } /* END FOR */
  611. End_Phoned_Word;
  612. return;
  613. } /* END metaphone */
  614. /*
  615. * SQL function: soundex(text) returns text
  616. */
  617. PG_FUNCTION_INFO_V1(soundex);
  618. Datum
  619. soundex(PG_FUNCTION_ARGS)
  620. {
  621. char outstr[SOUNDEX_LEN + 1];
  622. char *arg;
  623. arg = text_to_cstring(PG_GETARG_TEXT_PP(0));
  624. _soundex(arg, outstr);
  625. PG_RETURN_TEXT_P(cstring_to_text(outstr));
  626. }
  627. static void
  628. _soundex(const char *instr, char *outstr)
  629. {
  630. int count;
  631. AssertArg(instr);
  632. AssertArg(outstr);
  633. outstr[SOUNDEX_LEN] = '\0';
  634. /* Skip leading non-alphabetic characters */
  635. while (!isalpha((unsigned char) instr[0]) && instr[0])
  636. ++instr;
  637. /* No string left */
  638. if (!instr[0])
  639. {
  640. outstr[0] = (char) 0;
  641. return;
  642. }
  643. /* Take the first letter as is */
  644. *outstr++ = (char) toupper((unsigned char) *instr++);
  645. count = 1;
  646. while (*instr && count < SOUNDEX_LEN)
  647. {
  648. if (isalpha((unsigned char) *instr) &&
  649. soundex_code(*instr) != soundex_code(*(instr - 1)))
  650. {
  651. *outstr = soundex_code(instr[0]);
  652. if (*outstr != '0')
  653. {
  654. ++outstr;
  655. ++count;
  656. }
  657. }
  658. ++instr;
  659. }
  660. /* Fill with 0's */
  661. while (count < SOUNDEX_LEN)
  662. {
  663. *outstr = '0';
  664. ++outstr;
  665. ++count;
  666. }
  667. }
  668. PG_FUNCTION_INFO_V1(difference);
  669. Datum
  670. difference(PG_FUNCTION_ARGS)
  671. {
  672. char sndx1[SOUNDEX_LEN + 1],
  673. sndx2[SOUNDEX_LEN + 1];
  674. int i,
  675. result;
  676. _soundex(text_to_cstring(PG_GETARG_TEXT_PP(0)), sndx1);
  677. _soundex(text_to_cstring(PG_GETARG_TEXT_PP(1)), sndx2);
  678. result = 0;
  679. for (i = 0; i < SOUNDEX_LEN; i++)
  680. {
  681. if (sndx1[i] == sndx2[i])
  682. result++;
  683. }
  684. PG_RETURN_INT32(result);
  685. }