You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

trgm_regexp.c 68KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353
  1. /*-------------------------------------------------------------------------
  2. *
  3. * trgm_regexp.c
  4. * Regular expression matching using trigrams.
  5. *
  6. * The general idea of trigram index support for a regular expression (regex)
  7. * search is to transform the regex into a logical expression on trigrams.
  8. * For example:
  9. *
  10. * (ab|cd)efg => ((abe & bef) | (cde & def)) & efg
  11. *
  12. * If a string matches the regex, then it must match the logical expression on
  13. * trigrams. The opposite is not necessarily true, however: a string that
  14. * matches the logical expression might not match the original regex. Such
  15. * false positives are removed via recheck, by running the regular regex match
  16. * operator on the retrieved heap tuple.
  17. *
  18. * Since the trigram expression involves both AND and OR operators, we can't
  19. * expect the core index machinery to evaluate it completely. Instead, the
  20. * result of regex analysis is a list of trigrams to be sought in the index,
  21. * plus a simplified graph that is used by trigramsMatchGraph() to determine
  22. * whether a particular indexed value matches the expression.
  23. *
  24. * Converting a regex to a trigram expression is based on analysis of an
  25. * automaton corresponding to the regex. The algorithm consists of four
  26. * stages:
  27. *
  28. * 1) Compile the regexp to NFA form. This is handled by the PostgreSQL
  29. * regexp library, which provides accessors for its opaque regex_t struct
  30. * to expose the NFA state graph and the "colors" (sets of equivalent
  31. * characters) used as state transition labels.
  32. *
  33. * 2) Transform the original NFA into an expanded graph, where arcs
  34. * are labeled with trigrams that must be present in order to move from
  35. * one state to another via the arcs. The trigrams used in this stage
  36. * consist of colors, not characters, as in the original NFA.
  37. *
  38. * 3) Expand the color trigrams into regular trigrams consisting of
  39. * characters. If too many distinct trigrams are produced, trigrams are
  40. * eliminated and the graph is simplified until it's simple enough.
  41. *
  42. * 4) Finally, the resulting graph is packed into a TrgmPackedGraph struct,
  43. * and returned to the caller.
  44. *
  45. * 1) Compile the regexp to NFA form
  46. * ---------------------------------
  47. * The automaton returned by the regexp compiler is a graph where vertices
  48. * are "states" and arcs are labeled with colors. Each color represents
  49. * a set of characters, so that all characters assigned to the same color
  50. * are interchangeable, so far as matching the regexp is concerned. There
  51. * are two special states: "initial" and "final". A state can have multiple
  52. * outgoing arcs labeled with the same color, which makes the automaton
  53. * non-deterministic, because it can be in many states simultaneously.
  54. *
  55. * Note that this NFA is already lossy compared to the original regexp,
  56. * since it ignores some regex features such as lookahead constraints and
  57. * backref matching. This is OK for our purposes since it's still the case
  58. * that only strings matching the NFA can possibly satisfy the regexp.
  59. *
  60. * 2) Transform the original NFA into an expanded graph
  61. * ----------------------------------------------------
  62. * In the 2nd stage, the automaton is transformed into a graph based on the
  63. * original NFA. Each state in the expanded graph represents a state from
  64. * the original NFA, plus a prefix identifying the last two characters
  65. * (colors, to be precise) seen before entering the state. There can be
  66. * multiple states in the expanded graph for each state in the original NFA,
  67. * depending on what characters can precede it. A prefix position can be
  68. * "unknown" if it's uncertain what the preceding character was, or "blank"
  69. * if the character was a non-word character (we don't need to distinguish
  70. * which non-word character it was, so just think of all of them as blanks).
  71. *
  72. * For convenience in description, call an expanded-state identifier
  73. * (two prefix colors plus a state number from the original NFA) an
  74. * "enter key".
  75. *
  76. * Each arc of the expanded graph is labelled with a trigram that must be
  77. * present in the string to match. We can construct this from an out-arc of
  78. * the underlying NFA state by combining the expanded state's prefix with the
  79. * color label of the underlying out-arc, if neither prefix position is
  80. * "unknown". But note that some of the colors in the trigram might be
  81. * "blank". This is OK since we want to generate word-boundary trigrams as
  82. * the regular trigram machinery would, if we know that some word characters
  83. * must be adjacent to a word boundary in all strings matching the NFA.
  84. *
  85. * The expanded graph can also have fewer states than the original NFA,
  86. * because we don't bother to make a separate state entry unless the state
  87. * is reachable by a valid arc. When an enter key is reachable from a state
  88. * of the expanded graph, but we do not know a complete trigram associated
  89. * with that transition, we cannot make a valid arc; instead we insert the
  90. * enter key into the enterKeys list of the source state. This effectively
  91. * means that the two expanded states are not reliably distinguishable based
  92. * on examining trigrams.
  93. *
  94. * So the expanded graph resembles the original NFA, but the arcs are
  95. * labeled with trigrams instead of individual characters, and there may be
  96. * more or fewer states. It is a lossy representation of the original NFA:
  97. * any string that matches the original regexp must match the expanded graph,
  98. * but the reverse is not true.
  99. *
  100. * We build the expanded graph through a breadth-first traversal of states
  101. * reachable from the initial state. At each reachable state, we identify the
  102. * states reachable from it without traversing a predictable trigram, and add
  103. * those states' enter keys to the current state. Then we generate all
  104. * out-arcs leading out of this collection of states that have predictable
  105. * trigrams, adding their target states to the queue of states to examine.
  106. *
  107. * When building the graph, if the number of states or arcs exceed pre-defined
  108. * limits, we give up and simply mark any states not yet processed as final
  109. * states. Roughly speaking, that means that we make use of some portion from
  110. * the beginning of the regexp. Also, any colors that have too many member
  111. * characters are treated as "unknown", so that we can't derive trigrams
  112. * from them.
  113. *
  114. * 3) Expand the color trigrams into regular trigrams
  115. * --------------------------------------------------
  116. * The trigrams in the expanded graph are "color trigrams", consisting
  117. * of three consecutive colors that must be present in the string. But for
  118. * search, we need regular trigrams consisting of characters. In the 3rd
  119. * stage, the color trigrams are expanded into regular trigrams. Since each
  120. * color can represent many characters, the total number of regular trigrams
  121. * after expansion could be very large. Because searching the index for
  122. * thousands of trigrams would be slow, and would likely produce so many
  123. * false positives that we would have to traverse a large fraction of the
  124. * index, the graph is simplified further in a lossy fashion by removing
  125. * color trigrams. When a color trigram is removed, the states connected by
  126. * any arcs labelled with that trigram are merged.
  127. *
  128. * Trigrams do not all have equivalent value for searching: some of them are
  129. * more frequent and some of them are less frequent. Ideally, we would like
  130. * to know the distribution of trigrams, but we don't. But because of padding
  131. * we know for sure that the empty character is more frequent than others,
  132. * so we can penalize trigrams according to presence of whitespace. The
  133. * penalty assigned to each color trigram is the number of simple trigrams
  134. * it would produce, times the penalties[] multiplier associated with its
  135. * whitespace content. (The penalties[] constants were calculated by analysis
  136. * of some real-life text.) We eliminate color trigrams starting with the
  137. * highest-penalty one, until we get to a total penalty of no more than
  138. * WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would
  139. * lead to merging the initial and final states, so we may not be able to
  140. * reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than
  141. * MAX_TRGM_COUNT simple trigrams in total, otherwise we fail.
  142. *
  143. * 4) Pack the graph into a compact representation
  144. * -----------------------------------------------
  145. * The 2nd and 3rd stages might have eliminated or merged many of the states
  146. * and trigrams created earlier, so in this final stage, the graph is
  147. * compacted and packed into a simpler struct that contains only the
  148. * information needed to evaluate it.
  149. *
  150. * ALGORITHM EXAMPLE:
  151. *
  152. * Consider the example regex "ab[cd]". This regex is transformed into the
  153. * following NFA (for simplicity we show colors as their single members):
  154. *
 *                    4#
 *                  c/
 *       a     b    /
 *   1* --- 2 ---- 3
 *                  \
 *                  d\
 *                    5#
  162. *
  163. * We use * to mark initial state and # to mark final state. It's not depicted,
  164. * but states 1, 4, 5 have self-referencing arcs for all possible characters,
  165. * because this pattern can match to any part of a string.
  166. *
  167. * As the result of stage 2 we will have the following graph:
  168. *
 *        abc    abd
 *   2# <---- 1* ----> 3#
  171. *
  172. * The process for generating this graph is:
  173. * 1) Create state 1 with enter key (UNKNOWN, UNKNOWN, 1).
  174. * 2) Add key (UNKNOWN, "a", 2) to state 1.
  175. * 3) Add key ("a", "b", 3) to state 1.
  176. * 4) Create new state 2 with enter key ("b", "c", 4). Add an arc
  177. * from state 1 to state 2 with label trigram "abc".
  178. * 5) Mark state 2 final because state 4 of source NFA is marked as final.
  179. * 6) Create new state 3 with enter key ("b", "d", 5). Add an arc
  180. * from state 1 to state 3 with label trigram "abd".
  181. * 7) Mark state 3 final because state 5 of source NFA is marked as final.
  182. *
  183. *
  184. * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
  185. * Portions Copyright (c) 1994, Regents of the University of California
  186. *
  187. * IDENTIFICATION
  188. * contrib/pg_trgm/trgm_regexp.c
  189. *
  190. *-------------------------------------------------------------------------
  191. */
  192. #include "postgres.h"
  193. #include "trgm.h"
  194. #include "regex/regexport.h"
  195. #include "tsearch/ts_locale.h"
  196. #include "utils/hsearch.h"
  197. #include "utils/memutils.h"
  198. /*
  199. * Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info,
  200. * for exploring and debugging the algorithm implementation.
  201. * This produces three graph files in /tmp, in Graphviz .dot format.
  202. * Some progress information is also printed to postmaster stderr.
  203. */
  204. /* #define TRGM_REGEXP_DEBUG */
/*
 * These parameters are used to limit the amount of work done.
 * Otherwise regex processing could be too slow and memory-consuming.
 *
 *	MAX_EXPANDED_STATES - How many states we allow in expanded graph
 *	MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph
 *	MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted
 *	WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties
 *	COLOR_COUNT_LIMIT - Maximum number of characters per color
 *
 * As described in the file header: when the state or arc limit is exceeded
 * while building the expanded graph, the not-yet-processed states are simply
 * marked final; a color with too many member characters is treated as
 * "unknown"; WISH_TRGM_PENALTY is only a target that may not be reachable,
 * whereas exceeding MAX_TRGM_COUNT makes the analysis fail.
 */
#define MAX_EXPANDED_STATES 128
#define MAX_EXPANDED_ARCS	1024
#define MAX_TRGM_COUNT		256
#define WISH_TRGM_PENALTY	16
#define COLOR_COUNT_LIMIT	256
/*
 * Penalty multipliers for trigram counts depending on whitespace contents.
 * Numbers based on analysis of real-life texts.
 *
 * Each entry corresponds to one blank/non-blank pattern of a trigram's three
 * positions; in the per-entry comments, 'a' stands for a word character and
 * ' ' for a blank.  Patterns marked "impossible" carry a zero multiplier.
 *
 * NOTE(review): the entry order suggests the array index is a 3-bit mask
 * with the first trigram position as the high bit -- confirm against the
 * code that indexes this array.
 */
static const float4 penalties[8] = {
	1.0f,						/* "aaa" */
	3.5f,						/* "aa " */
	0.0f,						/* "a a" (impossible) */
	0.0f,						/* "a  " (impossible) */
	4.2f,						/* " aa" */
	2.1f,						/* " a " */
	25.0f,						/* "  a" */
	0.0f						/* "   " (impossible) */
};
/*
 * Struct representing a single pg_wchar, converted back to multibyte form.
 *
 * The buffer is sized for the longest possible multibyte character
 * (MAX_MULTIBYTE_CHAR_LEN bytes).
 * NOTE(review): how unused trailing bytes are filled is not visible here;
 * check convertPgWchar() before relying on it.
 */
typedef struct
{
	char		bytes[MAX_MULTIBYTE_CHAR_LEN];
} trgm_mb_char;
/*
 * Attributes of NFA colors:
 *
 *	expandable		- we know the character expansion of this color
 *	containsNonWord - color contains non-word characters
 *					  (which will not be extracted into trigrams)
 *	wordCharsCount	- count of word characters in color
 *	wordChars		- array of this color's word characters
 *					  (which can be extracted into trigrams)
 *
 * When expandable is false, the other attributes don't matter; we just
 * assume this color represents unknown character(s).
 */
typedef struct
{
	bool		expandable;		/* if false, ignore the remaining fields */
	bool		containsNonWord;	/* color has non-word member(s) */
	int			wordCharsCount; /* number of word characters in wordChars */
	trgm_mb_char *wordChars;	/* word characters, in multibyte form */
} TrgmColorInfo;
/*
 * A "prefix" is information about the colors of the last two characters read
 * before reaching a specific NFA state.  These colors can have special values
 * COLOR_UNKNOWN and COLOR_BLANK.  COLOR_UNKNOWN means that we have no
 * information, for example because we read some character of an unexpandable
 * color.  COLOR_BLANK means that we read a non-word character.
 *
 * We call a prefix ambiguous if at least one of its colors is unknown.  It's
 * fully ambiguous if both are unknown, partially ambiguous if only the first
 * is unknown.  (The case of first color known, second unknown is not valid.)
 *
 * Wholly- or partly-blank prefixes are mostly handled the same as regular
 * color prefixes.  This allows us to generate appropriate partly-blank
 * trigrams when the NFA requires word character(s) to appear adjacent to
 * non-word character(s).
 */
typedef int TrgmColor;

/*
 * Special TrgmColor values.
 * We assume that colors returned by the regexp engine cannot be these:
 */
#define COLOR_UNKNOWN	(-1)
#define COLOR_BLANK		(-2)

typedef struct
{
	/*
	 * The two prefix colors.  NOTE(review): presumably colors[0] is the
	 * earlier character and colors[1] the most recent one -- confirm against
	 * the code that builds prefixes.
	 */
	TrgmColor	colors[2];
} TrgmPrefix;
/*
 * Color-trigram data type: three consecutive colors.  Note that some
 * elements of the trigram can be COLOR_BLANK, but we don't allow
 * COLOR_UNKNOWN.
 */
typedef struct
{
	TrgmColor	colors[3];		/* never COLOR_UNKNOWN; may be COLOR_BLANK */
} ColorTrgm;
/*
 * Key identifying a state of our expanded graph: color prefix, and number
 * of the corresponding state in the underlying regex NFA.  The color prefix
 * shows how we reached the regex state (to the extent that we know it).
 *
 * The file header calls such an identifier an "enter key".
 */
typedef struct
{
	TrgmPrefix	prefix;			/* colors of the last two characters read */
	int			nstate;			/* state number in the underlying regex NFA */
} TrgmStateKey;
/*
 * One state of the expanded graph.
 *
 *	stateKey - ID of this state
 *	arcs	 - outgoing arcs of this state (List of TrgmArc)
 *	enterKeys - enter keys reachable from this state without reading any
 *			   predictable trigram (List of TrgmStateKey)
 *	flags	 - flag bits (TSTATE_INIT and/or TSTATE_FIN)
 *	snumber  - number of this state (initially assigned as -1, -2, etc,
 *			   for debugging purposes only; then at the packaging stage,
 *			   surviving states are renumbered with positive numbers)
 *	parent	 - parent state, if this state has been merged into another
 *	tentFlags - flags this state would acquire via planned merges
 *	tentParent - planned parent state, if considering a merge
 */
#define TSTATE_INIT		0x01	/* flag indicating this state is initial */
#define TSTATE_FIN		0x02	/* flag indicating this state is final */

typedef struct TrgmState
{
	TrgmStateKey stateKey;		/* hashtable key: must be first field */
	List	   *arcs;			/* List of TrgmArc */
	List	   *enterKeys;		/* List of TrgmStateKey */
	int			flags;			/* TSTATE_xxx bits */
	int			snumber;		/* negative until the packaging stage */
	struct TrgmState *parent;	/* state this one was merged into, if any */
	int			tentFlags;		/* flags to be acquired via planned merges */
	struct TrgmState *tentParent;	/* planned parent, if considering merge */
} TrgmState;
/*
 * One arc in the expanded graph.
 *
 * The arc is stored in its source state's "arcs" list, so only the label
 * and the destination need to be recorded here.
 */
typedef struct
{
	ColorTrgm	ctrgm;			/* trigram needed to traverse arc */
	TrgmState  *target;			/* next state */
} TrgmArc;
/*
 * Information about arc of specific color trigram (used in stage 3)
 *
 * Contains pointers to the source and target states.  The color trigram
 * itself is not stored here; these entries are kept in the "arcs" list of
 * the ColorTrgmInfo describing that trigram.
 */
typedef struct
{
	TrgmState  *source;			/* state the arc leaves */
	TrgmState  *target;			/* state the arc enters */
} TrgmArcInfo;
/*
 * Information about color trigram (used in stage 3)
 *
 *	ctrgm	- trigram itself
 *	cnumber - number of this trigram (used in the packaging stage)
 *	count	- number of simple trigrams created from this color trigram
 *	penalty - removal priority of this color trigram; per the file header,
 *			  this is the simple-trigram count times the penalties[]
 *			  multiplier for the trigram's whitespace content
 *	expanded - indicates this color trigram is expanded into simple trigrams
 *	arcs	- list of all arcs labeled with this color trigram
 *			  (NOTE(review): presumably a List of TrgmArcInfo -- confirm)
 */
typedef struct
{
	ColorTrgm	ctrgm;
	int			cnumber;
	int			count;
	float4		penalty;
	bool		expanded;
	List	   *arcs;
} ColorTrgmInfo;
/*
 * Data structure representing all the data we need during regex processing.
 *
 *	regex			- compiled regex
 *	colorInfo		- extracted information about regex's colors
 *	ncolors			- number of colors in colorInfo[]
 *	states			- hashtable of TrgmStates (states of expanded graph)
 *	initState		- pointer to initial state of expanded graph
 *	nstates			- state counter for the expanded graph
 *					  (NOTE(review): this field was undocumented; its exact
 *					  meaning should be confirmed against the code that
 *					  creates and packs states)
 *	queue			- queue of to-be-processed TrgmStates
 *	keysQueue		- queue of to-be-processed TrgmStateKeys
 *	arcsCount		- total number of arcs of expanded graph (for resource
 *					  limiting)
 *	overflowed		- we have exceeded resource limit for transformation
 *	colorTrgms		- array of all color trigrams present in graph
 *	colorTrgmsCount - count of those color trigrams
 *	totalTrgmCount	- total count of extracted simple trigrams
 */
typedef struct
{
	/* Source regexp, and color information extracted from it (stage 1) */
	regex_t    *regex;
	TrgmColorInfo *colorInfo;
	int			ncolors;

	/* Expanded graph (stage 2) */
	HTAB	   *states;
	TrgmState  *initState;
	int			nstates;

	/* Workspace for stage 2 */
	List	   *queue;
	List	   *keysQueue;
	int			arcsCount;
	bool		overflowed;

	/* Information about distinct color trigrams in the graph (stage 3) */
	ColorTrgmInfo *colorTrgms;
	int			colorTrgmsCount;
	int			totalTrgmCount;
} TrgmNFA;
/*
 * Final, compact representation of expanded graph.
 *
 * A packed arc refers to its target state and its color trigram by index
 * rather than by pointer; a packed state is just the array of its out-arcs.
 */
typedef struct
{
	int			targetState;	/* index of target state (zero-based) */
	int			colorTrgm;		/* index of color trigram for transition */
} TrgmPackedArc;

typedef struct
{
	int			arcsCount;		/* number of out-arcs for this state */
	TrgmPackedArc *arcs;		/* array of arcsCount packed arcs */
} TrgmPackedState;
/* "typedef struct TrgmPackedGraph TrgmPackedGraph" appears in trgm.h */
struct TrgmPackedGraph
{
	/*
	 * colorTrigramsCount and colorTrigramGroups contain information about
	 * how trigrams are grouped into color trigrams.  "colorTrigramsCount" is
	 * the count of color trigrams and "colorTrigramGroups" contains number of
	 * simple trigrams for each color trigram.  The array of simple trigrams
	 * (stored separately from this struct) is ordered so that the simple
	 * trigrams for each color trigram are consecutive, and they're in order
	 * by color trigram number.
	 */
	int			colorTrigramsCount;
	int		   *colorTrigramGroups; /* array of size colorTrigramsCount */

	/*
	 * The states of the simplified NFA.  State number 0 is always initial
	 * state and state number 1 is always final state.
	 */
	int			statesCount;
	TrgmPackedState *states;	/* array of size statesCount */

	/* Temporary work space for trigramsMatchGraph() */
	bool	   *colorTrigramsActive;	/* array of size colorTrigramsCount */
	bool	   *statesActive;	/* array of size statesCount */
	int		   *statesQueue;	/* array of size statesCount */
};
/*
 * Temporary structure for representing an arc during packaging.
 *
 * Unlike TrgmPackedArc, this also records the source state, so an arc can
 * be handled on its own before being attached to a packed state.
 */
typedef struct
{
	int			sourceState;	/* number of the state the arc leaves */
	int			targetState;	/* number of the state the arc enters */
	int			colorTrgm;		/* number of the labeling color trigram */
} TrgmPackArcInfo;
/*
 * Prototypes for private functions.
 *
 * Everything declared here is static (file-local).  The debugging printers
 * at the end are declared only when TRGM_REGEXP_DEBUG is defined.
 */
static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
								   MemoryContext rcontext);
static void RE_compile(regex_t *regex, text *text_re,
					   int cflags, Oid collation);
static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
static bool convertPgWchar(pg_wchar c, trgm_mb_char *result);
static void transformGraph(TrgmNFA *trgmNFA);
static void processState(TrgmNFA *trgmNFA, TrgmState *state);
static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key);
static void addArcs(TrgmNFA *trgmNFA, TrgmState *state);
static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key,
				   TrgmColor co, TrgmStateKey *destKey);
static bool validArcLabel(TrgmStateKey *key, TrgmColor co);
static TrgmState *getState(TrgmNFA *trgmNFA, TrgmStateKey *key);
static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2);
static bool selectColorTrigrams(TrgmNFA *trgmNFA);
static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext);
static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]);
static void mergeStates(TrgmState *state1, TrgmState *state2);
static int	colorTrgmInfoCmp(const void *p1, const void *p2);
static int	colorTrgmInfoPenaltyCmp(const void *p1, const void *p2);
static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext);
static int	packArcInfoCmp(const void *a1, const void *a2);

#ifdef TRGM_REGEXP_DEBUG
static void printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors);
static void printTrgmNFA(TrgmNFA *trgmNFA);
static void printTrgmColor(StringInfo buf, TrgmColor co);
static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams);
#endif
  480. /*
  481. * Main entry point to process a regular expression.
  482. *
  483. * Returns an array of trigrams required by the regular expression, or NULL if
  484. * the regular expression was too complex to analyze. In addition, a packed
  485. * graph representation of the regex is returned into *graph. The results
  486. * must be allocated in rcontext (which might or might not be the current
  487. * context).
  488. */
  489. TRGM *
  490. createTrgmNFA(text *text_re, Oid collation,
  491. TrgmPackedGraph **graph, MemoryContext rcontext)
  492. {
  493. TRGM *trg;
  494. regex_t regex;
  495. MemoryContext tmpcontext;
  496. MemoryContext oldcontext;
  497. /*
  498. * This processing generates a great deal of cruft, which we'd like to
  499. * clean up before returning (since this function may be called in a
  500. * query-lifespan memory context). Make a temp context we can work in so
  501. * that cleanup is easy.
  502. */
  503. tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
  504. "createTrgmNFA temporary context",
  505. ALLOCSET_DEFAULT_SIZES);
  506. oldcontext = MemoryContextSwitchTo(tmpcontext);
  507. /*
  508. * Stage 1: Compile the regexp into a NFA, using the regexp library.
  509. */
  510. #ifdef IGNORECASE
  511. RE_compile(&regex, text_re, REG_ADVANCED | REG_ICASE, collation);
  512. #else
  513. RE_compile(&regex, text_re, REG_ADVANCED, collation);
  514. #endif
  515. /*
  516. * Since the regexp library allocates its internal data structures with
  517. * malloc, we need to use a PG_TRY block to ensure that pg_regfree() gets
  518. * done even if there's an error.
  519. */
  520. PG_TRY();
  521. {
  522. trg = createTrgmNFAInternal(&regex, graph, rcontext);
  523. }
  524. PG_CATCH();
  525. {
  526. pg_regfree(&regex);
  527. PG_RE_THROW();
  528. }
  529. PG_END_TRY();
  530. pg_regfree(&regex);
  531. /* Clean up all the cruft we created */
  532. MemoryContextSwitchTo(oldcontext);
  533. MemoryContextDelete(tmpcontext);
  534. return trg;
  535. }
  536. /*
  537. * Body of createTrgmNFA, exclusive of regex compilation/freeing.
  538. */
  539. static TRGM *
  540. createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
  541. MemoryContext rcontext)
  542. {
  543. TRGM *trg;
  544. TrgmNFA trgmNFA;
  545. trgmNFA.regex = regex;
  546. /* Collect color information from the regex */
  547. getColorInfo(regex, &trgmNFA);
  548. #ifdef TRGM_REGEXP_DEBUG
  549. printSourceNFA(regex, trgmNFA.colorInfo, trgmNFA.ncolors);
  550. #endif
  551. /*
  552. * Stage 2: Create an expanded graph from the source NFA.
  553. */
  554. transformGraph(&trgmNFA);
  555. #ifdef TRGM_REGEXP_DEBUG
  556. printTrgmNFA(&trgmNFA);
  557. #endif
  558. /*
  559. * Fail if we were unable to make a nontrivial graph, ie it is possible to
  560. * get from the initial state to the final state without reading any
  561. * predictable trigram.
  562. */
  563. if (trgmNFA.initState->flags & TSTATE_FIN)
  564. return NULL;
  565. /*
  566. * Stage 3: Select color trigrams to expand. Fail if too many trigrams.
  567. */
  568. if (!selectColorTrigrams(&trgmNFA))
  569. return NULL;
  570. /*
  571. * Stage 4: Expand color trigrams and pack graph into final
  572. * representation.
  573. */
  574. trg = expandColorTrigrams(&trgmNFA, rcontext);
  575. *graph = packGraph(&trgmNFA, rcontext);
  576. #ifdef TRGM_REGEXP_DEBUG
  577. printTrgmPackedGraph(*graph, trg);
  578. #endif
  579. return trg;
  580. }
  581. /*
  582. * Main entry point for evaluating a graph during index scanning.
  583. *
  584. * The check[] array is indexed by trigram number (in the array of simple
  585. * trigrams returned by createTrgmNFA), and holds true for those trigrams
  586. * that are present in the index entry being checked.
  587. */
  588. bool
  589. trigramsMatchGraph(TrgmPackedGraph *graph, bool *check)
  590. {
  591. int i,
  592. j,
  593. k,
  594. queueIn,
  595. queueOut;
  596. /*
  597. * Reset temporary working areas.
  598. */
  599. memset(graph->colorTrigramsActive, 0,
  600. sizeof(bool) * graph->colorTrigramsCount);
  601. memset(graph->statesActive, 0, sizeof(bool) * graph->statesCount);
  602. /*
  603. * Check which color trigrams were matched. A match for any simple
  604. * trigram associated with a color trigram counts as a match of the color
  605. * trigram.
  606. */
  607. j = 0;
  608. for (i = 0; i < graph->colorTrigramsCount; i++)
  609. {
  610. int cnt = graph->colorTrigramGroups[i];
  611. for (k = j; k < j + cnt; k++)
  612. {
  613. if (check[k])
  614. {
  615. /*
  616. * Found one matched trigram in the group. Can skip the rest
  617. * of them and go to the next group.
  618. */
  619. graph->colorTrigramsActive[i] = true;
  620. break;
  621. }
  622. }
  623. j = j + cnt;
  624. }
  625. /*
  626. * Initialize the statesQueue to hold just the initial state. Note:
  627. * statesQueue has room for statesCount entries, which is certainly enough
  628. * since no state will be put in the queue more than once. The
  629. * statesActive array marks which states have been queued.
  630. */
  631. graph->statesActive[0] = true;
  632. graph->statesQueue[0] = 0;
  633. queueIn = 0;
  634. queueOut = 1;
  635. /* Process queued states as long as there are any. */
  636. while (queueIn < queueOut)
  637. {
  638. int stateno = graph->statesQueue[queueIn++];
  639. TrgmPackedState *state = &graph->states[stateno];
  640. int cnt = state->arcsCount;
  641. /* Loop over state's out-arcs */
  642. for (i = 0; i < cnt; i++)
  643. {
  644. TrgmPackedArc *arc = &state->arcs[i];
  645. /*
  646. * If corresponding color trigram is present then activate the
  647. * corresponding state. We're done if that's the final state,
  648. * otherwise queue the state if it's not been queued already.
  649. */
  650. if (graph->colorTrigramsActive[arc->colorTrgm])
  651. {
  652. int nextstate = arc->targetState;
  653. if (nextstate == 1)
  654. return true; /* success: final state is reachable */
  655. if (!graph->statesActive[nextstate])
  656. {
  657. graph->statesActive[nextstate] = true;
  658. graph->statesQueue[queueOut++] = nextstate;
  659. }
  660. }
  661. }
  662. }
  663. /* Queue is empty, so match fails. */
  664. return false;
  665. }
  666. /*
  667. * Compile regex string into struct at *regex.
  668. * NB: pg_regfree must be applied to regex if this completes successfully.
  669. */
  670. static void
  671. RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
  672. {
  673. int text_re_len = VARSIZE_ANY_EXHDR(text_re);
  674. char *text_re_val = VARDATA_ANY(text_re);
  675. pg_wchar *pattern;
  676. int pattern_len;
  677. int regcomp_result;
  678. char errMsg[100];
  679. /* Convert pattern string to wide characters */
  680. pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
  681. pattern_len = pg_mb2wchar_with_len(text_re_val,
  682. pattern,
  683. text_re_len);
  684. /* Compile regex */
  685. regcomp_result = pg_regcomp(regex,
  686. pattern,
  687. pattern_len,
  688. cflags,
  689. collation);
  690. pfree(pattern);
  691. if (regcomp_result != REG_OKAY)
  692. {
  693. /* re didn't compile (no need for pg_regfree, if so) */
  694. pg_regerror(regcomp_result, regex, errMsg, sizeof(errMsg));
  695. ereport(ERROR,
  696. (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
  697. errmsg("invalid regular expression: %s", errMsg)));
  698. }
  699. }
  700. /*---------------------
  701. * Subroutines for pre-processing the color map (stage 1).
  702. *---------------------
  703. */
  704. /*
  705. * Fill TrgmColorInfo structure for each color using regex export functions.
  706. */
  707. static void
  708. getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
  709. {
  710. int colorsCount = pg_reg_getnumcolors(regex);
  711. int i;
  712. trgmNFA->ncolors = colorsCount;
  713. trgmNFA->colorInfo = (TrgmColorInfo *)
  714. palloc0(colorsCount * sizeof(TrgmColorInfo));
  715. /*
  716. * Loop over colors, filling TrgmColorInfo about each.
  717. */
  718. for (i = 0; i < colorsCount; i++)
  719. {
  720. TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[i];
  721. int charsCount = pg_reg_getnumcharacters(regex, i);
  722. pg_wchar *chars;
  723. int j;
  724. if (charsCount < 0 || charsCount > COLOR_COUNT_LIMIT)
  725. {
  726. /* Non expandable, or too large to work with */
  727. colorInfo->expandable = false;
  728. continue;
  729. }
  730. colorInfo->expandable = true;
  731. colorInfo->containsNonWord = false;
  732. colorInfo->wordChars = (trgm_mb_char *)
  733. palloc(sizeof(trgm_mb_char) * charsCount);
  734. colorInfo->wordCharsCount = 0;
  735. /* Extract all the chars in this color */
  736. chars = (pg_wchar *) palloc(sizeof(pg_wchar) * charsCount);
  737. pg_reg_getcharacters(regex, i, chars, charsCount);
  738. /*
  739. * Convert characters back to multibyte form, and save only those that
  740. * are word characters. Set "containsNonWord" if any non-word
  741. * character. (Note: it'd probably be nicer to keep the chars in
  742. * pg_wchar format for now, but ISWORDCHR wants to see multibyte.)
  743. */
  744. for (j = 0; j < charsCount; j++)
  745. {
  746. trgm_mb_char c;
  747. if (!convertPgWchar(chars[j], &c))
  748. continue; /* ok to ignore it altogether */
  749. if (ISWORDCHR(c.bytes))
  750. colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
  751. else
  752. colorInfo->containsNonWord = true;
  753. }
  754. pfree(chars);
  755. }
  756. }
  757. /*
  758. * Convert pg_wchar to multibyte format.
  759. * Returns false if the character should be ignored completely.
  760. */
  761. static bool
  762. convertPgWchar(pg_wchar c, trgm_mb_char *result)
  763. {
  764. /* "s" has enough space for a multibyte character and a trailing NUL */
  765. char s[MAX_MULTIBYTE_CHAR_LEN + 1];
  766. /*
  767. * We can ignore the NUL character, since it can never appear in a PG text
  768. * string. This avoids the need for various special cases when
  769. * reconstructing trigrams.
  770. */
  771. if (c == 0)
  772. return false;
  773. /* Do the conversion, making sure the result is NUL-terminated */
  774. memset(s, 0, sizeof(s));
  775. pg_wchar2mb_with_len(&c, s, 1);
  776. /*
  777. * In IGNORECASE mode, we can ignore uppercase characters. We assume that
  778. * the regex engine generated both uppercase and lowercase equivalents
  779. * within each color, since we used the REG_ICASE option; so there's no
  780. * need to process the uppercase version.
  781. *
  782. * XXX this code is dependent on the assumption that lowerstr() works the
  783. * same as the regex engine's internal case folding machinery. Might be
  784. * wiser to expose pg_wc_tolower and test whether c == pg_wc_tolower(c).
  785. * On the other hand, the trigrams in the index were created using
  786. * lowerstr(), so we're probably screwed if there's any incompatibility
  787. * anyway.
  788. */
  789. #ifdef IGNORECASE
  790. {
  791. char *lowerCased = lowerstr(s);
  792. if (strcmp(lowerCased, s) != 0)
  793. {
  794. pfree(lowerCased);
  795. return false;
  796. }
  797. pfree(lowerCased);
  798. }
  799. #endif
  800. /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
  801. memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN);
  802. return true;
  803. }
  804. /*---------------------
  805. * Subroutines for expanding original NFA graph into a trigram graph (stage 2).
  806. *---------------------
  807. */
  808. /*
  809. * Transform the graph, given a regex and extracted color information.
  810. *
  811. * We create and process a queue of expanded-graph states until all the states
  812. * are processed.
  813. *
  814. * This algorithm may be stopped due to resource limitation. In this case we
  815. * force every unprocessed branch to immediately finish with matching (this
  816. * can give us false positives but no false negatives) by marking all
  817. * unprocessed states as final.
  818. */
  819. static void
  820. transformGraph(TrgmNFA *trgmNFA)
  821. {
  822. HASHCTL hashCtl;
  823. TrgmStateKey initkey;
  824. TrgmState *initstate;
  825. /* Initialize this stage's workspace in trgmNFA struct */
  826. trgmNFA->queue = NIL;
  827. trgmNFA->keysQueue = NIL;
  828. trgmNFA->arcsCount = 0;
  829. trgmNFA->overflowed = false;
  830. /* Create hashtable for states */
  831. hashCtl.keysize = sizeof(TrgmStateKey);
  832. hashCtl.entrysize = sizeof(TrgmState);
  833. hashCtl.hcxt = CurrentMemoryContext;
  834. trgmNFA->states = hash_create("Trigram NFA",
  835. 1024,
  836. &hashCtl,
  837. HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
  838. trgmNFA->nstates = 0;
  839. /* Create initial state: ambiguous prefix, NFA's initial state */
  840. MemSet(&initkey, 0, sizeof(initkey));
  841. initkey.prefix.colors[0] = COLOR_UNKNOWN;
  842. initkey.prefix.colors[1] = COLOR_UNKNOWN;
  843. initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex);
  844. initstate = getState(trgmNFA, &initkey);
  845. initstate->flags |= TSTATE_INIT;
  846. trgmNFA->initState = initstate;
  847. /*
  848. * Recursively build the expanded graph by processing queue of states
  849. * (breadth-first search). getState already put initstate in the queue.
  850. */
  851. while (trgmNFA->queue != NIL)
  852. {
  853. TrgmState *state = (TrgmState *) linitial(trgmNFA->queue);
  854. trgmNFA->queue = list_delete_first(trgmNFA->queue);
  855. /*
  856. * If we overflowed then just mark state as final. Otherwise do
  857. * actual processing.
  858. */
  859. if (trgmNFA->overflowed)
  860. state->flags |= TSTATE_FIN;
  861. else
  862. processState(trgmNFA, state);
  863. /* Did we overflow? */
  864. if (trgmNFA->arcsCount > MAX_EXPANDED_ARCS ||
  865. hash_get_num_entries(trgmNFA->states) > MAX_EXPANDED_STATES)
  866. trgmNFA->overflowed = true;
  867. }
  868. }
  869. /*
  870. * Process one state: add enter keys and then add outgoing arcs.
  871. */
  872. static void
  873. processState(TrgmNFA *trgmNFA, TrgmState *state)
  874. {
  875. /* keysQueue should be NIL already, but make sure */
  876. trgmNFA->keysQueue = NIL;
  877. /*
  878. * Add state's own key, and then process all keys added to keysQueue until
  879. * queue is empty. But we can quit if the state gets marked final.
  880. */
  881. addKey(trgmNFA, state, &state->stateKey);
  882. while (trgmNFA->keysQueue != NIL && !(state->flags & TSTATE_FIN))
  883. {
  884. TrgmStateKey *key = (TrgmStateKey *) linitial(trgmNFA->keysQueue);
  885. trgmNFA->keysQueue = list_delete_first(trgmNFA->keysQueue);
  886. addKey(trgmNFA, state, key);
  887. }
  888. /*
  889. * Add outgoing arcs only if state isn't final (we have no interest in
  890. * outgoing arcs if we already match)
  891. */
  892. if (!(state->flags & TSTATE_FIN))
  893. addArcs(trgmNFA, state);
  894. }
  895. /*
  896. * Add the given enter key into the state's enterKeys list, and determine
  897. * whether this should result in any further enter keys being added.
  898. * If so, add those keys to keysQueue so that processState will handle them.
  899. *
  900. * If the enter key is for the NFA's final state, mark state as TSTATE_FIN.
  901. * This situation means that we can reach the final state from this expanded
  902. * state without reading any predictable trigram, so we must consider this
  903. * state as an accepting one.
  904. *
  905. * The given key could be a duplicate of one already in enterKeys, or be
  906. * redundant with some enterKeys. So we check that before doing anything.
  907. *
  908. * Note that we don't generate any actual arcs here. addArcs will do that
  909. * later, after we have identified all the enter keys for this state.
  910. */
  911. static void
  912. addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
  913. {
  914. regex_arc_t *arcs;
  915. TrgmStateKey destKey;
  916. ListCell *cell,
  917. *prev,
  918. *next;
  919. int i,
  920. arcsCount;
  921. /*
  922. * Ensure any pad bytes in destKey are zero, since it may get used as a
  923. * hashtable key by getState.
  924. */
  925. MemSet(&destKey, 0, sizeof(destKey));
  926. /*
  927. * Compare key to each existing enter key of the state to check for
  928. * redundancy. We can drop either old key(s) or the new key if we find
  929. * redundancy.
  930. */
  931. prev = NULL;
  932. cell = list_head(state->enterKeys);
  933. while (cell)
  934. {
  935. TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
  936. next = lnext(cell);
  937. if (existingKey->nstate == key->nstate)
  938. {
  939. if (prefixContains(&existingKey->prefix, &key->prefix))
  940. {
  941. /* This old key already covers the new key. Nothing to do */
  942. return;
  943. }
  944. if (prefixContains(&key->prefix, &existingKey->prefix))
  945. {
  946. /*
  947. * The new key covers this old key. Remove the old key, it's
  948. * no longer needed once we add this key to the list.
  949. */
  950. state->enterKeys = list_delete_cell(state->enterKeys,
  951. cell, prev);
  952. }
  953. else
  954. prev = cell;
  955. }
  956. else
  957. prev = cell;
  958. cell = next;
  959. }
  960. /* No redundancy, so add this key to the state's list */
  961. state->enterKeys = lappend(state->enterKeys, key);
  962. /* If state is now known final, mark it and we're done */
  963. if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex))
  964. {
  965. state->flags |= TSTATE_FIN;
  966. return;
  967. }
  968. /*
  969. * Loop through all outgoing arcs of the corresponding state in the
  970. * original NFA.
  971. */
  972. arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
  973. arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount);
  974. pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
  975. for (i = 0; i < arcsCount; i++)
  976. {
  977. regex_arc_t *arc = &arcs[i];
  978. if (pg_reg_colorisbegin(trgmNFA->regex, arc->co))
  979. {
  980. /*
  981. * Start of line/string (^). Trigram extraction treats start of
  982. * line same as start of word: double space prefix is added.
  983. * Hence, make an enter key showing we can reach the arc
  984. * destination with all-blank prefix.
  985. */
  986. destKey.prefix.colors[0] = COLOR_BLANK;
  987. destKey.prefix.colors[1] = COLOR_BLANK;
  988. destKey.nstate = arc->to;
  989. /* Add enter key to this state */
  990. addKeyToQueue(trgmNFA, &destKey);
  991. }
  992. else if (pg_reg_colorisend(trgmNFA->regex, arc->co))
  993. {
  994. /*
  995. * End of line/string ($). We must consider this arc as a
  996. * transition that doesn't read anything. The reason for adding
  997. * this enter key to the state is that if the arc leads to the
  998. * NFA's final state, we must mark this expanded state as final.
  999. */
  1000. destKey.prefix.colors[0] = COLOR_UNKNOWN;
  1001. destKey.prefix.colors[1] = COLOR_UNKNOWN;
  1002. destKey.nstate = arc->to;
  1003. /* Add enter key to this state */
  1004. addKeyToQueue(trgmNFA, &destKey);
  1005. }
  1006. else
  1007. {
  1008. /* Regular color */
  1009. TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co];
  1010. if (colorInfo->expandable)
  1011. {
  1012. if (colorInfo->containsNonWord &&
  1013. !validArcLabel(key, COLOR_BLANK))
  1014. {
  1015. /*
  1016. * We can reach the arc destination after reading a
  1017. * non-word character, but the prefix is not something
  1018. * that addArc will accept with COLOR_BLANK, so no trigram
  1019. * arc can get made for this transition. We must make an
  1020. * enter key to show that the arc destination is
  1021. * reachable. Set it up with an all-blank prefix, since
  1022. * that corresponds to what the trigram extraction code
  1023. * will do at a word starting boundary.
  1024. */
  1025. destKey.prefix.colors[0] = COLOR_BLANK;
  1026. destKey.prefix.colors[1] = COLOR_BLANK;
  1027. destKey.nstate = arc->to;
  1028. addKeyToQueue(trgmNFA, &destKey);
  1029. }
  1030. if (colorInfo->wordCharsCount > 0 &&
  1031. !validArcLabel(key, arc->co))
  1032. {
  1033. /*
  1034. * We can reach the arc destination after reading a word
  1035. * character, but the prefix is not something that addArc
  1036. * will accept, so no trigram arc can get made for this
  1037. * transition. We must make an enter key to show that the
  1038. * arc destination is reachable. The prefix for the enter
  1039. * key should reflect the info we have for this arc.
  1040. */
  1041. destKey.prefix.colors[0] = key->prefix.colors[1];
  1042. destKey.prefix.colors[1] = arc->co;
  1043. destKey.nstate = arc->to;
  1044. addKeyToQueue(trgmNFA, &destKey);
  1045. }
  1046. }
  1047. else
  1048. {
  1049. /*
  1050. * Unexpandable color. Add enter key with ambiguous prefix,
  1051. * showing we can reach the destination from this state, but
  1052. * the preceding colors will be uncertain. (We do not set the
  1053. * first prefix color to key->prefix.colors[1], because a
  1054. * prefix of known followed by unknown is invalid.)
  1055. */
  1056. destKey.prefix.colors[0] = COLOR_UNKNOWN;
  1057. destKey.prefix.colors[1] = COLOR_UNKNOWN;
  1058. destKey.nstate = arc->to;
  1059. addKeyToQueue(trgmNFA, &destKey);
  1060. }
  1061. }
  1062. }
  1063. pfree(arcs);
  1064. }
  1065. /*
  1066. * Add copy of given key to keysQueue for later processing.
  1067. */
  1068. static void
  1069. addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key)
  1070. {
  1071. TrgmStateKey *keyCopy = (TrgmStateKey *) palloc(sizeof(TrgmStateKey));
  1072. memcpy(keyCopy, key, sizeof(TrgmStateKey));
  1073. trgmNFA->keysQueue = lappend(trgmNFA->keysQueue, keyCopy);
  1074. }
  1075. /*
  1076. * Add outgoing arcs from given state, whose enter keys are all now known.
  1077. */
  1078. static void
  1079. addArcs(TrgmNFA *trgmNFA, TrgmState *state)
  1080. {
  1081. TrgmStateKey destKey;
  1082. ListCell *cell;
  1083. regex_arc_t *arcs;
  1084. int arcsCount,
  1085. i;
  1086. /*
  1087. * Ensure any pad bytes in destKey are zero, since it may get used as a
  1088. * hashtable key by getState.
  1089. */
  1090. MemSet(&destKey, 0, sizeof(destKey));
  1091. /*
  1092. * Iterate over enter keys associated with this expanded-graph state. This
  1093. * includes both the state's own stateKey, and any enter keys we added to
  1094. * it during addKey (which represent expanded-graph states that are not
  1095. * distinguishable from this one by means of trigrams). For each such
  1096. * enter key, examine all the out-arcs of the key's underlying NFA state,
  1097. * and try to make a trigram arc leading to where the out-arc leads.
  1098. * (addArc will deal with whether the arc is valid or not.)
  1099. */
  1100. foreach(cell, state->enterKeys)
  1101. {
  1102. TrgmStateKey *key = (TrgmStateKey *) lfirst(cell);
  1103. arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
  1104. arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount);
  1105. pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
  1106. for (i = 0; i < arcsCount; i++)
  1107. {
  1108. regex_arc_t *arc = &arcs[i];
  1109. TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co];
  1110. /*
  1111. * Ignore non-expandable colors; addKey already handled the case.
  1112. *
  1113. * We need no special check for begin/end pseudocolors here. We
  1114. * don't need to do any processing for them, and they will be
  1115. * marked non-expandable since the regex engine will have reported
  1116. * them that way.
  1117. */
  1118. if (!colorInfo->expandable)
  1119. continue;
  1120. if (colorInfo->containsNonWord)
  1121. {
  1122. /*
  1123. * Color includes non-word character(s).
  1124. *
  1125. * Generate an arc, treating this transition as occurring on
  1126. * BLANK. This allows word-ending trigrams to be manufactured
  1127. * if possible.
  1128. */
  1129. destKey.prefix.colors[0] = key->prefix.colors[1];
  1130. destKey.prefix.colors[1] = COLOR_BLANK;
  1131. destKey.nstate = arc->to;
  1132. addArc(trgmNFA, state, key, COLOR_BLANK, &destKey);
  1133. }
  1134. if (colorInfo->wordCharsCount > 0)
  1135. {
  1136. /*
  1137. * Color includes word character(s).
  1138. *
  1139. * Generate an arc. Color is pushed into prefix of target
  1140. * state.
  1141. */
  1142. destKey.prefix.colors[0] = key->prefix.colors[1];
  1143. destKey.prefix.colors[1] = arc->co;
  1144. destKey.nstate = arc->to;
  1145. addArc(trgmNFA, state, key, arc->co, &destKey);
  1146. }
  1147. }
  1148. pfree(arcs);
  1149. }
  1150. }
  1151. /*
  1152. * Generate an out-arc of the expanded graph, if it's valid and not redundant.
  1153. *
  1154. * state: expanded-graph state we want to add an out-arc to
  1155. * key: provides prefix colors (key->nstate is not used)
  1156. * co: transition color
  1157. * destKey: identifier for destination state of expanded graph
  1158. */
  1159. static void
  1160. addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key,
  1161. TrgmColor co, TrgmStateKey *destKey)
  1162. {
  1163. TrgmArc *arc;
  1164. ListCell *cell;
  1165. /* Do nothing if this wouldn't be a valid arc label trigram */
  1166. if (!validArcLabel(key, co))
  1167. return;
  1168. /*
  1169. * Check if we are going to reach key which is covered by a key which is
  1170. * already listed in this state. If so arc is useless: the NFA can bypass
  1171. * it through a path that doesn't require any predictable trigram, so
  1172. * whether the arc's trigram is present or not doesn't really matter.
  1173. */
  1174. foreach(cell, state->enterKeys)
  1175. {
  1176. TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
  1177. if (existingKey->nstate == destKey->nstate &&
  1178. prefixContains(&existingKey->prefix, &destKey->prefix))
  1179. return;
  1180. }
  1181. /* Checks were successful, add new arc */
  1182. arc = (TrgmArc *) palloc(sizeof(TrgmArc));
  1183. arc->target = getState(trgmNFA, destKey);
  1184. arc->ctrgm.colors[0] = key->prefix.colors[0];
  1185. arc->ctrgm.colors[1] = key->prefix.colors[1];
  1186. arc->ctrgm.colors[2] = co;
  1187. state->arcs = lappend(state->arcs, arc);
  1188. trgmNFA->arcsCount++;
  1189. }
  1190. /*
  1191. * Can we make a valid trigram arc label from the given prefix and arc color?
  1192. *
  1193. * This is split out so that tests in addKey and addArc will stay in sync.
  1194. */
  1195. static bool
  1196. validArcLabel(TrgmStateKey *key, TrgmColor co)
  1197. {
  1198. /*
  1199. * We have to know full trigram in order to add outgoing arc. So we can't
  1200. * do it if prefix is ambiguous.
  1201. */
  1202. if (key->prefix.colors[0] == COLOR_UNKNOWN)
  1203. return false;
  1204. /* If key->prefix.colors[0] isn't unknown, its second color isn't either */
  1205. Assert(key->prefix.colors[1] != COLOR_UNKNOWN);
  1206. /* And we should not be called with an unknown arc color anytime */
  1207. Assert(co != COLOR_UNKNOWN);
  1208. /*
  1209. * We don't bother with making arcs representing three non-word
  1210. * characters, since that's useless for trigram extraction.
  1211. */
  1212. if (key->prefix.colors[0] == COLOR_BLANK &&
  1213. key->prefix.colors[1] == COLOR_BLANK &&
  1214. co == COLOR_BLANK)
  1215. return false;
  1216. /*
  1217. * We also reject nonblank-blank-anything. The nonblank-blank-nonblank
  1218. * case doesn't correspond to any trigram the trigram extraction code
  1219. * would make. The nonblank-blank-blank case is also not possible with
  1220. * RPADDING = 1. (Note that in many cases we'd fail to generate such a
  1221. * trigram even if it were valid, for example processing "foo bar" will
  1222. * not result in considering the trigram "o ". So if you want to support
  1223. * RPADDING = 2, there's more to do than just twiddle this test.)
  1224. */
  1225. if (key->prefix.colors[0] != COLOR_BLANK &&
  1226. key->prefix.colors[1] == COLOR_BLANK)
  1227. return false;
  1228. /*
  1229. * Other combinations involving blank are valid, in particular we assume
  1230. * blank-blank-nonblank is valid, which presumes that LPADDING is 2.
  1231. *
  1232. * Note: Using again the example "foo bar", we will not consider the
  1233. * trigram " b", though this trigram would be found by the trigram
  1234. * extraction code. Since we will find " ba", it doesn't seem worth
  1235. * trying to hack the algorithm to generate the additional trigram.
  1236. */
  1237. /* arc label is valid */
  1238. return true;
  1239. }
  1240. /*
  1241. * Get state of expanded graph for given state key,
  1242. * and queue the state for processing if it didn't already exist.
  1243. */
  1244. static TrgmState *
  1245. getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
  1246. {
  1247. TrgmState *state;
  1248. bool found;
  1249. state = (TrgmState *) hash_search(trgmNFA->states, key, HASH_ENTER,
  1250. &found);
  1251. if (!found)
  1252. {
  1253. /* New state: initialize and queue it */
  1254. state->arcs = NIL;
  1255. state->enterKeys = NIL;
  1256. state->flags = 0;
  1257. /* states are initially given negative numbers */
  1258. state->snumber = -(++trgmNFA->nstates);
  1259. state->parent = NULL;
  1260. state->tentFlags = 0;
  1261. state->tentParent = NULL;
  1262. trgmNFA->queue = lappend(trgmNFA->queue, state);
  1263. }
  1264. return state;
  1265. }
  1266. /*
  1267. * Check if prefix1 "contains" prefix2.
  1268. *
  1269. * "contains" means that any exact prefix (with no ambiguity) that satisfies
  1270. * prefix2 also satisfies prefix1.
  1271. */
  1272. static bool
  1273. prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2)
  1274. {
  1275. if (prefix1->colors[1] == COLOR_UNKNOWN)
  1276. {
  1277. /* Fully ambiguous prefix contains everything */
  1278. return true;
  1279. }
  1280. else if (prefix1->colors[0] == COLOR_UNKNOWN)
  1281. {
  1282. /*
  1283. * Prefix with only first unknown color contains every prefix with
  1284. * same second color.
  1285. */
  1286. if (prefix1->colors[1] == prefix2->colors[1])
  1287. return true;
  1288. else
  1289. return false;
  1290. }
  1291. else
  1292. {
  1293. /* Exact prefix contains only the exact same prefix */
  1294. if (prefix1->colors[0] == prefix2->colors[0] &&
  1295. prefix1->colors[1] == prefix2->colors[1])
  1296. return true;
  1297. else
  1298. return false;
  1299. }
  1300. }
  1301. /*---------------------
  1302. * Subroutines for expanding color trigrams into regular trigrams (stage 3).
  1303. *---------------------
  1304. */
  1305. /*
  1306. * Get vector of all color trigrams in graph and select which of them
  1307. * to expand into simple trigrams.
  1308. *
  1309. * Returns true if OK, false if exhausted resource limits.
  1310. */
  1311. static bool
  1312. selectColorTrigrams(TrgmNFA *trgmNFA)
  1313. {
  1314. HASH_SEQ_STATUS scan_status;
  1315. int arcsCount = trgmNFA->arcsCount,
  1316. i;
  1317. TrgmState *state;
  1318. ColorTrgmInfo *colorTrgms;
  1319. int64 totalTrgmCount;
  1320. float4 totalTrgmPenalty;
  1321. int cnumber;
  1322. /* Collect color trigrams from all arcs */
  1323. colorTrgms = (ColorTrgmInfo *) palloc0(sizeof(ColorTrgmInfo) * arcsCount);
  1324. trgmNFA->colorTrgms = colorTrgms;
  1325. i = 0;
  1326. hash_seq_init(&scan_status, trgmNFA->states);
  1327. while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
  1328. {
  1329. ListCell *cell;
  1330. foreach(cell, state->arcs)
  1331. {
  1332. TrgmArc *arc = (TrgmArc *) lfirst(cell);
  1333. TrgmArcInfo *arcInfo = (TrgmArcInfo *) palloc(sizeof(TrgmArcInfo));
  1334. ColorTrgmInfo *trgmInfo = &colorTrgms[i];
  1335. arcInfo->source = state;
  1336. arcInfo->target = arc->target;
  1337. trgmInfo->ctrgm = arc->ctrgm;
  1338. trgmInfo->cnumber = -1;
  1339. /* count and penalty will be set below */
  1340. trgmInfo->expanded = true;
  1341. trgmInfo->arcs = list_make1(arcInfo);
  1342. i++;
  1343. }
  1344. }
  1345. Assert(i == arcsCount);
  1346. /* Remove duplicates, merging their arcs lists */
  1347. if (arcsCount >= 2)
  1348. {
  1349. ColorTrgmInfo *p1,
  1350. *p2;
  1351. /* Sort trigrams to ease duplicate detection */
  1352. qsort(colorTrgms, arcsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp);
  1353. /* p1 is probe point, p2 is last known non-duplicate. */
  1354. p2 = colorTrgms;
  1355. for (p1 = colorTrgms + 1; p1 < colorTrgms + arcsCount; p1++)
  1356. {
  1357. if (colorTrgmInfoCmp(p1, p2) > 0)
  1358. {
  1359. p2++;
  1360. *p2 = *p1;
  1361. }
  1362. else
  1363. {
  1364. p2->arcs = list_concat(p2->arcs, p1->arcs);
  1365. }
  1366. }
  1367. trgmNFA->colorTrgmsCount = (p2 - colorTrgms) + 1;
  1368. }
  1369. else
  1370. {
  1371. trgmNFA->colorTrgmsCount = arcsCount;
  1372. }
  1373. /*
  1374. * Count number of simple trigrams generated by each color trigram, and
  1375. * also compute a penalty value, which is the number of simple trigrams
  1376. * times a multiplier that depends on its whitespace content.
  1377. *
  1378. * Note: per-color-trigram counts cannot overflow an int so long as
  1379. * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about
  1380. * 1290. However, the grand total totalTrgmCount might conceivably
  1381. * overflow an int, so we use int64 for that within this routine. Also,
  1382. * penalties are calculated in float4 arithmetic to avoid any overflow
  1383. * worries.
  1384. */
  1385. totalTrgmCount = 0;
  1386. totalTrgmPenalty = 0.0f;
  1387. for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
  1388. {
  1389. ColorTrgmInfo *trgmInfo = &colorTrgms[i];
  1390. int j,
  1391. count = 1,
  1392. typeIndex = 0;
  1393. for (j = 0; j < 3; j++)
  1394. {
  1395. TrgmColor c = trgmInfo->ctrgm.colors[j];
  1396. typeIndex *= 2;
  1397. if (c == COLOR_BLANK)
  1398. typeIndex++;
  1399. else
  1400. count *= trgmNFA->colorInfo[c].wordCharsCount;
  1401. }
  1402. trgmInfo->count = count;
  1403. totalTrgmCount += count;
  1404. trgmInfo->penalty = penalties[typeIndex] * (float4) count;
  1405. totalTrgmPenalty += trgmInfo->penalty;
  1406. }
  1407. /* Sort color trigrams in descending order of their penalties */
  1408. qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
  1409. colorTrgmInfoPenaltyCmp);
  1410. /*
  1411. * Remove color trigrams from the graph so long as total penalty of color
  1412. * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to
  1413. * WISH_TRGM_PENALTY, it's OK so long as total count is no more than
  1414. * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher
  1415. * penalty, since those are the most promising for reducing the total
  1416. * penalty. When removing a color trigram we have to merge states
  1417. * connected by arcs labeled with that trigram. It's necessary to not
  1418. * merge initial and final states, because our graph becomes useless if
  1419. * that happens; so we cannot always remove the trigram we'd prefer to.
  1420. */
  1421. for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
  1422. {
  1423. ColorTrgmInfo *trgmInfo = &colorTrgms[i];
  1424. bool canRemove = true;
  1425. ListCell *cell;
  1426. /* Done if we've reached the target */
  1427. if (totalTrgmPenalty <= WISH_TRGM_PENALTY)
  1428. break;
  1429. #ifdef TRGM_REGEXP_DEBUG
  1430. fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n",
  1431. trgmInfo->ctrgm.colors[0],
  1432. trgmInfo->ctrgm.colors[1],
  1433. trgmInfo->ctrgm.colors[2],
  1434. trgmInfo->penalty,
  1435. list_length(trgmInfo->arcs));
  1436. #endif
  1437. /*
  1438. * Does any arc of this color trigram connect initial and final
  1439. * states? If so we can't remove it.
  1440. */
  1441. foreach(cell, trgmInfo->arcs)
  1442. {
  1443. TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
  1444. TrgmState *source = arcInfo->source,
  1445. *target = arcInfo->target;
  1446. int source_flags,
  1447. target_flags;
  1448. #ifdef TRGM_REGEXP_DEBUG
  1449. fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n",
  1450. -target->snumber, target->flags,
  1451. -source->snumber, source->flags);
  1452. #endif
  1453. /* examine parent states, if any merging has already happened */
  1454. while (source->parent)
  1455. source = source->parent;
  1456. while (target->parent)
  1457. target = target->parent;
  1458. #ifdef TRGM_REGEXP_DEBUG
  1459. fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n",
  1460. -target->snumber, target->flags,
  1461. -source->snumber, source->flags);
  1462. #endif
  1463. /* we must also consider merges we are planning right now */
  1464. source_flags = source->flags | source->tentFlags;
  1465. while (source->tentParent)
  1466. {
  1467. source = source->tentParent;
  1468. source_flags |= source->flags | source->tentFlags;
  1469. }
  1470. target_flags = target->flags | target->tentFlags;
  1471. while (target->tentParent)
  1472. {
  1473. target = target->tentParent;
  1474. target_flags |= target->flags | target->tentFlags;
  1475. }
  1476. #ifdef TRGM_REGEXP_DEBUG
  1477. fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n",
  1478. -target->snumber, target_flags,
  1479. -source->snumber, source_flags);
  1480. #endif
  1481. /* would fully-merged state have both INIT and FIN set? */
  1482. if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) ==
  1483. (TSTATE_INIT | TSTATE_FIN))
  1484. {
  1485. canRemove = false;
  1486. break;
  1487. }
  1488. /* ok so far, so remember planned merge */
  1489. if (source != target)
  1490. {
  1491. #ifdef TRGM_REGEXP_DEBUG
  1492. fprintf(stderr, " ... tentatively merging s%d into s%d\n",
  1493. -target->snumber, -source->snumber);
  1494. #endif
  1495. target->tentParent = source;
  1496. source->tentFlags |= target_flags;
  1497. }
  1498. }
  1499. /*
  1500. * We must reset all the tentFlags/tentParent fields before
  1501. * continuing. tentFlags could only have become set in states that
  1502. * are the source or parent or tentative parent of one of the current
  1503. * arcs; likewise tentParent could only have become set in states that
  1504. * are the target or parent or tentative parent of one of the current
  1505. * arcs. There might be some overlap between those sets, but if we
  1506. * clear tentFlags in target states as well as source states, we
  1507. * should be okay even if we visit a state as target before visiting
  1508. * it as a source.
  1509. */
  1510. foreach(cell, trgmInfo->arcs)
  1511. {
  1512. TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
  1513. TrgmState *source = arcInfo->source,
  1514. *target = arcInfo->target;
  1515. TrgmState *ttarget;
  1516. /* no need to touch previously-merged states */
  1517. while (source->parent)
  1518. source = source->parent;
  1519. while (target->parent)
  1520. target = target->parent;
  1521. while (source)
  1522. {
  1523. source->tentFlags = 0;
  1524. source = source->tentParent;
  1525. }
  1526. while ((ttarget = target->tentParent) != NULL)
  1527. {
  1528. target->tentParent = NULL;
  1529. target->tentFlags = 0; /* in case it was also a source */
  1530. target = ttarget;
  1531. }
  1532. }
  1533. /* Now, move on if we can't drop this trigram */
  1534. if (!canRemove)
  1535. {
  1536. #ifdef TRGM_REGEXP_DEBUG
  1537. fprintf(stderr, " ... not ok to merge\n");
  1538. #endif
  1539. continue;
  1540. }
  1541. /* OK, merge states linked by each arc labeled by the trigram */
  1542. foreach(cell, trgmInfo->arcs)
  1543. {
  1544. TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
  1545. TrgmState *source = arcInfo->source,
  1546. *target = arcInfo->target;
  1547. while (source->parent)
  1548. source = source->parent;
  1549. while (target->parent)
  1550. target = target->parent;
  1551. if (source != target)
  1552. {
  1553. #ifdef TRGM_REGEXP_DEBUG
  1554. fprintf(stderr, "merging s%d into s%d\n",
  1555. -target->snumber, -source->snumber);
  1556. #endif
  1557. mergeStates(source, target);
  1558. /* Assert we didn't merge initial and final states */
  1559. Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
  1560. (TSTATE_INIT | TSTATE_FIN));
  1561. }
  1562. }
  1563. /* Mark trigram unexpanded, and update totals */
  1564. trgmInfo->expanded = false;
  1565. totalTrgmCount -= trgmInfo->count;
  1566. totalTrgmPenalty -= trgmInfo->penalty;
  1567. }
  1568. /* Did we succeed in fitting into MAX_TRGM_COUNT? */
  1569. if (totalTrgmCount > MAX_TRGM_COUNT)
  1570. return false;
  1571. trgmNFA->totalTrgmCount = (int) totalTrgmCount;
  1572. /*
  1573. * Sort color trigrams by colors (will be useful for bsearch in packGraph)
  1574. * and enumerate the color trigrams that are expanded.
  1575. */
  1576. cnumber = 0;
  1577. qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
  1578. colorTrgmInfoCmp);
  1579. for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
  1580. {
  1581. if (colorTrgms[i].expanded)
  1582. {
  1583. colorTrgms[i].cnumber = cnumber;
  1584. cnumber++;
  1585. }
  1586. }
  1587. return true;
  1588. }
  1589. /*
  1590. * Expand selected color trigrams into regular trigrams.
  1591. *
  1592. * Returns the TRGM array to be passed to the index machinery.
  1593. * The array must be allocated in rcontext.
  1594. */
  1595. static TRGM *
  1596. expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext)
  1597. {
  1598. TRGM *trg;
  1599. trgm *p;
  1600. int i;
  1601. TrgmColorInfo blankColor;
  1602. trgm_mb_char blankChar;
  1603. /* Set up "blank" color structure containing a single zero character */
  1604. memset(blankChar.bytes, 0, sizeof(blankChar.bytes));
  1605. blankColor.wordCharsCount = 1;
  1606. blankColor.wordChars = &blankChar;
  1607. /* Construct the trgm array */
  1608. trg = (TRGM *)
  1609. MemoryContextAllocZero(rcontext,
  1610. TRGMHDRSIZE +
  1611. trgmNFA->totalTrgmCount * sizeof(trgm));
  1612. trg->flag = ARRKEY;
  1613. SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, trgmNFA->totalTrgmCount));
  1614. p = GETARR(trg);
  1615. for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
  1616. {
  1617. ColorTrgmInfo *colorTrgm = &trgmNFA->colorTrgms[i];
  1618. TrgmColorInfo *c[3];
  1619. trgm_mb_char s[3];
  1620. int j,
  1621. i1,
  1622. i2,
  1623. i3;
  1624. /* Ignore any unexpanded trigrams ... */
  1625. if (!colorTrgm->expanded)
  1626. continue;
  1627. /* Get colors, substituting the dummy struct for COLOR_BLANK */
  1628. for (j = 0; j < 3; j++)
  1629. {
  1630. if (colorTrgm->ctrgm.colors[j] != COLOR_BLANK)
  1631. c[j] = &trgmNFA->colorInfo[colorTrgm->ctrgm.colors[j]];
  1632. else
  1633. c[j] = &blankColor;
  1634. }
  1635. /* Iterate over all possible combinations of colors' characters */
  1636. for (i1 = 0; i1 < c[0]->wordCharsCount; i1++)
  1637. {
  1638. s[0] = c[0]->wordChars[i1];
  1639. for (i2 = 0; i2 < c[1]->wordCharsCount; i2++)
  1640. {
  1641. s[1] = c[1]->wordChars[i2];
  1642. for (i3 = 0; i3 < c[2]->wordCharsCount; i3++)
  1643. {
  1644. s[2] = c[2]->wordChars[i3];
  1645. fillTrgm(p, s);
  1646. p++;
  1647. }
  1648. }
  1649. }
  1650. }
  1651. return trg;
  1652. }
  1653. /*
  1654. * Convert trigram into trgm datatype.
  1655. */
  1656. static void
  1657. fillTrgm(trgm *ptrgm, trgm_mb_char s[3])
  1658. {
  1659. char str[3 * MAX_MULTIBYTE_CHAR_LEN],
  1660. *p;
  1661. int i,
  1662. j;
  1663. /* Write multibyte string into "str" (we don't need null termination) */
  1664. p = str;
  1665. for (i = 0; i < 3; i++)
  1666. {
  1667. if (s[i].bytes[0] != 0)
  1668. {
  1669. for (j = 0; j < MAX_MULTIBYTE_CHAR_LEN && s[i].bytes[j]; j++)
  1670. *p++ = s[i].bytes[j];
  1671. }
  1672. else
  1673. {
  1674. /* Emit a space in place of COLOR_BLANK */
  1675. *p++ = ' ';
  1676. }
  1677. }
  1678. /* Convert "str" to a standard trigram (possibly hashing it) */
  1679. compact_trigram(ptrgm, str, p - str);
  1680. }
  1681. /*
  1682. * Merge two states of graph.
  1683. */
  1684. static void
  1685. mergeStates(TrgmState *state1, TrgmState *state2)
  1686. {
  1687. Assert(state1 != state2);
  1688. Assert(!state1->parent);
  1689. Assert(!state2->parent);
  1690. /* state1 absorbs state2's flags */
  1691. state1->flags |= state2->flags;
  1692. /* state2, and indirectly all its children, become children of state1 */
  1693. state2->parent = state1;
  1694. }
  1695. /*
  1696. * Compare function for sorting of color trigrams by their colors.
  1697. */
  1698. static int
  1699. colorTrgmInfoCmp(const void *p1, const void *p2)
  1700. {
  1701. const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1;
  1702. const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2;
  1703. return memcmp(&c1->ctrgm, &c2->ctrgm, sizeof(ColorTrgm));
  1704. }
  1705. /*
  1706. * Compare function for sorting color trigrams in descending order of
  1707. * their penalty fields.
  1708. */
  1709. static int
  1710. colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
  1711. {
  1712. float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty;
  1713. float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty;
  1714. if (penalty1 < penalty2)
  1715. return 1;
  1716. else if (penalty1 == penalty2)
  1717. return 0;
  1718. else
  1719. return -1;
  1720. }
/*---------------------
 * Subroutines for packing the graph into final representation (stage 4).
 *---------------------
 */
  1725. /*
  1726. * Pack expanded graph into final representation.
  1727. *
  1728. * The result data must be allocated in rcontext.
  1729. */
  1730. static TrgmPackedGraph *
  1731. packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
  1732. {
  1733. int snumber = 2,
  1734. arcIndex,
  1735. arcsCount;
  1736. HASH_SEQ_STATUS scan_status;
  1737. TrgmState *state;
  1738. TrgmPackArcInfo *arcs,
  1739. *p1,
  1740. *p2;
  1741. TrgmPackedArc *packedArcs;
  1742. TrgmPackedGraph *result;
  1743. int i,
  1744. j;
  1745. /* Enumerate surviving states, giving init and fin reserved numbers */
  1746. hash_seq_init(&scan_status, trgmNFA->states);
  1747. while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
  1748. {
  1749. while (state->parent)
  1750. state = state->parent;
  1751. if (state->snumber < 0)
  1752. {
  1753. if (state->flags & TSTATE_INIT)
  1754. state->snumber = 0;
  1755. else if (state->flags & TSTATE_FIN)
  1756. state->snumber = 1;
  1757. else
  1758. {
  1759. state->snumber = snumber;
  1760. snumber++;
  1761. }
  1762. }
  1763. }
  1764. /* Collect array of all arcs */
  1765. arcs = (TrgmPackArcInfo *)
  1766. palloc(sizeof(TrgmPackArcInfo) * trgmNFA->arcsCount);
  1767. arcIndex = 0;
  1768. hash_seq_init(&scan_status, trgmNFA->states);
  1769. while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
  1770. {
  1771. TrgmState *source = state;
  1772. ListCell *cell;
  1773. while (source->parent)
  1774. source = source->parent;
  1775. foreach(cell, state->arcs)
  1776. {
  1777. TrgmArc *arc = (TrgmArc *) lfirst(cell);
  1778. TrgmState *target = arc->target;
  1779. while (target->parent)
  1780. target = target->parent;
  1781. if (source->snumber != target->snumber)
  1782. {
  1783. ColorTrgmInfo *ctrgm;
  1784. ctrgm = (ColorTrgmInfo *) bsearch(&arc->ctrgm,
  1785. trgmNFA->colorTrgms,
  1786. trgmNFA->colorTrgmsCount,
  1787. sizeof(ColorTrgmInfo),
  1788. colorTrgmInfoCmp);
  1789. Assert(ctrgm != NULL);
  1790. Assert(ctrgm->expanded);
  1791. arcs[arcIndex].sourceState = source->snumber;
  1792. arcs[arcIndex].targetState = target->snumber;
  1793. arcs[arcIndex].colorTrgm = ctrgm->cnumber;
  1794. arcIndex++;
  1795. }
  1796. }
  1797. }
  1798. /* Sort arcs to ease duplicate detection */
  1799. qsort(arcs, arcIndex, sizeof(TrgmPackArcInfo), packArcInfoCmp);
  1800. /* We could have duplicates because states were merged. Remove them. */
  1801. /* p1 is probe point, p2 is last known non-duplicate. */
  1802. p2 = arcs;
  1803. for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++)
  1804. {
  1805. if (packArcInfoCmp(p1, p2) > 0)
  1806. {
  1807. p2++;
  1808. *p2 = *p1;
  1809. }
  1810. }
  1811. arcsCount = (p2 - arcs) + 1;
  1812. /* Create packed representation */
  1813. result = (TrgmPackedGraph *)
  1814. MemoryContextAlloc(rcontext, sizeof(TrgmPackedGraph));
  1815. /* Pack color trigrams information */
  1816. result->colorTrigramsCount = 0;
  1817. for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
  1818. {
  1819. if (trgmNFA->colorTrgms[i].expanded)
  1820. result->colorTrigramsCount++;
  1821. }
  1822. result->colorTrigramGroups = (int *)
  1823. MemoryContextAlloc(rcontext, sizeof(int) * result->colorTrigramsCount);
  1824. j = 0;
  1825. for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
  1826. {
  1827. if (trgmNFA->colorTrgms[i].expanded)
  1828. {
  1829. result->colorTrigramGroups[j] = trgmNFA->colorTrgms[i].count;
  1830. j++;
  1831. }
  1832. }
  1833. /* Pack states and arcs information */
  1834. result->statesCount = snumber;
  1835. result->states = (TrgmPackedState *)
  1836. MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState));
  1837. packedArcs = (TrgmPackedArc *)
  1838. MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc));
  1839. j = 0;
  1840. for (i = 0; i < snumber; i++)
  1841. {
  1842. int cnt = 0;
  1843. result->states[i].arcs = &packedArcs[j];
  1844. while (j < arcsCount && arcs[j].sourceState == i)
  1845. {
  1846. packedArcs[j].targetState = arcs[j].targetState;
  1847. packedArcs[j].colorTrgm = arcs[j].colorTrgm;
  1848. cnt++;
  1849. j++;
  1850. }
  1851. result->states[i].arcsCount = cnt;
  1852. }
  1853. /* Allocate working memory for trigramsMatchGraph() */
  1854. result->colorTrigramsActive = (bool *)
  1855. MemoryContextAlloc(rcontext, sizeof(bool) * result->colorTrigramsCount);
  1856. result->statesActive = (bool *)
  1857. MemoryContextAlloc(rcontext, sizeof(bool) * result->statesCount);
  1858. result->statesQueue = (int *)
  1859. MemoryContextAlloc(rcontext, sizeof(int) * result->statesCount);
  1860. return result;
  1861. }
  1862. /*
  1863. * Comparison function for sorting TrgmPackArcInfos.
  1864. *
  1865. * Compares arcs in following order: sourceState, colorTrgm, targetState.
  1866. */
  1867. static int
  1868. packArcInfoCmp(const void *a1, const void *a2)
  1869. {
  1870. const TrgmPackArcInfo *p1 = (const TrgmPackArcInfo *) a1;
  1871. const TrgmPackArcInfo *p2 = (const TrgmPackArcInfo *) a2;
  1872. if (p1->sourceState < p2->sourceState)
  1873. return -1;
  1874. if (p1->sourceState > p2->sourceState)
  1875. return 1;
  1876. if (p1->colorTrgm < p2->colorTrgm)
  1877. return -1;
  1878. if (p1->colorTrgm > p2->colorTrgm)
  1879. return 1;
  1880. if (p1->targetState < p2->targetState)
  1881. return -1;
  1882. if (p1->targetState > p2->targetState)
  1883. return 1;
  1884. return 0;
  1885. }
/*---------------------
 * Debugging functions
 *
 * These are designed to emit GraphViz files.
 *---------------------
 */
  1892. #ifdef TRGM_REGEXP_DEBUG
  1893. /*
  1894. * Print initial NFA, in regexp library's representation
  1895. */
  1896. static void
  1897. printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors)
  1898. {
  1899. StringInfoData buf;
  1900. int nstates = pg_reg_getnumstates(regex);
  1901. int state;
  1902. int i;
  1903. initStringInfo(&buf);
  1904. appendStringInfoString(&buf, "\ndigraph sourceNFA {\n");
  1905. for (state = 0; state < nstates; state++)
  1906. {
  1907. regex_arc_t *arcs;
  1908. int i,
  1909. arcsCount;
  1910. appendStringInfo(&buf, "s%d", state);
  1911. if (pg_reg_getfinalstate(regex) == state)
  1912. appendStringInfoString(&buf, " [shape = doublecircle]");
  1913. appendStringInfoString(&buf, ";\n");
  1914. arcsCount = pg_reg_getnumoutarcs(regex, state);
  1915. arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount);
  1916. pg_reg_getoutarcs(regex, state, arcs, arcsCount);
  1917. for (i = 0; i < arcsCount; i++)
  1918. {
  1919. appendStringInfo(&buf, " s%d -> s%d [label = \"%d\"];\n",
  1920. state, arcs[i].to, arcs[i].co);
  1921. }
  1922. pfree(arcs);
  1923. }
  1924. appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
  1925. appendStringInfo(&buf, " initial -> s%d;\n",
  1926. pg_reg_getinitialstate(regex));
  1927. /* Print colors */
  1928. appendStringInfoString(&buf, " { rank = sink;\n");
  1929. appendStringInfoString(&buf, " Colors [shape = none, margin=0, label=<\n");
  1930. for (i = 0; i < ncolors; i++)
  1931. {
  1932. TrgmColorInfo *color = &colors[i];
  1933. int j;
  1934. appendStringInfo(&buf, "<br/>Color %d: ", i);
  1935. if (color->expandable)
  1936. {
  1937. for (j = 0; j < color->wordCharsCount; j++)
  1938. {
  1939. char s[MAX_MULTIBYTE_CHAR_LEN + 1];
  1940. memcpy(s, color->wordChars[j].bytes, MAX_MULTIBYTE_CHAR_LEN);
  1941. s[MAX_MULTIBYTE_CHAR_LEN] = '\0';
  1942. appendStringInfoString(&buf, s);
  1943. }
  1944. }
  1945. else
  1946. appendStringInfoString(&buf, "not expandable");
  1947. appendStringInfoChar(&buf, '\n');
  1948. }
  1949. appendStringInfoString(&buf, " >];\n");
  1950. appendStringInfoString(&buf, " }\n");
  1951. appendStringInfoString(&buf, "}\n");
  1952. {
  1953. /* dot -Tpng -o /tmp/source.png < /tmp/source.dot */
  1954. FILE *fp = fopen("/tmp/source.dot", "w");
  1955. fprintf(fp, "%s", buf.data);
  1956. fclose(fp);
  1957. }
  1958. pfree(buf.data);
  1959. }
  1960. /*
  1961. * Print expanded graph.
  1962. */
  1963. static void
  1964. printTrgmNFA(TrgmNFA *trgmNFA)
  1965. {
  1966. StringInfoData buf;
  1967. HASH_SEQ_STATUS scan_status;
  1968. TrgmState *state;
  1969. TrgmState *initstate = NULL;
  1970. initStringInfo(&buf);
  1971. appendStringInfoString(&buf, "\ndigraph transformedNFA {\n");
  1972. hash_seq_init(&scan_status, trgmNFA->states);
  1973. while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
  1974. {
  1975. ListCell *cell;
  1976. appendStringInfo(&buf, "s%d", -state->snumber);
  1977. if (state->flags & TSTATE_FIN)
  1978. appendStringInfoString(&buf, " [shape = doublecircle]");
  1979. if (state->flags & TSTATE_INIT)
  1980. initstate = state;
  1981. appendStringInfo(&buf, " [label = \"%d\"]", state->stateKey.nstate);
  1982. appendStringInfoString(&buf, ";\n");
  1983. foreach(cell, state->arcs)
  1984. {
  1985. TrgmArc *arc = (TrgmArc *) lfirst(cell);
  1986. appendStringInfo(&buf, " s%d -> s%d [label = \"",
  1987. -state->snumber, -arc->target->snumber);
  1988. printTrgmColor(&buf, arc->ctrgm.colors[0]);
  1989. appendStringInfoChar(&buf, ' ');
  1990. printTrgmColor(&buf, arc->ctrgm.colors[1]);
  1991. appendStringInfoChar(&buf, ' ');
  1992. printTrgmColor(&buf, arc->ctrgm.colors[2]);
  1993. appendStringInfoString(&buf, "\"];\n");
  1994. }
  1995. }
  1996. if (initstate)
  1997. {
  1998. appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
  1999. appendStringInfo(&buf, " initial -> s%d;\n", -initstate->snumber);
  2000. }
  2001. appendStringInfoString(&buf, "}\n");
  2002. {
  2003. /* dot -Tpng -o /tmp/transformed.png < /tmp/transformed.dot */
  2004. FILE *fp = fopen("/tmp/transformed.dot", "w");
  2005. fprintf(fp, "%s", buf.data);
  2006. fclose(fp);
  2007. }
  2008. pfree(buf.data);
  2009. }
  2010. /*
  2011. * Print a TrgmColor readably.
  2012. */
  2013. static void
  2014. printTrgmColor(StringInfo buf, TrgmColor co)
  2015. {
  2016. if (co == COLOR_UNKNOWN)
  2017. appendStringInfoChar(buf, 'u');
  2018. else if (co == COLOR_BLANK)
  2019. appendStringInfoChar(buf, 'b');
  2020. else
  2021. appendStringInfo(buf, "%d", (int) co);
  2022. }
  2023. /*
  2024. * Print final packed representation of trigram-based expanded graph.
  2025. */
  2026. static void
  2027. printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams)
  2028. {
  2029. StringInfoData buf;
  2030. trgm *p;
  2031. int i;
  2032. initStringInfo(&buf);
  2033. appendStringInfoString(&buf, "\ndigraph packedGraph {\n");
  2034. for (i = 0; i < packedGraph->statesCount; i++)
  2035. {
  2036. TrgmPackedState *state = &packedGraph->states[i];
  2037. int j;
  2038. appendStringInfo(&buf, " s%d", i);
  2039. if (i == 1)
  2040. appendStringInfoString(&buf, " [shape = doublecircle]");
  2041. appendStringInfo(&buf, " [label = <s%d>];\n", i);
  2042. for (j = 0; j < state->arcsCount; j++)
  2043. {
  2044. TrgmPackedArc *arc = &state->arcs[j];
  2045. appendStringInfo(&buf, " s%d -> s%d [label = \"trigram %d\"];\n",
  2046. i, arc->targetState, arc->colorTrgm);
  2047. }
  2048. }
  2049. appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
  2050. appendStringInfo(&buf, " initial -> s%d;\n", 0);
  2051. /* Print trigrams */
  2052. appendStringInfoString(&buf, " { rank = sink;\n");
  2053. appendStringInfoString(&buf, " Trigrams [shape = none, margin=0, label=<\n");
  2054. p = GETARR(trigrams);
  2055. for (i = 0; i < packedGraph->colorTrigramsCount; i++)
  2056. {
  2057. int count = packedGraph->colorTrigramGroups[i];
  2058. int j;
  2059. appendStringInfo(&buf, "<br/>Trigram %d: ", i);
  2060. for (j = 0; j < count; j++)
  2061. {
  2062. if (j > 0)
  2063. appendStringInfoString(&buf, ", ");
  2064. /*
  2065. * XXX This representation is nice only for all-ASCII trigrams.
  2066. */
  2067. appendStringInfo(&buf, "\"%c%c%c\"", (*p)[0], (*p)[1], (*p)[2]);
  2068. p++;
  2069. }
  2070. }
  2071. appendStringInfoString(&buf, " >];\n");
  2072. appendStringInfoString(&buf, " }\n");
  2073. appendStringInfoString(&buf, "}\n");
  2074. {
  2075. /* dot -Tpng -o /tmp/packed.png < /tmp/packed.dot */
  2076. FILE *fp = fopen("/tmp/packed.dot", "w");
  2077. fprintf(fp, "%s", buf.data);
  2078. fclose(fp);
  2079. }
  2080. pfree(buf.data);
  2081. }
  2082. #endif /* TRGM_REGEXP_DEBUG */