You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

ginget.c 52KB


  1. /*-------------------------------------------------------------------------
  2. *
  3. * ginget.c
  4. * fetch tuples from a GIN scan.
  5. *
  6. *
  7. * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
  8. * Portions Copyright (c) 1994, Regents of the University of California
  9. *
  10. * IDENTIFICATION
  11. * src/backend/access/gin/ginget.c
  12. *-------------------------------------------------------------------------
  13. */
  14. #include "postgres.h"
  15. #include "access/gin_private.h"
  16. #include "access/relscan.h"
  17. #include "miscadmin.h"
  18. #include "storage/predicate.h"
  19. #include "utils/datum.h"
  20. #include "utils/memutils.h"
  21. #include "utils/rel.h"
  /*
   * GUC parameter.  A value above zero lets startScan() flag scan entries
   * for randomized result reduction; dropItem() then uses the value as the
   * numerator of the ratio it compares a random number against.  Zero (the
   * initial value) leaves result reduction off.
   */
  int GinFuzzySearchLimit = 0;
  24. typedef struct pendingPosition
  25. {
  26. Buffer pendingBuffer;
  27. OffsetNumber firstOffset;
  28. OffsetNumber lastOffset;
  29. ItemPointerData item;
  30. bool *hasMatchKey;
  31. } pendingPosition;
  32. /*
  33. * Goes to the next page if current offset is outside of bounds
  34. */
  35. static bool
  36. moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot)
  37. {
  38. Page page = BufferGetPage(stack->buffer);
  39. if (stack->off > PageGetMaxOffsetNumber(page))
  40. {
  41. /*
  42. * We scanned the whole page, so we should take right page
  43. */
  44. if (GinPageRightMost(page))
  45. return false; /* no more pages */
  46. stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
  47. stack->blkno = BufferGetBlockNumber(stack->buffer);
  48. stack->off = FirstOffsetNumber;
  49. PredicateLockPage(btree->index, stack->blkno, snapshot);
  50. }
  51. return true;
  52. }
  53. /*
  54. * Scan all pages of a posting tree and save all its heap ItemPointers
  55. * in scanEntry->matchBitmap
  56. */
  57. static void
  58. scanPostingTree(Relation index, GinScanEntry scanEntry,
  59. BlockNumber rootPostingTree, Snapshot snapshot)
  60. {
  61. GinBtreeData btree;
  62. GinBtreeStack *stack;
  63. Buffer buffer;
  64. Page page;
  65. /* Descend to the leftmost leaf page */
  66. stack = ginScanBeginPostingTree(&btree, index, rootPostingTree, snapshot);
  67. buffer = stack->buffer;
  68. IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
  69. freeGinBtreeStack(stack);
  70. /*
  71. * Loop iterates through all leaf pages of posting tree
  72. */
  73. for (;;)
  74. {
  75. page = BufferGetPage(buffer);
  76. if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
  77. {
  78. int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap);
  79. scanEntry->predictNumberResult += n;
  80. }
  81. if (GinPageRightMost(page))
  82. break; /* no more pages */
  83. buffer = ginStepRight(buffer, index, GIN_SHARE);
  84. }
  85. UnlockReleaseBuffer(buffer);
  86. }
  87. /*
  88. * Collects TIDs into scanEntry->matchBitmap for all heap tuples that
  89. * match the search entry. This supports three different match modes:
  90. *
  91. * 1. Partial-match support: scan from current point until the
  92. * comparePartialFn says we're done.
  93. * 2. SEARCH_MODE_ALL: scan from current point (which should be first
  94. * key for the current attnum) until we hit null items or end of attnum
  95. * 3. SEARCH_MODE_EVERYTHING: scan from current point (which should be first
  96. * key for the current attnum) until we hit end of attnum
  97. *
  98. * Returns true if done, false if it's necessary to restart scan from scratch
  99. */
  100. static bool
  101. collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
  102. GinScanEntry scanEntry, Snapshot snapshot)
  103. {
  104. OffsetNumber attnum;
  105. Form_pg_attribute attr;
  106. /* Initialize empty bitmap result */
  107. scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL);
  108. /* Null query cannot partial-match anything */
  109. if (scanEntry->isPartialMatch &&
  110. scanEntry->queryCategory != GIN_CAT_NORM_KEY)
  111. return true;
  112. /* Locate tupdesc entry for key column (for attbyval/attlen data) */
  113. attnum = scanEntry->attnum;
  114. attr = TupleDescAttr(btree->ginstate->origTupdesc, attnum - 1);
  115. /*
  116. * Predicate lock entry leaf page, following pages will be locked by
  117. * moveRightIfItNeeded()
  118. */
  119. PredicateLockPage(btree->index, stack->buffer, snapshot);
  120. for (;;)
  121. {
  122. Page page;
  123. IndexTuple itup;
  124. Datum idatum;
  125. GinNullCategory icategory;
  126. /*
  127. * stack->off points to the interested entry, buffer is already locked
  128. */
  129. if (moveRightIfItNeeded(btree, stack, snapshot) == false)
  130. return true;
  131. page = BufferGetPage(stack->buffer);
  132. TestForOldSnapshot(snapshot, btree->index, page);
  133. itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
  134. /*
  135. * If tuple stores another attribute then stop scan
  136. */
  137. if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
  138. return true;
  139. /* Safe to fetch attribute value */
  140. idatum = gintuple_get_key(btree->ginstate, itup, &icategory);
  141. /*
  142. * Check for appropriate scan stop conditions
  143. */
  144. if (scanEntry->isPartialMatch)
  145. {
  146. int32 cmp;
  147. /*
  148. * In partial match, stop scan at any null (including
  149. * placeholders); partial matches never match nulls
  150. */
  151. if (icategory != GIN_CAT_NORM_KEY)
  152. return true;
  153. /*----------
  154. * Check of partial match.
  155. * case cmp == 0 => match
  156. * case cmp > 0 => not match and finish scan
  157. * case cmp < 0 => not match and continue scan
  158. *----------
  159. */
  160. cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1],
  161. btree->ginstate->supportCollation[attnum - 1],
  162. scanEntry->queryKey,
  163. idatum,
  164. UInt16GetDatum(scanEntry->strategy),
  165. PointerGetDatum(scanEntry->extra_data)));
  166. if (cmp > 0)
  167. return true;
  168. else if (cmp < 0)
  169. {
  170. stack->off++;
  171. continue;
  172. }
  173. }
  174. else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL)
  175. {
  176. /*
  177. * In ALL mode, we are not interested in null items, so we can
  178. * stop if we get to a null-item placeholder (which will be the
  179. * last entry for a given attnum). We do want to include NULL_KEY
  180. * and EMPTY_ITEM entries, though.
  181. */
  182. if (icategory == GIN_CAT_NULL_ITEM)
  183. return true;
  184. }
  185. /*
  186. * OK, we want to return the TIDs listed in this entry.
  187. */
  188. if (GinIsPostingTree(itup))
  189. {
  190. BlockNumber rootPostingTree = GinGetPostingTree(itup);
  191. /*
  192. * We should unlock current page (but not unpin) during tree scan
  193. * to prevent deadlock with vacuum processes.
  194. *
  195. * We save current entry value (idatum) to be able to re-find our
  196. * tuple after re-locking
  197. */
  198. if (icategory == GIN_CAT_NORM_KEY)
  199. idatum = datumCopy(idatum, attr->attbyval, attr->attlen);
  200. LockBuffer(stack->buffer, GIN_UNLOCK);
  201. /*
  202. * Acquire predicate lock on the posting tree. We already hold a
  203. * lock on the entry page, but insertions to the posting tree
  204. * don't check for conflicts on that level.
  205. */
  206. PredicateLockPage(btree->index, rootPostingTree, snapshot);
  207. /* Collect all the TIDs in this entry's posting tree */
  208. scanPostingTree(btree->index, scanEntry, rootPostingTree,
  209. snapshot);
  210. /*
  211. * We lock again the entry page and while it was unlocked insert
  212. * might have occurred, so we need to re-find our position.
  213. */
  214. LockBuffer(stack->buffer, GIN_SHARE);
  215. page = BufferGetPage(stack->buffer);
  216. if (!GinPageIsLeaf(page))
  217. {
  218. /*
  219. * Root page becomes non-leaf while we unlock it. We will
  220. * start again, this situation doesn't occur often - root can
  221. * became a non-leaf only once per life of index.
  222. */
  223. return false;
  224. }
  225. /* Search forward to re-find idatum */
  226. for (;;)
  227. {
  228. Datum newDatum;
  229. GinNullCategory newCategory;
  230. if (moveRightIfItNeeded(btree, stack, snapshot) == false)
  231. elog(ERROR, "lost saved point in index"); /* must not happen !!! */
  232. page = BufferGetPage(stack->buffer);
  233. itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
  234. if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
  235. elog(ERROR, "lost saved point in index"); /* must not happen !!! */
  236. newDatum = gintuple_get_key(btree->ginstate, itup,
  237. &newCategory);
  238. if (ginCompareEntries(btree->ginstate, attnum,
  239. newDatum, newCategory,
  240. idatum, icategory) == 0)
  241. break; /* Found! */
  242. stack->off++;
  243. }
  244. if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval)
  245. pfree(DatumGetPointer(idatum));
  246. }
  247. else
  248. {
  249. ItemPointer ipd;
  250. int nipd;
  251. ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd);
  252. tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false);
  253. scanEntry->predictNumberResult += GinGetNPosting(itup);
  254. pfree(ipd);
  255. }
  256. /*
  257. * Done with this entry, go to the next
  258. */
  259. stack->off++;
  260. }
  261. }
  262. /*
  263. * Start* functions setup beginning state of searches: finds correct buffer and pins it.
  264. */
  265. static void
  266. startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot)
  267. {
  268. GinBtreeData btreeEntry;
  269. GinBtreeStack *stackEntry;
  270. Page page;
  271. bool needUnlock;
  272. restartScanEntry:
  273. entry->buffer = InvalidBuffer;
  274. ItemPointerSetMin(&entry->curItem);
  275. entry->offset = InvalidOffsetNumber;
  276. if (entry->list)
  277. pfree(entry->list);
  278. entry->list = NULL;
  279. entry->nlist = 0;
  280. entry->matchBitmap = NULL;
  281. entry->matchResult = NULL;
  282. entry->reduceResult = false;
  283. entry->predictNumberResult = 0;
  284. /*
  285. * we should find entry, and begin scan of posting tree or just store
  286. * posting list in memory
  287. */
  288. ginPrepareEntryScan(&btreeEntry, entry->attnum,
  289. entry->queryKey, entry->queryCategory,
  290. ginstate);
  291. stackEntry = ginFindLeafPage(&btreeEntry, true, false, snapshot);
  292. page = BufferGetPage(stackEntry->buffer);
  293. /* ginFindLeafPage() will have already checked snapshot age. */
  294. needUnlock = true;
  295. entry->isFinished = true;
  296. if (entry->isPartialMatch ||
  297. entry->queryCategory == GIN_CAT_EMPTY_QUERY)
  298. {
  299. /*
  300. * btreeEntry.findItem locates the first item >= given search key.
  301. * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item
  302. * because of the way the GIN_CAT_EMPTY_QUERY category code is
  303. * assigned.) We scan forward from there and collect all TIDs needed
  304. * for the entry type.
  305. */
  306. btreeEntry.findItem(&btreeEntry, stackEntry);
  307. if (collectMatchBitmap(&btreeEntry, stackEntry, entry, snapshot)
  308. == false)
  309. {
  310. /*
  311. * GIN tree was seriously restructured, so we will cleanup all
  312. * found data and rescan. See comments near 'return false' in
  313. * collectMatchBitmap()
  314. */
  315. if (entry->matchBitmap)
  316. {
  317. if (entry->matchIterator)
  318. tbm_end_iterate(entry->matchIterator);
  319. entry->matchIterator = NULL;
  320. tbm_free(entry->matchBitmap);
  321. entry->matchBitmap = NULL;
  322. }
  323. LockBuffer(stackEntry->buffer, GIN_UNLOCK);
  324. freeGinBtreeStack(stackEntry);
  325. goto restartScanEntry;
  326. }
  327. if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap))
  328. {
  329. entry->matchIterator = tbm_begin_iterate(entry->matchBitmap);
  330. entry->isFinished = false;
  331. }
  332. }
  333. else if (btreeEntry.findItem(&btreeEntry, stackEntry))
  334. {
  335. IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
  336. if (GinIsPostingTree(itup))
  337. {
  338. BlockNumber rootPostingTree = GinGetPostingTree(itup);
  339. GinBtreeStack *stack;
  340. Page page;
  341. ItemPointerData minItem;
  342. /*
  343. * This is an equality scan, so lock the root of the posting tree.
  344. * It represents a lock on the exact key value, and covers all the
  345. * items in the posting tree.
  346. */
  347. PredicateLockPage(ginstate->index, rootPostingTree, snapshot);
  348. /*
  349. * We should unlock entry page before touching posting tree to
  350. * prevent deadlocks with vacuum processes. Because entry is never
  351. * deleted from page and posting tree is never reduced to the
  352. * posting list, we can unlock page after getting BlockNumber of
  353. * root of posting tree.
  354. */
  355. LockBuffer(stackEntry->buffer, GIN_UNLOCK);
  356. needUnlock = false;
  357. stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
  358. rootPostingTree, snapshot);
  359. entry->buffer = stack->buffer;
  360. /*
  361. * We keep buffer pinned because we need to prevent deletion of
  362. * page during scan. See GIN's vacuum implementation. RefCount is
  363. * increased to keep buffer pinned after freeGinBtreeStack() call.
  364. */
  365. IncrBufferRefCount(entry->buffer);
  366. page = BufferGetPage(entry->buffer);
  367. /*
  368. * Load the first page into memory.
  369. */
  370. ItemPointerSetMin(&minItem);
  371. entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem);
  372. entry->predictNumberResult = stack->predictNumber * entry->nlist;
  373. LockBuffer(entry->buffer, GIN_UNLOCK);
  374. freeGinBtreeStack(stack);
  375. entry->isFinished = false;
  376. }
  377. else
  378. {
  379. /*
  380. * Lock the entry leaf page. This is more coarse-grained than
  381. * necessary, because it will conflict with any insertions that
  382. * land on the same leaf page, not only the exacty key we searched
  383. * for. But locking an individual tuple would require updating
  384. * that lock whenever it moves because of insertions or vacuums,
  385. * which seems too complicated.
  386. */
  387. PredicateLockPage(ginstate->index,
  388. BufferGetBlockNumber(stackEntry->buffer),
  389. snapshot);
  390. if (GinGetNPosting(itup) > 0)
  391. {
  392. entry->list = ginReadTuple(ginstate, entry->attnum, itup,
  393. &entry->nlist);
  394. entry->predictNumberResult = entry->nlist;
  395. entry->isFinished = false;
  396. }
  397. }
  398. }
  399. else
  400. {
  401. /*
  402. * No entry found. Predicate lock the leaf page, to lock the place
  403. * where the entry would've been, had there been one.
  404. */
  405. PredicateLockPage(ginstate->index,
  406. BufferGetBlockNumber(stackEntry->buffer), snapshot);
  407. }
  408. if (needUnlock)
  409. LockBuffer(stackEntry->buffer, GIN_UNLOCK);
  410. freeGinBtreeStack(stackEntry);
  411. }
  412. /*
  413. * Comparison function for scan entry indexes. Sorts by predictNumberResult,
  414. * least frequent items first.
  415. */
  416. static int
  417. entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg)
  418. {
  419. const GinScanKey key = (const GinScanKey) arg;
  420. int i1 = *(const int *) a1;
  421. int i2 = *(const int *) a2;
  422. uint32 n1 = key->scanEntry[i1]->predictNumberResult;
  423. uint32 n2 = key->scanEntry[i2]->predictNumberResult;
  424. if (n1 < n2)
  425. return -1;
  426. else if (n1 == n2)
  427. return 0;
  428. else
  429. return 1;
  430. }
  431. static void
  432. startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
  433. {
  434. MemoryContext oldCtx = CurrentMemoryContext;
  435. int i;
  436. int j;
  437. int *entryIndexes;
  438. ItemPointerSetMin(&key->curItem);
  439. key->curItemMatches = false;
  440. key->recheckCurItem = false;
  441. key->isFinished = false;
  442. /*
  443. * Divide the entries into two distinct sets: required and additional.
  444. * Additional entries are not enough for a match alone, without any items
  445. * from the required set, but are needed by the consistent function to
  446. * decide if an item matches. When scanning, we can skip over items from
  447. * additional entries that have no corresponding matches in any of the
  448. * required entries. That speeds up queries like "frequent & rare"
  449. * considerably, if the frequent term can be put in the additional set.
  450. *
  451. * There can be many legal ways to divide them entries into these two
  452. * sets. A conservative division is to just put everything in the required
  453. * set, but the more you can put in the additional set, the more you can
  454. * skip during the scan. To maximize skipping, we try to put as many
  455. * frequent items as possible into additional, and less frequent ones into
  456. * required. To do that, sort the entries by frequency
  457. * (predictNumberResult), and put entries into the required set in that
  458. * order, until the consistent function says that none of the remaining
  459. * entries can form a match, without any items from the required set. The
  460. * rest go to the additional set.
  461. */
  462. if (key->nentries > 1)
  463. {
  464. MemoryContextSwitchTo(so->tempCtx);
  465. entryIndexes = (int *) palloc(sizeof(int) * key->nentries);
  466. for (i = 0; i < key->nentries; i++)
  467. entryIndexes[i] = i;
  468. qsort_arg(entryIndexes, key->nentries, sizeof(int),
  469. entryIndexByFrequencyCmp, key);
  470. for (i = 0; i < key->nentries - 1; i++)
  471. {
  472. /* Pass all entries <= i as FALSE, and the rest as MAYBE */
  473. for (j = 0; j <= i; j++)
  474. key->entryRes[entryIndexes[j]] = GIN_FALSE;
  475. for (j = i + 1; j < key->nentries; j++)
  476. key->entryRes[entryIndexes[j]] = GIN_MAYBE;
  477. if (key->triConsistentFn(key) == GIN_FALSE)
  478. break;
  479. }
  480. /* i is now the last required entry. */
  481. MemoryContextSwitchTo(so->keyCtx);
  482. key->nrequired = i + 1;
  483. key->nadditional = key->nentries - key->nrequired;
  484. key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry));
  485. key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
  486. j = 0;
  487. for (i = 0; i < key->nrequired; i++)
  488. key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]];
  489. for (i = 0; i < key->nadditional; i++)
  490. key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]];
  491. /* clean up after consistentFn calls (also frees entryIndexes) */
  492. MemoryContextReset(so->tempCtx);
  493. }
  494. else
  495. {
  496. MemoryContextSwitchTo(so->keyCtx);
  497. key->nrequired = 1;
  498. key->nadditional = 0;
  499. key->requiredEntries = palloc(1 * sizeof(GinScanEntry));
  500. key->requiredEntries[0] = key->scanEntry[0];
  501. }
  502. MemoryContextSwitchTo(oldCtx);
  503. }
  504. static void
  505. startScan(IndexScanDesc scan)
  506. {
  507. GinScanOpaque so = (GinScanOpaque) scan->opaque;
  508. GinState *ginstate = &so->ginstate;
  509. uint32 i;
  510. for (i = 0; i < so->totalentries; i++)
  511. startScanEntry(ginstate, so->entries[i], scan->xs_snapshot);
  512. if (GinFuzzySearchLimit > 0)
  513. {
  514. /*
  515. * If all of keys more than threshold we will try to reduce result, we
  516. * hope (and only hope, for intersection operation of array our
  517. * supposition isn't true), that total result will not more than
  518. * minimal predictNumberResult.
  519. */
  520. bool reduce = true;
  521. for (i = 0; i < so->totalentries; i++)
  522. {
  523. if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit)
  524. {
  525. reduce = false;
  526. break;
  527. }
  528. }
  529. if (reduce)
  530. {
  531. for (i = 0; i < so->totalentries; i++)
  532. {
  533. so->entries[i]->predictNumberResult /= so->totalentries;
  534. so->entries[i]->reduceResult = true;
  535. }
  536. }
  537. }
  538. /*
  539. * Now that we have the estimates for the entry frequencies, finish
  540. * initializing the scan keys.
  541. */
  542. for (i = 0; i < so->nkeys; i++)
  543. startScanKey(ginstate, so, so->keys + i);
  544. }
  545. /*
  546. * Load the next batch of item pointers from a posting tree.
  547. *
  548. * Note that we copy the page into GinScanEntry->list array and unlock it, but
  549. * keep it pinned to prevent interference with vacuum.
  550. */
  551. static void
  552. entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
  553. ItemPointerData advancePast, Snapshot snapshot)
  554. {
  555. Page page;
  556. int i;
  557. bool stepright;
  558. if (!BufferIsValid(entry->buffer))
  559. {
  560. entry->isFinished = true;
  561. return;
  562. }
  563. /*
  564. * We have two strategies for finding the correct page: step right from
  565. * the current page, or descend the tree again from the root. If
  566. * advancePast equals the current item, the next matching item should be
  567. * on the next page, so we step right. Otherwise, descend from root.
  568. */
  569. if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
  570. {
  571. stepright = true;
  572. LockBuffer(entry->buffer, GIN_SHARE);
  573. }
  574. else
  575. {
  576. GinBtreeStack *stack;
  577. ReleaseBuffer(entry->buffer);
  578. /*
  579. * Set the search key, and find the correct leaf page.
  580. */
  581. if (ItemPointerIsLossyPage(&advancePast))
  582. {
  583. ItemPointerSet(&entry->btree.itemptr,
  584. GinItemPointerGetBlockNumber(&advancePast) + 1,
  585. FirstOffsetNumber);
  586. }
  587. else
  588. {
  589. ItemPointerSet(&entry->btree.itemptr,
  590. GinItemPointerGetBlockNumber(&advancePast),
  591. OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
  592. }
  593. entry->btree.fullScan = false;
  594. stack = ginFindLeafPage(&entry->btree, true, false, snapshot);
  595. /* we don't need the stack, just the buffer. */
  596. entry->buffer = stack->buffer;
  597. IncrBufferRefCount(entry->buffer);
  598. freeGinBtreeStack(stack);
  599. stepright = false;
  600. }
  601. elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d",
  602. GinItemPointerGetBlockNumber(&advancePast),
  603. GinItemPointerGetOffsetNumber(&advancePast),
  604. !stepright);
  605. page = BufferGetPage(entry->buffer);
  606. for (;;)
  607. {
  608. entry->offset = InvalidOffsetNumber;
  609. if (entry->list)
  610. {
  611. pfree(entry->list);
  612. entry->list = NULL;
  613. entry->nlist = 0;
  614. }
  615. if (stepright)
  616. {
  617. /*
  618. * We've processed all the entries on this page. If it was the
  619. * last page in the tree, we're done.
  620. */
  621. if (GinPageRightMost(page))
  622. {
  623. UnlockReleaseBuffer(entry->buffer);
  624. entry->buffer = InvalidBuffer;
  625. entry->isFinished = true;
  626. return;
  627. }
  628. /*
  629. * Step to next page, following the right link. then find the
  630. * first ItemPointer greater than advancePast.
  631. */
  632. entry->buffer = ginStepRight(entry->buffer,
  633. ginstate->index,
  634. GIN_SHARE);
  635. page = BufferGetPage(entry->buffer);
  636. }
  637. stepright = true;
  638. if (GinPageGetOpaque(page)->flags & GIN_DELETED)
  639. continue; /* page was deleted by concurrent vacuum */
  640. /*
  641. * The first item > advancePast might not be on this page, but
  642. * somewhere to the right, if the page was split, or a non-match from
  643. * another key in the query allowed us to skip some items from this
  644. * entry. Keep following the right-links until we re-find the correct
  645. * page.
  646. */
  647. if (!GinPageRightMost(page) &&
  648. ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
  649. {
  650. /*
  651. * the item we're looking is > the right bound of the page, so it
  652. * can't be on this page.
  653. */
  654. continue;
  655. }
  656. entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast);
  657. for (i = 0; i < entry->nlist; i++)
  658. {
  659. if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
  660. {
  661. entry->offset = i;
  662. if (GinPageRightMost(page))
  663. {
  664. /* after processing the copied items, we're done. */
  665. UnlockReleaseBuffer(entry->buffer);
  666. entry->buffer = InvalidBuffer;
  667. }
  668. else
  669. LockBuffer(entry->buffer, GIN_UNLOCK);
  670. return;
  671. }
  672. }
  673. }
  674. }
  /* random() scaled by MAX_RANDOM_VALUE, as a double. */
  #define gin_rand() \
      ((double) random() / (double) (MAX_RANDOM_VALUE))

  /*
   * True when an item of entry 'e' should be discarded: a random draw exceeds
   * GinFuzzySearchLimit / e->predictNumberResult.
   */
  #define dropItem(e) \
      (gin_rand() > (double) GinFuzzySearchLimit / (double) ((e)->predictNumberResult))
  677. /*
  678. * Sets entry->curItem to next heap item pointer > advancePast, for one entry
  679. * of one scan key, or sets entry->isFinished to true if there are no more.
  680. *
  681. * Item pointers are returned in ascending order.
  682. *
  683. * Note: this can return a "lossy page" item pointer, indicating that the
  684. * entry potentially matches all items on that heap page. However, it is
  685. * not allowed to return both a lossy page pointer and exact (regular)
  686. * item pointers for the same page. (Doing so would break the key-combination
  687. * logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the
  688. * current implementation this is guaranteed by the behavior of tidbitmaps.
  689. */
  690. static void
  691. entryGetItem(GinState *ginstate, GinScanEntry entry,
  692. ItemPointerData advancePast, Snapshot snapshot)
  693. {
  694. Assert(!entry->isFinished);
  695. Assert(!ItemPointerIsValid(&entry->curItem) ||
  696. ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
  697. if (entry->matchBitmap)
  698. {
  699. /* A bitmap result */
  700. BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
  701. OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
  702. bool gotitem = false;
  703. do
  704. {
  705. /*
  706. * If we've exhausted all items on this block, move to next block
  707. * in the bitmap.
  708. */
  709. while (entry->matchResult == NULL ||
  710. (entry->matchResult->ntuples >= 0 &&
  711. entry->offset >= entry->matchResult->ntuples) ||
  712. entry->matchResult->blockno < advancePastBlk ||
  713. (ItemPointerIsLossyPage(&advancePast) &&
  714. entry->matchResult->blockno == advancePastBlk))
  715. {
  716. entry->matchResult = tbm_iterate(entry->matchIterator);
  717. if (entry->matchResult == NULL)
  718. {
  719. ItemPointerSetInvalid(&entry->curItem);
  720. tbm_end_iterate(entry->matchIterator);
  721. entry->matchIterator = NULL;
  722. entry->isFinished = true;
  723. break;
  724. }
  725. /*
  726. * Reset counter to the beginning of entry->matchResult. Note:
  727. * entry->offset is still greater than matchResult->ntuples if
  728. * matchResult is lossy. So, on next call we will get next
  729. * result from TIDBitmap.
  730. */
  731. entry->offset = 0;
  732. }
  733. if (entry->isFinished)
  734. break;
  735. /*
  736. * We're now on the first page after advancePast which has any
  737. * items on it. If it's a lossy result, return that.
  738. */
  739. if (entry->matchResult->ntuples < 0)
  740. {
  741. ItemPointerSetLossyPage(&entry->curItem,
  742. entry->matchResult->blockno);
  743. /*
  744. * We might as well fall out of the loop; we could not
  745. * estimate number of results on this page to support correct
  746. * reducing of result even if it's enabled.
  747. */
  748. gotitem = true;
  749. break;
  750. }
  751. /*
  752. * Not a lossy page. Skip over any offsets <= advancePast, and
  753. * return that.
  754. */
  755. if (entry->matchResult->blockno == advancePastBlk)
  756. {
  757. /*
  758. * First, do a quick check against the last offset on the
  759. * page. If that's > advancePast, so are all the other
  760. * offsets.
  761. */
  762. if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
  763. {
  764. entry->offset = entry->matchResult->ntuples;
  765. continue;
  766. }
  767. /* Otherwise scan to find the first item > advancePast */
  768. while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
  769. entry->offset++;
  770. }
  771. ItemPointerSet(&entry->curItem,
  772. entry->matchResult->blockno,
  773. entry->matchResult->offsets[entry->offset]);
  774. entry->offset++;
  775. gotitem = true;
  776. } while (!gotitem || (entry->reduceResult == true && dropItem(entry)));
  777. }
  778. else if (!BufferIsValid(entry->buffer))
  779. {
  780. /*
  781. * A posting list from an entry tuple, or the last page of a posting
  782. * tree.
  783. */
  784. do
  785. {
  786. if (entry->offset >= entry->nlist)
  787. {
  788. ItemPointerSetInvalid(&entry->curItem);
  789. entry->isFinished = true;
  790. break;
  791. }
  792. entry->curItem = entry->list[entry->offset++];
  793. } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
  794. /* XXX: shouldn't we apply the fuzzy search limit here? */
  795. }
  796. else
  797. {
  798. /* A posting tree */
  799. do
  800. {
  801. /* If we've processed the current batch, load more items */
  802. while (entry->offset >= entry->nlist)
  803. {
  804. entryLoadMoreItems(ginstate, entry, advancePast, snapshot);
  805. if (entry->isFinished)
  806. {
  807. ItemPointerSetInvalid(&entry->curItem);
  808. return;
  809. }
  810. }
  811. entry->curItem = entry->list[entry->offset++];
  812. } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 ||
  813. (entry->reduceResult == true && dropItem(entry)));
  814. }
  815. }
  816. /*
  817. * Identify the "current" item among the input entry streams for this scan key
  818. * that is greater than advancePast, and test whether it passes the scan key
  819. * qual condition.
  820. *
  821. * The current item is the smallest curItem among the inputs. key->curItem
  822. * is set to that value. key->curItemMatches is set to indicate whether that
  823. * TID passes the consistentFn test. If so, key->recheckCurItem is set true
  824. * iff recheck is needed for this item pointer (including the case where the
  825. * item pointer is a lossy page pointer).
  826. *
  827. * If all entry streams are exhausted, sets key->isFinished to true.
  828. *
  829. * Item pointers must be returned in ascending order.
  830. *
  831. * Note: this can return a "lossy page" item pointer, indicating that the
  832. * key potentially matches all items on that heap page. However, it is
  833. * not allowed to return both a lossy page pointer and exact (regular)
  834. * item pointers for the same page. (Doing so would break the key-combination
  835. * logic in scanGetItem.)
  836. */
  837. static void
  838. keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
  839. ItemPointerData advancePast, Snapshot snapshot)
  840. {
  841. ItemPointerData minItem;
  842. ItemPointerData curPageLossy;
  843. uint32 i;
  844. bool haveLossyEntry;
  845. GinScanEntry entry;
  846. GinTernaryValue res;
  847. MemoryContext oldCtx;
  848. bool allFinished;
  849. Assert(!key->isFinished);
  850. /*
  851. * We might have already tested this item; if so, no need to repeat work.
  852. * (Note: the ">" case can happen, if advancePast is exact but we
  853. * previously had to set curItem to a lossy-page pointer.)
  854. */
  855. if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
  856. return;
  857. /*
  858. * Find the minimum item > advancePast among the active entry streams.
  859. *
  860. * Note: a lossy-page entry is encoded by a ItemPointer with max value for
  861. * offset (0xffff), so that it will sort after any exact entries for the
  862. * same page. So we'll prefer to return exact pointers not lossy
  863. * pointers, which is good.
  864. */
  865. ItemPointerSetMax(&minItem);
  866. allFinished = true;
  867. for (i = 0; i < key->nrequired; i++)
  868. {
  869. entry = key->requiredEntries[i];
  870. if (entry->isFinished)
  871. continue;
  872. /*
  873. * Advance this stream if necessary.
  874. *
  875. * In particular, since entry->curItem was initialized with
  876. * ItemPointerSetMin, this ensures we fetch the first item for each
  877. * entry on the first call.
  878. */
  879. if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
  880. {
  881. entryGetItem(ginstate, entry, advancePast, snapshot);
  882. if (entry->isFinished)
  883. continue;
  884. }
  885. allFinished = false;
  886. if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
  887. minItem = entry->curItem;
  888. }
  889. if (allFinished)
  890. {
  891. /* all entries are finished */
  892. key->isFinished = true;
  893. return;
  894. }
  895. /*
  896. * Ok, we now know that there are no matches < minItem.
  897. *
  898. * If minItem is lossy, it means that there were no exact items on the
  899. * page among requiredEntries, because lossy pointers sort after exact
  900. * items. However, there might be exact items for the same page among
  901. * additionalEntries, so we mustn't advance past them.
  902. */
  903. if (ItemPointerIsLossyPage(&minItem))
  904. {
  905. if (GinItemPointerGetBlockNumber(&advancePast) <
  906. GinItemPointerGetBlockNumber(&minItem))
  907. {
  908. ItemPointerSet(&advancePast,
  909. GinItemPointerGetBlockNumber(&minItem),
  910. InvalidOffsetNumber);
  911. }
  912. }
  913. else
  914. {
  915. Assert(GinItemPointerGetOffsetNumber(&minItem) > 0);
  916. ItemPointerSet(&advancePast,
  917. GinItemPointerGetBlockNumber(&minItem),
  918. OffsetNumberPrev(GinItemPointerGetOffsetNumber(&minItem)));
  919. }
  920. /*
  921. * We might not have loaded all the entry streams for this TID yet. We
  922. * could call the consistent function, passing MAYBE for those entries, to
  923. * see if it can decide if this TID matches based on the information we
  924. * have. But if the consistent-function is expensive, and cannot in fact
  925. * decide with partial information, that could be a big loss. So, load all
  926. * the additional entries, before calling the consistent function.
  927. */
  928. for (i = 0; i < key->nadditional; i++)
  929. {
  930. entry = key->additionalEntries[i];
  931. if (entry->isFinished)
  932. continue;
  933. if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
  934. {
  935. entryGetItem(ginstate, entry, advancePast, snapshot);
  936. if (entry->isFinished)
  937. continue;
  938. }
  939. /*
  940. * Normally, none of the items in additionalEntries can have a curItem
  941. * larger than minItem. But if minItem is a lossy page, then there
  942. * might be exact items on the same page among additionalEntries.
  943. */
  944. if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
  945. {
  946. Assert(ItemPointerIsLossyPage(&minItem));
  947. minItem = entry->curItem;
  948. }
  949. }
  950. /*
  951. * Ok, we've advanced all the entries up to minItem now. Set key->curItem,
  952. * and perform consistentFn test.
  953. *
  954. * Lossy-page entries pose a problem, since we don't know the correct
  955. * entryRes state to pass to the consistentFn, and we also don't know what
  956. * its combining logic will be (could be AND, OR, or even NOT). If the
  957. * logic is OR then the consistentFn might succeed for all items in the
  958. * lossy page even when none of the other entries match.
  959. *
  960. * Our strategy is to call the tri-state consistent function, with the
  961. * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
  962. * returns FALSE, none of the lossy items alone are enough for a match, so
  963. * we don't need to return a lossy-page pointer. Otherwise, return a
  964. * lossy-page pointer to indicate that the whole heap page must be
  965. * checked. (On subsequent calls, we'll do nothing until minItem is past
  966. * the page altogether, thus ensuring that we never return both regular
  967. * and lossy pointers for the same page.)
  968. *
  969. * An exception is that it doesn't matter what we pass for lossy pointers
  970. * in "hidden" entries, because the consistentFn's result can't depend on
  971. * them. We could pass them as MAYBE as well, but if we're using the
  972. * "shim" implementation of a tri-state consistent function (see
  973. * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass
  974. * them as true.
  975. *
  976. * Note that only lossy-page entries pointing to the current item's page
  977. * should trigger this processing; we might have future lossy pages in the
  978. * entry array, but they aren't relevant yet.
  979. */
  980. key->curItem = minItem;
  981. ItemPointerSetLossyPage(&curPageLossy,
  982. GinItemPointerGetBlockNumber(&key->curItem));
  983. haveLossyEntry = false;
  984. for (i = 0; i < key->nentries; i++)
  985. {
  986. entry = key->scanEntry[i];
  987. if (entry->isFinished == false &&
  988. ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
  989. {
  990. if (i < key->nuserentries)
  991. key->entryRes[i] = GIN_MAYBE;
  992. else
  993. key->entryRes[i] = GIN_TRUE;
  994. haveLossyEntry = true;
  995. }
  996. else
  997. key->entryRes[i] = GIN_FALSE;
  998. }
  999. /* prepare for calling consistentFn in temp context */
  1000. oldCtx = MemoryContextSwitchTo(tempCtx);
  1001. if (haveLossyEntry)
  1002. {
  1003. /* Have lossy-page entries, so see if whole page matches */
  1004. res = key->triConsistentFn(key);
  1005. if (res == GIN_TRUE || res == GIN_MAYBE)
  1006. {
  1007. /* Yes, so clean up ... */
  1008. MemoryContextSwitchTo(oldCtx);
  1009. MemoryContextReset(tempCtx);
  1010. /* and return lossy pointer for whole page */
  1011. key->curItem = curPageLossy;
  1012. key->curItemMatches = true;
  1013. key->recheckCurItem = true;
  1014. return;
  1015. }
  1016. }
  1017. /*
  1018. * At this point we know that we don't need to return a lossy whole-page
  1019. * pointer, but we might have matches for individual exact item pointers,
  1020. * possibly in combination with a lossy pointer. Pass lossy pointers as
  1021. * MAYBE to the ternary consistent function, to let it decide if this
  1022. * tuple satisfies the overall key, even though we don't know if the lossy
  1023. * entries match.
  1024. *
  1025. * Prepare entryRes array to be passed to consistentFn.
  1026. */
  1027. for (i = 0; i < key->nentries; i++)
  1028. {
  1029. entry = key->scanEntry[i];
  1030. if (entry->isFinished)
  1031. key->entryRes[i] = GIN_FALSE;
  1032. #if 0
  1033. /*
  1034. * This case can't currently happen, because we loaded all the entries
  1035. * for this item earlier.
  1036. */
  1037. else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
  1038. key->entryRes[i] = GIN_MAYBE;
  1039. #endif
  1040. else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
  1041. key->entryRes[i] = GIN_MAYBE;
  1042. else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0)
  1043. key->entryRes[i] = GIN_TRUE;
  1044. else
  1045. key->entryRes[i] = GIN_FALSE;
  1046. }
  1047. res = key->triConsistentFn(key);
  1048. switch (res)
  1049. {
  1050. case GIN_TRUE:
  1051. key->curItemMatches = true;
  1052. /* triConsistentFn set recheckCurItem */
  1053. break;
  1054. case GIN_FALSE:
  1055. key->curItemMatches = false;
  1056. break;
  1057. case GIN_MAYBE:
  1058. key->curItemMatches = true;
  1059. key->recheckCurItem = true;
  1060. break;
  1061. default:
  1062. /*
  1063. * the 'default' case shouldn't happen, but if the consistent
  1064. * function returns something bogus, this is the safe result
  1065. */
  1066. key->curItemMatches = true;
  1067. key->recheckCurItem = true;
  1068. break;
  1069. }
  1070. /*
  1071. * We have a tuple, and we know if it matches or not. If it's a non-match,
  1072. * we could continue to find the next matching tuple, but let's break out
  1073. * and give scanGetItem a chance to advance the other keys. They might be
  1074. * able to skip past to a much higher TID, allowing us to save work.
  1075. */
  1076. /* clean up after consistentFn calls */
  1077. MemoryContextSwitchTo(oldCtx);
  1078. MemoryContextReset(tempCtx);
  1079. }
  1080. /*
  1081. * Get next heap item pointer (after advancePast) from scan.
  1082. * Returns true if anything found.
  1083. * On success, *item and *recheck are set.
  1084. *
  1085. * Note: this is very nearly the same logic as in keyGetItem(), except
  1086. * that we know the keys are to be combined with AND logic, whereas in
  1087. * keyGetItem() the combination logic is known only to the consistentFn.
  1088. */
  1089. static bool
  1090. scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
  1091. ItemPointerData *item, bool *recheck)
  1092. {
  1093. GinScanOpaque so = (GinScanOpaque) scan->opaque;
  1094. uint32 i;
  1095. bool match;
  1096. /*----------
  1097. * Advance the scan keys in lock-step, until we find an item that matches
  1098. * all the keys. If any key reports isFinished, meaning its subset of the
  1099. * entries is exhausted, we can stop. Otherwise, set *item to the next
  1100. * matching item.
  1101. *
  1102. * This logic works only if a keyGetItem stream can never contain both
  1103. * exact and lossy pointers for the same page. Else we could have a
  1104. * case like
  1105. *
  1106. * stream 1 stream 2
  1107. * ... ...
  1108. * 42/6 42/7
  1109. * 50/1 42/0xffff
  1110. * ... ...
  1111. *
  1112. * We would conclude that 42/6 is not a match and advance stream 1,
  1113. * thus never detecting the match to the lossy pointer in stream 2.
  1114. * (keyGetItem has a similar problem versus entryGetItem.)
  1115. *----------
  1116. */
  1117. do
  1118. {
  1119. ItemPointerSetMin(item);
  1120. match = true;
  1121. for (i = 0; i < so->nkeys && match; i++)
  1122. {
  1123. GinScanKey key = so->keys + i;
  1124. /* Fetch the next item for this key that is > advancePast. */
  1125. keyGetItem(&so->ginstate, so->tempCtx, key, advancePast,
  1126. scan->xs_snapshot);
  1127. if (key->isFinished)
  1128. return false;
  1129. /*
  1130. * If it's not a match, we can immediately conclude that nothing
  1131. * <= this item matches, without checking the rest of the keys.
  1132. */
  1133. if (!key->curItemMatches)
  1134. {
  1135. advancePast = key->curItem;
  1136. match = false;
  1137. break;
  1138. }
  1139. /*
  1140. * It's a match. We can conclude that nothing < matches, so the
  1141. * other key streams can skip to this item.
  1142. *
  1143. * Beware of lossy pointers, though; from a lossy pointer, we can
  1144. * only conclude that nothing smaller than this *block* matches.
  1145. */
  1146. if (ItemPointerIsLossyPage(&key->curItem))
  1147. {
  1148. if (GinItemPointerGetBlockNumber(&advancePast) <
  1149. GinItemPointerGetBlockNumber(&key->curItem))
  1150. {
  1151. ItemPointerSet(&advancePast,
  1152. GinItemPointerGetBlockNumber(&key->curItem),
  1153. InvalidOffsetNumber);
  1154. }
  1155. }
  1156. else
  1157. {
  1158. Assert(GinItemPointerGetOffsetNumber(&key->curItem) > 0);
  1159. ItemPointerSet(&advancePast,
  1160. GinItemPointerGetBlockNumber(&key->curItem),
  1161. OffsetNumberPrev(GinItemPointerGetOffsetNumber(&key->curItem)));
  1162. }
  1163. /*
  1164. * If this is the first key, remember this location as a potential
  1165. * match, and proceed to check the rest of the keys.
  1166. *
  1167. * Otherwise, check if this is the same item that we checked the
  1168. * previous keys for (or a lossy pointer for the same page). If
  1169. * not, loop back to check the previous keys for this item (we
  1170. * will check this key again too, but keyGetItem returns quickly
  1171. * for that)
  1172. */
  1173. if (i == 0)
  1174. {
  1175. *item = key->curItem;
  1176. }
  1177. else
  1178. {
  1179. if (ItemPointerIsLossyPage(&key->curItem) ||
  1180. ItemPointerIsLossyPage(item))
  1181. {
  1182. Assert(GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
  1183. match = (GinItemPointerGetBlockNumber(&key->curItem) ==
  1184. GinItemPointerGetBlockNumber(item));
  1185. }
  1186. else
  1187. {
  1188. Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
  1189. match = (ginCompareItemPointers(&key->curItem, item) == 0);
  1190. }
  1191. }
  1192. }
  1193. } while (!match);
  1194. Assert(!ItemPointerIsMin(item));
  1195. /*
  1196. * Now *item contains the first ItemPointer after previous result that
  1197. * satisfied all the keys for that exact TID, or a lossy reference to the
  1198. * same page.
  1199. *
  1200. * We must return recheck = true if any of the keys are marked recheck.
  1201. */
  1202. *recheck = false;
  1203. for (i = 0; i < so->nkeys; i++)
  1204. {
  1205. GinScanKey key = so->keys + i;
  1206. if (key->recheckCurItem)
  1207. {
  1208. *recheck = true;
  1209. break;
  1210. }
  1211. }
  1212. return true;
  1213. }
  1214. /*
  1215. * Functions for scanning the pending list
  1216. */
  1217. /*
  1218. * Get ItemPointer of next heap row to be checked from pending list.
  1219. * Returns false if there are no more. On pages with several heap rows
  1220. * it returns each row separately, on page with part of heap row returns
  1221. * per page data. pos->firstOffset and pos->lastOffset are set to identify
  1222. * the range of pending-list tuples belonging to this heap row.
  1223. *
  1224. * The pendingBuffer is presumed pinned and share-locked on entry, and is
  1225. * pinned and share-locked on success exit. On failure exit it's released.
  1226. */
  1227. static bool
  1228. scanGetCandidate(IndexScanDesc scan, pendingPosition *pos)
  1229. {
  1230. OffsetNumber maxoff;
  1231. Page page;
  1232. IndexTuple itup;
  1233. ItemPointerSetInvalid(&pos->item);
  1234. for (;;)
  1235. {
  1236. page = BufferGetPage(pos->pendingBuffer);
  1237. TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
  1238. maxoff = PageGetMaxOffsetNumber(page);
  1239. if (pos->firstOffset > maxoff)
  1240. {
  1241. BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
  1242. if (blkno == InvalidBlockNumber)
  1243. {
  1244. UnlockReleaseBuffer(pos->pendingBuffer);
  1245. pos->pendingBuffer = InvalidBuffer;
  1246. return false;
  1247. }
  1248. else
  1249. {
  1250. /*
  1251. * Here we must prevent deletion of next page by insertcleanup
  1252. * process, which may be trying to obtain exclusive lock on
  1253. * current page. So, we lock next page before releasing the
  1254. * current one
  1255. */
  1256. Buffer tmpbuf = ReadBuffer(scan->indexRelation, blkno);
  1257. LockBuffer(tmpbuf, GIN_SHARE);
  1258. UnlockReleaseBuffer(pos->pendingBuffer);
  1259. pos->pendingBuffer = tmpbuf;
  1260. pos->firstOffset = FirstOffsetNumber;
  1261. }
  1262. }
  1263. else
  1264. {
  1265. itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset));
  1266. pos->item = itup->t_tid;
  1267. if (GinPageHasFullRow(page))
  1268. {
  1269. /*
  1270. * find itempointer to the next row
  1271. */
  1272. for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++)
  1273. {
  1274. itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset));
  1275. if (!ItemPointerEquals(&pos->item, &itup->t_tid))
  1276. break;
  1277. }
  1278. }
  1279. else
  1280. {
  1281. /*
  1282. * All itempointers are the same on this page
  1283. */
  1284. pos->lastOffset = maxoff + 1;
  1285. }
  1286. /*
  1287. * Now pos->firstOffset points to the first tuple of current heap
  1288. * row, pos->lastOffset points to the first tuple of next heap row
  1289. * (or to the end of page)
  1290. */
  1291. break;
  1292. }
  1293. }
  1294. return true;
  1295. }
  1296. /*
  1297. * Scan pending-list page from current tuple (off) up till the first of:
  1298. * - match is found (then returns true)
  1299. * - no later match is possible
  1300. * - tuple's attribute number is not equal to entry's attrnum
  1301. * - reach end of page
  1302. *
  1303. * datum[]/category[]/datumExtracted[] arrays are used to cache the results
  1304. * of gintuple_get_key() on the current page.
  1305. */
  1306. static bool
  1307. matchPartialInPendingList(GinState *ginstate, Page page,
  1308. OffsetNumber off, OffsetNumber maxoff,
  1309. GinScanEntry entry,
  1310. Datum *datum, GinNullCategory *category,
  1311. bool *datumExtracted)
  1312. {
  1313. IndexTuple itup;
  1314. int32 cmp;
  1315. /* Partial match to a null is not possible */
  1316. if (entry->queryCategory != GIN_CAT_NORM_KEY)
  1317. return false;
  1318. while (off < maxoff)
  1319. {
  1320. itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
  1321. if (gintuple_get_attrnum(ginstate, itup) != entry->attnum)
  1322. return false;
  1323. if (datumExtracted[off - 1] == false)
  1324. {
  1325. datum[off - 1] = gintuple_get_key(ginstate, itup,
  1326. &category[off - 1]);
  1327. datumExtracted[off - 1] = true;
  1328. }
  1329. /* Once we hit nulls, no further match is possible */
  1330. if (category[off - 1] != GIN_CAT_NORM_KEY)
  1331. return false;
  1332. /*----------
  1333. * Check partial match.
  1334. * case cmp == 0 => match
  1335. * case cmp > 0 => not match and end scan (no later match possible)
  1336. * case cmp < 0 => not match and continue scan
  1337. *----------
  1338. */
  1339. cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1],
  1340. ginstate->supportCollation[entry->attnum - 1],
  1341. entry->queryKey,
  1342. datum[off - 1],
  1343. UInt16GetDatum(entry->strategy),
  1344. PointerGetDatum(entry->extra_data)));
  1345. if (cmp == 0)
  1346. return true;
  1347. else if (cmp > 0)
  1348. return false;
  1349. off++;
  1350. }
  1351. return false;
  1352. }
  1353. /*
  1354. * Set up the entryRes array for each key by looking at
  1355. * every entry for current heap row in pending list.
  1356. *
  1357. * Returns true if each scan key has at least one entryRes match.
  1358. * This corresponds to the situations where the normal index search will
  1359. * try to apply the key's consistentFn. (A tuple not meeting that requirement
  1360. * cannot be returned by the normal search since no entry stream will
  1361. * source its TID.)
  1362. *
  1363. * The pendingBuffer is presumed pinned and share-locked on entry.
  1364. */
  1365. static bool
  1366. collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
  1367. {
  1368. GinScanOpaque so = (GinScanOpaque) scan->opaque;
  1369. OffsetNumber attrnum;
  1370. Page page;
  1371. IndexTuple itup;
  1372. int i,
  1373. j;
  1374. /*
  1375. * Reset all entryRes and hasMatchKey flags
  1376. */
  1377. for (i = 0; i < so->nkeys; i++)
  1378. {
  1379. GinScanKey key = so->keys + i;
  1380. memset(key->entryRes, GIN_FALSE, key->nentries);
  1381. }
  1382. memset(pos->hasMatchKey, false, so->nkeys);
  1383. /*
  1384. * Outer loop iterates over multiple pending-list pages when a single heap
  1385. * row has entries spanning those pages.
  1386. */
  1387. for (;;)
  1388. {
  1389. Datum datum[BLCKSZ / sizeof(IndexTupleData)];
  1390. GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)];
  1391. bool datumExtracted[BLCKSZ / sizeof(IndexTupleData)];
  1392. Assert(pos->lastOffset > pos->firstOffset);
  1393. memset(datumExtracted + pos->firstOffset - 1, 0,
  1394. sizeof(bool) * (pos->lastOffset - pos->firstOffset));
  1395. page = BufferGetPage(pos->pendingBuffer);
  1396. TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
  1397. for (i = 0; i < so->nkeys; i++)
  1398. {
  1399. GinScanKey key = so->keys + i;
  1400. for (j = 0; j < key->nentries; j++)
  1401. {
  1402. GinScanEntry entry = key->scanEntry[j];
  1403. OffsetNumber StopLow = pos->firstOffset,
  1404. StopHigh = pos->lastOffset,
  1405. StopMiddle;
  1406. /* If already matched on earlier page, do no extra work */
  1407. if (key->entryRes[j])
  1408. continue;
  1409. /*
  1410. * Interesting tuples are from pos->firstOffset to
  1411. * pos->lastOffset and they are ordered by (attnum, Datum) as
  1412. * it's done in entry tree. So we can use binary search to
  1413. * avoid linear scanning.
  1414. */
  1415. while (StopLow < StopHigh)
  1416. {
  1417. int res;
  1418. StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
  1419. itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle));
  1420. attrnum = gintuple_get_attrnum(&so->ginstate, itup);
  1421. if (key->attnum < attrnum)
  1422. {
  1423. StopHigh = StopMiddle;
  1424. continue;
  1425. }
  1426. if (key->attnum > attrnum)
  1427. {
  1428. StopLow = StopMiddle + 1;
  1429. continue;
  1430. }
  1431. if (datumExtracted[StopMiddle - 1] == false)
  1432. {
  1433. datum[StopMiddle - 1] =
  1434. gintuple_get_key(&so->ginstate, itup,
  1435. &category[StopMiddle - 1]);
  1436. datumExtracted[StopMiddle - 1] = true;
  1437. }
  1438. if (entry->queryCategory == GIN_CAT_EMPTY_QUERY)
  1439. {
  1440. /* special behavior depending on searchMode */
  1441. if (entry->searchMode == GIN_SEARCH_MODE_ALL)
  1442. {
  1443. /* match anything except NULL_ITEM */
  1444. if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM)
  1445. res = -1;
  1446. else
  1447. res = 0;
  1448. }
  1449. else
  1450. {
  1451. /* match everything */
  1452. res = 0;
  1453. }
  1454. }
  1455. else
  1456. {
  1457. res = ginCompareEntries(&so->ginstate,
  1458. entry->attnum,
  1459. entry->queryKey,
  1460. entry->queryCategory,
  1461. datum[StopMiddle - 1],
  1462. category[StopMiddle - 1]);
  1463. }
  1464. if (res == 0)
  1465. {
  1466. /*
  1467. * Found exact match (there can be only one, except in
  1468. * EMPTY_QUERY mode).
  1469. *
  1470. * If doing partial match, scan forward from here to
  1471. * end of page to check for matches.
  1472. *
  1473. * See comment above about tuple's ordering.
  1474. */
  1475. if (entry->isPartialMatch)
  1476. key->entryRes[j] =
  1477. matchPartialInPendingList(&so->ginstate,
  1478. page,
  1479. StopMiddle,
  1480. pos->lastOffset,
  1481. entry,
  1482. datum,
  1483. category,
  1484. datumExtracted);
  1485. else
  1486. key->entryRes[j] = true;
  1487. /* done with binary search */
  1488. break;
  1489. }
  1490. else if (res < 0)
  1491. StopHigh = StopMiddle;
  1492. else
  1493. StopLow = StopMiddle + 1;
  1494. }
  1495. if (StopLow >= StopHigh && entry->isPartialMatch)
  1496. {
  1497. /*
  1498. * No exact match on this page. If doing partial match,
  1499. * scan from the first tuple greater than target value to
  1500. * end of page. Note that since we don't remember whether
  1501. * the comparePartialFn told us to stop early on a
  1502. * previous page, we will uselessly apply comparePartialFn
  1503. * to the first tuple on each subsequent page.
  1504. */
  1505. key->entryRes[j] =
  1506. matchPartialInPendingList(&so->ginstate,
  1507. page,
  1508. StopHigh,
  1509. pos->lastOffset,
  1510. entry,
  1511. datum,
  1512. category,
  1513. datumExtracted);
  1514. }
  1515. pos->hasMatchKey[i] |= key->entryRes[j];
  1516. }
  1517. }
  1518. /* Advance firstOffset over the scanned tuples */
  1519. pos->firstOffset = pos->lastOffset;
  1520. if (GinPageHasFullRow(page))
  1521. {
  1522. /*
  1523. * We have examined all pending entries for the current heap row.
  1524. * Break out of loop over pages.
  1525. */
  1526. break;
  1527. }
  1528. else
  1529. {
  1530. /*
  1531. * Advance to next page of pending entries for the current heap
  1532. * row. Complain if there isn't one.
  1533. */
  1534. ItemPointerData item = pos->item;
  1535. if (scanGetCandidate(scan, pos) == false ||
  1536. !ItemPointerEquals(&pos->item, &item))
  1537. elog(ERROR, "could not find additional pending pages for same heap tuple");
  1538. }
  1539. }
  1540. /*
  1541. * Now return "true" if all scan keys have at least one matching datum
  1542. */
  1543. for (i = 0; i < so->nkeys; i++)
  1544. {
  1545. if (pos->hasMatchKey[i] == false)
  1546. return false;
  1547. }
  1548. return true;
  1549. }
  1550. /*
  1551. * Collect all matched rows from pending list into bitmap.
  1552. */
  1553. static void
  1554. scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
  1555. {
  1556. GinScanOpaque so = (GinScanOpaque) scan->opaque;
  1557. MemoryContext oldCtx;
  1558. bool recheck,
  1559. match;
  1560. int i;
  1561. pendingPosition pos;
  1562. Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO);
  1563. Page page;
  1564. BlockNumber blkno;
  1565. *ntids = 0;
  1566. /*
  1567. * Acquire predicate lock on the metapage, to conflict with any fastupdate
  1568. * insertions.
  1569. */
  1570. PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot);
  1571. LockBuffer(metabuffer, GIN_SHARE);
  1572. page = BufferGetPage(metabuffer);
  1573. TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
  1574. blkno = GinPageGetMeta(page)->head;
  1575. /*
  1576. * fetch head of list before unlocking metapage. head page must be pinned
  1577. * to prevent deletion by vacuum process
  1578. */
  1579. if (blkno == InvalidBlockNumber)
  1580. {
  1581. /* No pending list, so proceed with normal scan */
  1582. UnlockReleaseBuffer(metabuffer);
  1583. return;
  1584. }
  1585. pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
  1586. LockBuffer(pos.pendingBuffer, GIN_SHARE);
  1587. pos.firstOffset = FirstOffsetNumber;
  1588. UnlockReleaseBuffer(metabuffer);
  1589. pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys);
  1590. /*
  1591. * loop for each heap row. scanGetCandidate returns full row or row's
  1592. * tuples from first page.
  1593. */
  1594. while (scanGetCandidate(scan, &pos))
  1595. {
  1596. /*
  1597. * Check entries in tuple and set up entryRes array.
  1598. *
  1599. * If pending tuples belonging to the current heap row are spread
  1600. * across several pages, collectMatchesForHeapRow will read all of
  1601. * those pages.
  1602. */
  1603. if (!collectMatchesForHeapRow(scan, &pos))
  1604. continue;
  1605. /*
  1606. * Matching of entries of one row is finished, so check row using
  1607. * consistent functions.
  1608. */
  1609. oldCtx = MemoryContextSwitchTo(so->tempCtx);
  1610. recheck = false;
  1611. match = true;
  1612. for (i = 0; i < so->nkeys; i++)
  1613. {
  1614. GinScanKey key = so->keys + i;
  1615. if (!key->boolConsistentFn(key))
  1616. {
  1617. match = false;
  1618. break;
  1619. }
  1620. recheck |= key->recheckCurItem;
  1621. }
  1622. MemoryContextSwitchTo(oldCtx);
  1623. MemoryContextReset(so->tempCtx);
  1624. if (match)
  1625. {
  1626. tbm_add_tuples(tbm, &pos.item, 1, recheck);
  1627. (*ntids)++;
  1628. }
  1629. }
  1630. pfree(pos.hasMatchKey);
  1631. }
  /*
   * Reads the isVoidRes flag from the GIN opaque state attached to scan
   * descriptor 's'.  The macro uses its argument, so it works for a
   * descriptor of any name, not only a variable called "scan".
   */
  #define GinIsVoidRes(s) ( ((GinScanOpaque) (s)->opaque)->isVoidRes )
  1633. int64
  1634. gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
  1635. {
  1636. GinScanOpaque so = (GinScanOpaque) scan->opaque;
  1637. int64 ntids;
  1638. ItemPointerData iptr;
  1639. bool recheck;
  1640. /*
  1641. * Set up the scan keys, and check for unsatisfiable query.
  1642. */
  1643. ginFreeScanKeys(so); /* there should be no keys yet, but just to be
  1644. * sure */
  1645. ginNewScanKey(scan);
  1646. if (GinIsVoidRes(scan))
  1647. return 0;
  1648. ntids = 0;
  1649. /*
  1650. * First, scan the pending list and collect any matching entries into the
  1651. * bitmap. After we scan a pending item, some other backend could post it
  1652. * into the main index, and so we might visit it a second time during the
  1653. * main scan. This is okay because we'll just re-set the same bit in the
  1654. * bitmap. (The possibility of duplicate visits is a major reason why GIN
  1655. * can't support the amgettuple API, however.) Note that it would not do
  1656. * to scan the main index before the pending list, since concurrent
  1657. * cleanup could then make us miss entries entirely.
  1658. */
  1659. scanPendingInsert(scan, tbm, &ntids);
  1660. /*
  1661. * Now scan the main index.
  1662. */
  1663. startScan(scan);
  1664. ItemPointerSetMin(&iptr);
  1665. for (;;)
  1666. {
  1667. CHECK_FOR_INTERRUPTS();
  1668. if (!scanGetItem(scan, iptr, &iptr, &recheck))
  1669. break;
  1670. if (ItemPointerIsLossyPage(&iptr))
  1671. tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr));
  1672. else
  1673. tbm_add_tuples(tbm, &iptr, 1, recheck);
  1674. ntids++;
  1675. }
  1676. return ntids;
  1677. }