You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

ginfast.c 28KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074
  1. /*-------------------------------------------------------------------------
  2. *
  3. * ginfast.c
  4. * Fast insert routines for the Postgres inverted index access method.
  5. * Pending entries are stored in linear list of pages. Later on
  6. * (typically during VACUUM), ginInsertCleanup() will be invoked to
  7. * transfer pending entries into the regular index structure. This
  8. * wins because bulk insertion is much more efficient than retail.
  9. *
  10. * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
  11. * Portions Copyright (c) 1994, Regents of the University of California
  12. *
  13. * IDENTIFICATION
  14. * src/backend/access/gin/ginfast.c
  15. *
  16. *-------------------------------------------------------------------------
  17. */
  18. #include "postgres.h"
  19. #include "access/gin_private.h"
  20. #include "access/ginxlog.h"
  21. #include "access/xloginsert.h"
  22. #include "access/xlog.h"
  23. #include "commands/vacuum.h"
  24. #include "catalog/pg_am.h"
  25. #include "miscadmin.h"
  26. #include "utils/memutils.h"
  27. #include "utils/rel.h"
  28. #include "utils/acl.h"
  29. #include "postmaster/autovacuum.h"
  30. #include "storage/indexfsm.h"
  31. #include "storage/lmgr.h"
  32. #include "storage/predicate.h"
  33. #include "utils/builtins.h"
/*
 * GUC parameter: pending-list size threshold (in kilobytes) that triggers a
 * cleanup pass; compared against nPendingPages * GIN_PAGE_FREESIZE below.
 */
int			gin_pending_list_limit = 0;

/* Usable payload space on a GIN page: block minus page header and opaque area */
#define GIN_PAGE_FREESIZE \
	( BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(GinPageOpaqueData)) )

/*
 * Workspace used while collecting the key datums of pending tuples.
 * Both arrays grow in lockstep by doubling (see addDatum).
 */
typedef struct KeyArray
{
	Datum	   *keys;			/* expansible array */
	GinNullCategory *categories;	/* another expansible array */
	int32		nvalues;		/* current number of valid entries */
	int32		maxvalues;		/* allocated size of arrays */
} KeyArray;
/*
 * Build a pending-list page from the given array of tuples, and write it out.
 *
 * The page is initialized, the tuples are added in order, and the page is
 * marked dirty and WAL-logged (if needed) inside a single critical section.
 * The buffer is unlocked and released before returning.
 *
 * Returns amount of free space left on the page.
 */
static int32
writeListPage(Relation index, Buffer buffer,
			  IndexTuple *tuples, int32 ntuples, BlockNumber rightlink)
{
	Page		page = BufferGetPage(buffer);
	int32		i,
				freesize,
				size = 0;
	OffsetNumber l,
				off;
	PGAlignedBlock workspace;	/* holds a contiguous copy of the tuples for
								 * the WAL record payload */
	char	   *ptr;

	START_CRIT_SECTION();

	GinInitBuffer(buffer, GIN_LIST);

	off = FirstOffsetNumber;
	ptr = workspace.data;

	for (i = 0; i < ntuples; i++)
	{
		int			this_size = IndexTupleSize(tuples[i]);

		/* accumulate the tuple bytes for WAL while also adding to the page */
		memcpy(ptr, tuples[i], this_size);
		ptr += this_size;
		size += this_size;

		l = PageAddItem(page, (Item) tuples[i], this_size, off, false, false);

		if (l == InvalidOffsetNumber)
			elog(ERROR, "failed to add item to index page in \"%s\"",
				 RelationGetRelationName(index));

		off++;
	}

	Assert(size <= BLCKSZ);		/* else we overran workspace */

	GinPageGetOpaque(page)->rightlink = rightlink;

	/*
	 * tail page may contain only whole row(s) or final part of row placed on
	 * previous pages (a "row" here meaning all the index tuples generated for
	 * one heap tuple)
	 */
	if (rightlink == InvalidBlockNumber)
	{
		/* this is the list tail: it completes a row */
		GinPageSetFullRow(page);
		GinPageGetOpaque(page)->maxoff = 1;
	}
	else
	{
		GinPageGetOpaque(page)->maxoff = 0;
	}

	MarkBufferDirty(buffer);

	if (RelationNeedsWAL(index))
	{
		ginxlogInsertListPage data;
		XLogRecPtr	recptr;

		data.rightlink = rightlink;
		data.ntuples = ntuples;

		XLogBeginInsert();
		XLogRegisterData((char *) &data, sizeof(ginxlogInsertListPage));

		/* whole page is re-initialized at redo, so register full payload */
		XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT);
		XLogRegisterBufData(0, workspace.data, size);

		recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE);
		PageSetLSN(page, recptr);
	}

	/* get free space before releasing buffer */
	freesize = PageGetExactFreeSpace(page);

	UnlockReleaseBuffer(buffer);

	END_CRIT_SECTION();

	return freesize;
}
/*
 * Write the given tuples out as a linked sublist of pending-list pages,
 * splitting across pages as needed.  Fills *res with the sublist's head and
 * tail block numbers, the free space left on the tail page, and the number
 * of pages written.  All tuples are assumed to belong to one heap tuple, so
 * res->nPendingHeapTuples is set to 1.
 */
static void
makeSublist(Relation index, IndexTuple *tuples, int32 ntuples,
			GinMetaPageData *res)
{
	Buffer		curBuffer = InvalidBuffer;
	Buffer		prevBuffer = InvalidBuffer;
	int			i,
				size = 0,
				tupsize;
	int			startTuple = 0;

	Assert(ntuples > 0);

	/*
	 * Split tuples into pages
	 */
	for (i = 0; i < ntuples; i++)
	{
		if (curBuffer == InvalidBuffer)
		{
			curBuffer = GinNewBuffer(index);

			if (prevBuffer != InvalidBuffer)
			{
				/* flush the filled-up previous page, linking it forward */
				res->nPendingPages++;
				writeListPage(index, prevBuffer,
							  tuples + startTuple,
							  i - startTuple,
							  BufferGetBlockNumber(curBuffer));
			}
			else
			{
				res->head = BufferGetBlockNumber(curBuffer);
			}

			prevBuffer = curBuffer;
			startTuple = i;
			size = 0;
		}

		tupsize = MAXALIGN(IndexTupleSize(tuples[i])) + sizeof(ItemIdData);

		if (size + tupsize > GinListPageSize)
		{
			/* won't fit, force a new page and reprocess */
			i--;
			curBuffer = InvalidBuffer;
		}
		else
		{
			size += tupsize;
		}
	}

	/*
	 * Write last page
	 */
	res->tail = BufferGetBlockNumber(curBuffer);
	res->tailFreeSize = writeListPage(index, curBuffer,
									  tuples + startTuple,
									  ntuples - startTuple,
									  InvalidBlockNumber);
	res->nPendingPages++;
	/* that was only one heap tuple */
	res->nPendingHeapTuples = 1;
}
/*
 * Write the index tuples contained in *collector into the index's
 * pending list.
 *
 * Function guarantees that all these tuples will be inserted consecutively,
 * preserving order.
 *
 * Two paths exist: if the collected tuples fit on the current tail page of
 * the pending list, they are appended there; otherwise a new sublist of
 * pages is built (without holding the metapage lock) and linked onto the
 * tail.  May trigger a non-forced ginInsertCleanup() afterwards if the
 * pending list has grown past gin_pending_list_limit.
 */
void
ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
{
	Relation	index = ginstate->index;
	Buffer		metabuffer;
	Page		metapage;
	GinMetaPageData *metadata = NULL;
	Buffer		buffer = InvalidBuffer;
	Page		page = NULL;
	ginxlogUpdateMeta data;
	bool		separateList = false;
	bool		needCleanup = false;
	int			cleanupSize;
	bool		needWal;

	if (collector->ntuples == 0)
		return;

	needWal = RelationNeedsWAL(index);

	data.node = index->rd_node;
	data.ntuples = 0;
	data.newRightlink = data.prevTail = InvalidBlockNumber;

	metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
	metapage = BufferGetPage(metabuffer);

	/*
	 * An insertion to the pending list could logically belong anywhere in the
	 * tree, so it conflicts with all serializable scans.  All scans acquire a
	 * predicate lock on the metabuffer to represent that.
	 */
	CheckForSerializableConflictIn(index, NULL, metabuffer);

	if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize)
	{
		/*
		 * Total size is greater than one page => make sublist
		 */
		separateList = true;
	}
	else
	{
		LockBuffer(metabuffer, GIN_EXCLUSIVE);
		metadata = GinPageGetMeta(metapage);

		if (metadata->head == InvalidBlockNumber ||
			collector->sumsize + collector->ntuples * sizeof(ItemIdData) > metadata->tailFreeSize)
		{
			/*
			 * Pending list is empty or total size is greater than freespace
			 * on tail page => make sublist
			 *
			 * We unlock metabuffer to keep high concurrency
			 */
			separateList = true;
			LockBuffer(metabuffer, GIN_UNLOCK);
		}
	}

	if (separateList)
	{
		/*
		 * We should make sublist separately and append it to the tail
		 */
		GinMetaPageData sublist;

		memset(&sublist, 0, sizeof(GinMetaPageData));
		makeSublist(index, collector->tuples, collector->ntuples, &sublist);

		if (needWal)
			XLogBeginInsert();

		/*
		 * metapage was unlocked, see above
		 */
		LockBuffer(metabuffer, GIN_EXCLUSIVE);
		metadata = GinPageGetMeta(metapage);

		if (metadata->head == InvalidBlockNumber)
		{
			/*
			 * Main list is empty, so just insert sublist as main list
			 */
			START_CRIT_SECTION();

			metadata->head = sublist.head;
			metadata->tail = sublist.tail;
			metadata->tailFreeSize = sublist.tailFreeSize;

			metadata->nPendingPages = sublist.nPendingPages;
			metadata->nPendingHeapTuples = sublist.nPendingHeapTuples;
		}
		else
		{
			/*
			 * Merge lists: link sublist.head as the rightlink of the old
			 * tail page, then advance the metapage tail pointers.
			 */
			data.prevTail = metadata->tail;
			data.newRightlink = sublist.head;

			buffer = ReadBuffer(index, metadata->tail);
			LockBuffer(buffer, GIN_EXCLUSIVE);
			page = BufferGetPage(buffer);

			Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber);

			START_CRIT_SECTION();

			GinPageGetOpaque(page)->rightlink = sublist.head;

			MarkBufferDirty(buffer);

			metadata->tail = sublist.tail;
			metadata->tailFreeSize = sublist.tailFreeSize;

			metadata->nPendingPages += sublist.nPendingPages;
			metadata->nPendingHeapTuples += sublist.nPendingHeapTuples;

			if (needWal)
				XLogRegisterBuffer(1, buffer, REGBUF_STANDARD);
		}
	}
	else
	{
		/*
		 * Insert into tail page.  Metapage is already locked
		 */
		OffsetNumber l,
					off;
		int			i,
					tupsize;
		char	   *ptr;
		char	   *collectordata;

		buffer = ReadBuffer(index, metadata->tail);
		LockBuffer(buffer, GIN_EXCLUSIVE);
		page = BufferGetPage(buffer);

		off = (PageIsEmpty(page)) ? FirstOffsetNumber :
			OffsetNumberNext(PageGetMaxOffsetNumber(page));

		/* contiguous copy of the tuples, used as the WAL record payload */
		collectordata = ptr = (char *) palloc(collector->sumsize);

		data.ntuples = collector->ntuples;

		if (needWal)
			XLogBeginInsert();

		START_CRIT_SECTION();

		/*
		 * Increase counter of heap tuples
		 */
		Assert(GinPageGetOpaque(page)->maxoff <= metadata->nPendingHeapTuples);
		GinPageGetOpaque(page)->maxoff++;
		metadata->nPendingHeapTuples++;

		for (i = 0; i < collector->ntuples; i++)
		{
			tupsize = IndexTupleSize(collector->tuples[i]);
			l = PageAddItem(page, (Item) collector->tuples[i], tupsize, off, false, false);

			if (l == InvalidOffsetNumber)
				elog(ERROR, "failed to add item to index page in \"%s\"",
					 RelationGetRelationName(index));

			memcpy(ptr, collector->tuples[i], tupsize);
			ptr += tupsize;

			off++;
		}

		Assert((ptr - collectordata) <= collector->sumsize);
		if (needWal)
		{
			XLogRegisterBuffer(1, buffer, REGBUF_STANDARD);
			XLogRegisterBufData(1, collectordata, collector->sumsize);
		}

		metadata->tailFreeSize = PageGetExactFreeSpace(page);

		MarkBufferDirty(buffer);
	}

	/*
	 * Set pd_lower just past the end of the metadata.  This is essential,
	 * because without doing so, metadata will be lost if xlog.c compresses
	 * the page.  (We must do this here because pre-v11 versions of PG did not
	 * set the metapage's pd_lower correctly, so a pg_upgraded index might
	 * contain the wrong value.)
	 */
	((PageHeader) metapage)->pd_lower =
		((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage;

	/*
	 * Write metabuffer, make xlog entry
	 */
	MarkBufferDirty(metabuffer);

	if (needWal)
	{
		XLogRecPtr	recptr;

		memcpy(&data.metadata, metadata, sizeof(GinMetaPageData));

		XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT | REGBUF_STANDARD);
		XLogRegisterData((char *) &data, sizeof(ginxlogUpdateMeta));

		recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE);
		PageSetLSN(metapage, recptr);

		if (buffer != InvalidBuffer)
		{
			PageSetLSN(page, recptr);
		}
	}

	if (buffer != InvalidBuffer)
		UnlockReleaseBuffer(buffer);

	/*
	 * Force pending list cleanup when it becomes too long. And,
	 * ginInsertCleanup could take significant amount of time, so we prefer to
	 * call it when it can do all the work in a single collection cycle. In
	 * non-vacuum mode, it shouldn't require maintenance_work_mem, so fire it
	 * while pending list is still small enough to fit into
	 * gin_pending_list_limit.
	 *
	 * ginInsertCleanup() should not be called inside our CRIT_SECTION.
	 */
	cleanupSize = GinGetPendingListCleanupSize(index);
	if (metadata->nPendingPages * GIN_PAGE_FREESIZE > cleanupSize * 1024L)
		needCleanup = true;

	UnlockReleaseBuffer(metabuffer);

	END_CRIT_SECTION();

	/*
	 * Since it could contend with concurrent cleanup process we cleanup
	 * pending list not forcibly.
	 */
	if (needCleanup)
		ginInsertCleanup(ginstate, false, true, false, NULL);
}
/*
 * Create temporary index tuples for a single indexable item (one index column
 * for the heap tuple specified by ht_ctid), and append them to the array
 * in *collector.  They will subsequently be written out using
 * ginHeapTupleFastInsert.  Note that to guarantee consistent state, all
 * temp tuples for a given heap tuple must be written in one call to
 * ginHeapTupleFastInsert.
 */
void
ginHeapTupleFastCollect(GinState *ginstate,
						GinTupleCollector *collector,
						OffsetNumber attnum, Datum value, bool isNull,
						ItemPointer ht_ctid)
{
	Datum	   *entries;
	GinNullCategory *categories;
	int32		i,
				nentries;

	/*
	 * Extract the key values that need to be inserted in the index
	 */
	entries = ginExtractEntries(ginstate, attnum, value, isNull,
								&nentries, &categories);

	/*
	 * Protect against integer overflow in allocation calculations
	 */
	if (nentries < 0 ||
		collector->ntuples + nentries > MaxAllocSize / sizeof(IndexTuple))
		elog(ERROR, "too many entries for GIN index");

	/*
	 * Allocate/reallocate memory for storing collected tuples
	 */
	if (collector->tuples == NULL)
	{
		/*
		 * Determine the number of elements to allocate in the tuples array
		 * initially.  Make it a power of 2 to avoid wasting memory when
		 * resizing (since palloc likes powers of 2).
		 */
		collector->lentuples = 16;
		while (collector->lentuples < nentries)
			collector->lentuples *= 2;

		collector->tuples = (IndexTuple *) palloc(sizeof(IndexTuple) * collector->lentuples);
	}
	else if (collector->lentuples < collector->ntuples + nentries)
	{
		/*
		 * Advance lentuples to the next suitable power of 2.  This won't
		 * overflow, though we could get to a value that exceeds
		 * MaxAllocSize/sizeof(IndexTuple), causing an error in repalloc.
		 */
		do
		{
			collector->lentuples *= 2;
		} while (collector->lentuples < collector->ntuples + nentries);

		collector->tuples = (IndexTuple *) repalloc(collector->tuples,
													sizeof(IndexTuple) * collector->lentuples);
	}

	/*
	 * Build an index tuple for each key value, and add to array.  In pending
	 * tuples we just stick the heap TID into t_tid.
	 */
	for (i = 0; i < nentries; i++)
	{
		IndexTuple	itup;

		itup = GinFormTuple(ginstate, attnum, entries[i], categories[i],
							NULL, 0, 0, true);
		itup->t_tid = *ht_ctid;
		collector->tuples[collector->ntuples++] = itup;
		collector->sumsize += IndexTupleSize(itup);
	}
}
/*
 * Deletes pending list pages up to (not including) newHead page.
 * If newHead == InvalidBlockNumber then function drops the whole list.
 *
 * Pages are processed in batches of up to GIN_NDELETE_AT_ONCE per
 * critical section / WAL record.  If stats isn't null, deleted pages are
 * counted into stats->pages_deleted; if fill_fsm, freed pages are reported
 * to the free space map.
 *
 * metapage is pinned and exclusive-locked throughout this function.
 */
static void
shiftList(Relation index, Buffer metabuffer, BlockNumber newHead,
		  bool fill_fsm, IndexBulkDeleteResult *stats)
{
	Page		metapage;
	GinMetaPageData *metadata;
	BlockNumber blknoToDelete;

	metapage = BufferGetPage(metabuffer);
	metadata = GinPageGetMeta(metapage);
	blknoToDelete = metadata->head;

	do
	{
		Page		page;
		int			i;
		int64		nDeletedHeapTuples = 0;
		ginxlogDeleteListPages data;
		Buffer		buffers[GIN_NDELETE_AT_ONCE];
		BlockNumber freespace[GIN_NDELETE_AT_ONCE];

		data.ndeleted = 0;
		/* read and exclusive-lock one batch of pages to delete */
		while (data.ndeleted < GIN_NDELETE_AT_ONCE && blknoToDelete != newHead)
		{
			freespace[data.ndeleted] = blknoToDelete;
			buffers[data.ndeleted] = ReadBuffer(index, blknoToDelete);
			LockBuffer(buffers[data.ndeleted], GIN_EXCLUSIVE);
			page = BufferGetPage(buffers[data.ndeleted]);

			data.ndeleted++;

			Assert(!GinPageIsDeleted(page));

			nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff;
			blknoToDelete = GinPageGetOpaque(page)->rightlink;
		}

		if (stats)
			stats->pages_deleted += data.ndeleted;

		/*
		 * This operation touches an unusually large number of pages, so
		 * prepare the XLogInsert machinery for that before entering the
		 * critical section.
		 */
		if (RelationNeedsWAL(index))
			XLogEnsureRecordSpace(data.ndeleted, 0);

		START_CRIT_SECTION();

		metadata->head = blknoToDelete;

		Assert(metadata->nPendingPages >= data.ndeleted);
		metadata->nPendingPages -= data.ndeleted;
		Assert(metadata->nPendingHeapTuples >= nDeletedHeapTuples);
		metadata->nPendingHeapTuples -= nDeletedHeapTuples;

		if (blknoToDelete == InvalidBlockNumber)
		{
			/* entire list is gone: reset all pending-list metadata */
			metadata->tail = InvalidBlockNumber;
			metadata->tailFreeSize = 0;
			metadata->nPendingPages = 0;
			metadata->nPendingHeapTuples = 0;
		}

		/*
		 * Set pd_lower just past the end of the metadata.  This is essential,
		 * because without doing so, metadata will be lost if xlog.c
		 * compresses the page.  (We must do this here because pre-v11
		 * versions of PG did not set the metapage's pd_lower correctly, so a
		 * pg_upgraded index might contain the wrong value.)
		 */
		((PageHeader) metapage)->pd_lower =
			((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage;

		MarkBufferDirty(metabuffer);

		for (i = 0; i < data.ndeleted; i++)
		{
			page = BufferGetPage(buffers[i]);
			GinPageGetOpaque(page)->flags = GIN_DELETED;
			MarkBufferDirty(buffers[i]);
		}

		if (RelationNeedsWAL(index))
		{
			XLogRecPtr	recptr;

			XLogBeginInsert();
			XLogRegisterBuffer(0, metabuffer,
							   REGBUF_WILL_INIT | REGBUF_STANDARD);
			for (i = 0; i < data.ndeleted; i++)
				XLogRegisterBuffer(i + 1, buffers[i], REGBUF_WILL_INIT);

			memcpy(&data.metadata, metadata, sizeof(GinMetaPageData));

			XLogRegisterData((char *) &data,
							 sizeof(ginxlogDeleteListPages));

			recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE);
			PageSetLSN(metapage, recptr);

			for (i = 0; i < data.ndeleted; i++)
			{
				page = BufferGetPage(buffers[i]);
				PageSetLSN(page, recptr);
			}
		}

		for (i = 0; i < data.ndeleted; i++)
			UnlockReleaseBuffer(buffers[i]);

		END_CRIT_SECTION();

		/* report freed pages to the FSM only after the crit section ends */
		for (i = 0; fill_fsm && i < data.ndeleted; i++)
			RecordFreeIndexPage(index, freespace[i]);

	} while (blknoToDelete != newHead);
}
  550. /* Initialize empty KeyArray */
  551. static void
  552. initKeyArray(KeyArray *keys, int32 maxvalues)
  553. {
  554. keys->keys = (Datum *) palloc(sizeof(Datum) * maxvalues);
  555. keys->categories = (GinNullCategory *)
  556. palloc(sizeof(GinNullCategory) * maxvalues);
  557. keys->nvalues = 0;
  558. keys->maxvalues = maxvalues;
  559. }
  560. /* Add datum to KeyArray, resizing if needed */
  561. static void
  562. addDatum(KeyArray *keys, Datum datum, GinNullCategory category)
  563. {
  564. if (keys->nvalues >= keys->maxvalues)
  565. {
  566. keys->maxvalues *= 2;
  567. keys->keys = (Datum *)
  568. repalloc(keys->keys, sizeof(Datum) * keys->maxvalues);
  569. keys->categories = (GinNullCategory *)
  570. repalloc(keys->categories, sizeof(GinNullCategory) * keys->maxvalues);
  571. }
  572. keys->keys[keys->nvalues] = datum;
  573. keys->categories[keys->nvalues] = category;
  574. keys->nvalues++;
  575. }
/*
 * Collect data from a pending-list page in preparation for insertion into
 * the main index.
 *
 * Go through all tuples >= startoff on page and collect values in accum
 *
 * Note that ka is just workspace --- it does not carry any state across
 * calls.
 */
static void
processPendingPage(BuildAccumulator *accum, KeyArray *ka,
				   Page page, OffsetNumber startoff)
{
	ItemPointerData heapptr;	/* heap TID of the current run of tuples */
	OffsetNumber i,
				maxoff;
	OffsetNumber attrnum;		/* attnum of the current run of tuples */

	/* reset *ka to empty */
	ka->nvalues = 0;

	maxoff = PageGetMaxOffsetNumber(page);
	Assert(maxoff >= FirstOffsetNumber);
	ItemPointerSetInvalid(&heapptr);
	attrnum = 0;

	for (i = startoff; i <= maxoff; i = OffsetNumberNext(i))
	{
		IndexTuple	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
		OffsetNumber curattnum;
		Datum		curkey;
		GinNullCategory curcategory;

		/* Check for change of heap TID or attnum */
		curattnum = gintuple_get_attrnum(accum->ginstate, itup);

		if (!ItemPointerIsValid(&heapptr))
		{
			/* first tuple: start the first run */
			heapptr = itup->t_tid;
			attrnum = curattnum;
		}
		else if (!(ItemPointerEquals(&heapptr, &itup->t_tid) &&
				   curattnum == attrnum))
		{
			/*
			 * ginInsertBAEntries can insert several datums per call, but only
			 * for one heap tuple and one column.  So call it at a boundary,
			 * and reset ka.
			 */
			ginInsertBAEntries(accum, &heapptr, attrnum,
							   ka->keys, ka->categories, ka->nvalues);
			ka->nvalues = 0;
			heapptr = itup->t_tid;
			attrnum = curattnum;
		}

		/* Add key to KeyArray */
		curkey = gintuple_get_key(accum->ginstate, itup, &curcategory);
		addDatum(ka, curkey, curcategory);
	}

	/* Dump out all remaining keys */
	ginInsertBAEntries(accum, &heapptr, attrnum,
					   ka->keys, ka->categories, ka->nvalues);
}
/*
 * Move tuples from pending pages into regular GIN structure.
 *
 * On first glance it looks completely not crash-safe.  But if we crash
 * after posting entries to the main index and before removing them from the
 * pending list, it's okay because when we redo the posting later on, nothing
 * bad will happen.
 *
 * full_clean indicates that we should cleanup the whole pending list rather
 * than stopping at the page that was the tail when we started (which guards
 * against chasing concurrent insertions forever).
 *
 * fill_fsm indicates that ginInsertCleanup should add deleted pages
 * to FSM otherwise caller is responsible to put deleted pages into
 * FSM.
 *
 * forceCleanup makes us wait for any concurrent cleanup to finish (vacuum /
 * gin_clean_pending_list path); otherwise we give up immediately if another
 * backend holds the cleanup lock.
 *
 * If stats isn't null, we count deleted pending pages into the counts.
 */
void
ginInsertCleanup(GinState *ginstate, bool full_clean,
				 bool fill_fsm, bool forceCleanup,
				 IndexBulkDeleteResult *stats)
{
	Relation	index = ginstate->index;
	Buffer		metabuffer,
				buffer;
	Page		metapage,
				page;
	GinMetaPageData *metadata;
	MemoryContext opCtx,
				oldCtx;
	BuildAccumulator accum;
	KeyArray	datums;
	BlockNumber blkno,
				blknoFinish;
	bool		cleanupFinish = false;
	bool		fsm_vac = false;
	Size		workMemory;

	/*
	 * We would like to prevent concurrent cleanup process.  For that we will
	 * lock metapage in exclusive mode using LockPage() call.  Nobody other
	 * will use that lock for metapage, so we keep possibility of concurrent
	 * insertion into pending list
	 */
	if (forceCleanup)
	{
		/*
		 * We are called from [auto]vacuum/analyze or gin_clean_pending_list()
		 * and we would like to wait concurrent cleanup to finish.
		 */
		LockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock);
		workMemory =
			(IsAutoVacuumWorkerProcess() && autovacuum_work_mem != -1) ?
			autovacuum_work_mem : maintenance_work_mem;
	}
	else
	{
		/*
		 * We are called from regular insert and if we see concurrent cleanup
		 * just exit in hope that concurrent process will clean up pending
		 * list.
		 */
		if (!ConditionalLockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock))
			return;
		workMemory = work_mem;
	}

	metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
	LockBuffer(metabuffer, GIN_SHARE);
	metapage = BufferGetPage(metabuffer);
	metadata = GinPageGetMeta(metapage);

	if (metadata->head == InvalidBlockNumber)
	{
		/* Nothing to do */
		UnlockReleaseBuffer(metabuffer);
		UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock);
		return;
	}

	/*
	 * Remember a tail page to prevent infinite cleanup if other backends add
	 * new tuples faster than we can cleanup.
	 */
	blknoFinish = metadata->tail;

	/*
	 * Read and lock head of pending list
	 */
	blkno = metadata->head;
	buffer = ReadBuffer(index, blkno);
	LockBuffer(buffer, GIN_SHARE);
	page = BufferGetPage(buffer);

	LockBuffer(metabuffer, GIN_UNLOCK);

	/*
	 * Initialize.  All temporary space will be in opCtx
	 */
	opCtx = AllocSetContextCreate(CurrentMemoryContext,
								  "GIN insert cleanup temporary context",
								  ALLOCSET_DEFAULT_SIZES);

	oldCtx = MemoryContextSwitchTo(opCtx);

	initKeyArray(&datums, 128);
	ginInitBA(&accum);
	accum.ginstate = ginstate;

	/*
	 * At the top of this loop, we have pin and lock on the current page of
	 * the pending list.  However, we'll release that before exiting the loop.
	 * Note we also have pin but not lock on the metapage.
	 */
	for (;;)
	{
		Assert(!GinPageIsDeleted(page));

		/*
		 * Are we walk through the page which as we remember was a tail when
		 * we start our cleanup?  But if caller asks us to clean up whole
		 * pending list then ignore old tail, we will work until list becomes
		 * empty.
		 */
		if (blkno == blknoFinish && full_clean == false)
			cleanupFinish = true;

		/*
		 * read page's datums into accum
		 */
		processPendingPage(&accum, &datums, page, FirstOffsetNumber);

		vacuum_delay_point();

		/*
		 * Is it time to flush memory to disk?	Flush if we are at the end of
		 * the pending list, or if we have a full row and memory is getting
		 * full.
		 */
		if (GinPageGetOpaque(page)->rightlink == InvalidBlockNumber ||
			(GinPageHasFullRow(page) &&
			 (accum.allocatedMemory >= workMemory * 1024L)))
		{
			ItemPointerData *list;
			uint32		nlist;
			Datum		key;
			GinNullCategory category;
			OffsetNumber maxoff,
						attnum;

			/*
			 * Unlock current page to increase performance.  Changes of page
			 * will be checked later by comparing maxoff after completion of
			 * memory flush.
			 */
			maxoff = PageGetMaxOffsetNumber(page);
			LockBuffer(buffer, GIN_UNLOCK);

			/*
			 * Moving collected data into regular structure can take
			 * significant amount of time - so, run it without locking pending
			 * list.
			 */
			ginBeginBAScan(&accum);
			while ((list = ginGetBAEntry(&accum,
										 &attnum, &key, &category, &nlist)) != NULL)
			{
				ginEntryInsert(ginstate, attnum, key, category,
							   list, nlist, NULL);
				vacuum_delay_point();
			}

			/*
			 * Lock the whole list to remove pages
			 */
			LockBuffer(metabuffer, GIN_EXCLUSIVE);
			LockBuffer(buffer, GIN_SHARE);

			Assert(!GinPageIsDeleted(page));

			/*
			 * While we left the page unlocked, more stuff might have gotten
			 * added to it.  If so, process those entries immediately.  There
			 * shouldn't be very many, so we don't worry about the fact that
			 * we're doing this with exclusive lock.  Insertion algorithm
			 * guarantees that inserted row(s) will not continue on next page.
			 * NOTE: intentionally no vacuum_delay_point in this loop.
			 */
			if (PageGetMaxOffsetNumber(page) != maxoff)
			{
				ginInitBA(&accum);
				processPendingPage(&accum, &datums, page, maxoff + 1);

				ginBeginBAScan(&accum);
				while ((list = ginGetBAEntry(&accum,
											 &attnum, &key, &category, &nlist)) != NULL)
					ginEntryInsert(ginstate, attnum, key, category,
								   list, nlist, NULL);
			}

			/*
			 * Remember next page - it will become the new list head
			 */
			blkno = GinPageGetOpaque(page)->rightlink;
			UnlockReleaseBuffer(buffer);	/* shiftList will do exclusive
											 * locking */

			/*
			 * remove read pages from pending list, at this point all content
			 * of read pages is in regular structure
			 */
			shiftList(index, metabuffer, blkno, fill_fsm, stats);

			/* At this point, some pending pages have been freed up */
			fsm_vac = true;

			Assert(blkno == metadata->head);
			LockBuffer(metabuffer, GIN_UNLOCK);

			/*
			 * if we removed the whole pending list or we cleanup tail (which
			 * we remembered on start our cleanup process) then just exit
			 */
			if (blkno == InvalidBlockNumber || cleanupFinish)
				break;

			/*
			 * release memory used so far and reinit state
			 */
			MemoryContextReset(opCtx);
			initKeyArray(&datums, datums.maxvalues);
			ginInitBA(&accum);
		}
		else
		{
			/* not flushing yet: just advance to the next list page */
			blkno = GinPageGetOpaque(page)->rightlink;
			UnlockReleaseBuffer(buffer);
		}

		/*
		 * Read next page in pending list
		 */
		vacuum_delay_point();
		buffer = ReadBuffer(index, blkno);
		LockBuffer(buffer, GIN_SHARE);
		page = BufferGetPage(buffer);
	}

	UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock);
	ReleaseBuffer(metabuffer);

	/*
	 * As pending list pages can have a high churn rate, it is desirable to
	 * recycle them immediately to the FreeSpace Map when ordinary backends
	 * clean the list.
	 */
	if (fsm_vac && fill_fsm)
		IndexFreeSpaceMapVacuum(index);

	/* Clean up temporary space */
	MemoryContextSwitchTo(oldCtx);
	MemoryContextDelete(opCtx);
}
/*
 * SQL-callable function to clean the insert pending list
 *
 * Validates that the target is a local, non-temporary GIN index owned by the
 * caller, then runs a full, forced ginInsertCleanup().  Returns the number
 * of pending-list pages deleted.
 */
Datum
gin_clean_pending_list(PG_FUNCTION_ARGS)
{
	Oid			indexoid = PG_GETARG_OID(0);
	Relation	indexRel = index_open(indexoid, AccessShareLock);
	IndexBulkDeleteResult stats;
	GinState	ginstate;

	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("GIN pending list cannot be cleaned up during recovery.")));

	/* Must be a GIN index */
	if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
		indexRel->rd_rel->relam != GIN_AM_OID)
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("\"%s\" is not a GIN index",
						RelationGetRelationName(indexRel))));

	/*
	 * Reject attempts to read non-local temporary relations; we would be
	 * likely to get wrong data since we have no visibility into the owning
	 * session's local buffers.
	 */
	if (RELATION_IS_OTHER_TEMP(indexRel))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot access temporary indexes of other sessions")));

	/* User must own the index (comparable to privileges needed for VACUUM) */
	if (!pg_class_ownercheck(indexoid, GetUserId()))
		aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
					   RelationGetRelationName(indexRel));

	memset(&stats, 0, sizeof(stats));

	initGinState(&ginstate, indexRel);
	ginInsertCleanup(&ginstate, true, true, true, &stats);

	index_close(indexRel, AccessShareLock);

	PG_RETURN_INT64((int64) stats.pages_deleted);
}