You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

brin.c 42KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495
  1. /*
  2. * brin.c
  3. * Implementation of BRIN indexes for Postgres
  4. *
  5. * See src/backend/access/brin/README for details.
  6. *
  7. * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
  8. * Portions Copyright (c) 1994, Regents of the University of California
  9. *
  10. * IDENTIFICATION
  11. * src/backend/access/brin/brin.c
  12. *
  13. * TODO
  14. * * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
  15. */
  16. #include "postgres.h"
  17. #include "access/brin.h"
  18. #include "access/brin_page.h"
  19. #include "access/brin_pageops.h"
  20. #include "access/brin_xlog.h"
  21. #include "access/reloptions.h"
  22. #include "access/relscan.h"
  23. #include "access/xloginsert.h"
  24. #include "catalog/index.h"
  25. #include "catalog/pg_am.h"
  26. #include "miscadmin.h"
  27. #include "pgstat.h"
  28. #include "postmaster/autovacuum.h"
  29. #include "storage/bufmgr.h"
  30. #include "storage/freespace.h"
  31. #include "utils/builtins.h"
  32. #include "utils/index_selfuncs.h"
  33. #include "utils/memutils.h"
  34. #include "utils/rel.h"
  35. /*
  36. * We use a BrinBuildState during initial construction of a BRIN index.
  37. * The running state is kept in a BrinMemTuple.
  38. */
  39. typedef struct BrinBuildState
  40. {
  41. Relation bs_irel;
  42. int bs_numtuples;
  43. Buffer bs_currentInsertBuf;
  44. BlockNumber bs_pagesPerRange;
  45. BlockNumber bs_currRangeStart;
  46. BrinRevmap *bs_rmAccess;
  47. BrinDesc *bs_bdesc;
  48. BrinMemTuple *bs_dtuple;
  49. } BrinBuildState;
  50. /*
  51. * Struct used as "opaque" during index scans
  52. */
  53. typedef struct BrinOpaque
  54. {
  55. BlockNumber bo_pagesPerRange;
  56. BrinRevmap *bo_rmAccess;
  57. BrinDesc *bo_bdesc;
  58. } BrinOpaque;
  59. #define BRIN_ALL_BLOCKRANGES InvalidBlockNumber
  60. static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
  61. BrinRevmap *revmap, BlockNumber pagesPerRange);
  62. static void terminate_brin_buildstate(BrinBuildState *state);
  63. static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
  64. bool include_partial, double *numSummarized, double *numExisting);
  65. static void form_and_insert_tuple(BrinBuildState *state);
  66. static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
  67. BrinTuple *b);
  68. static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
  69. /*
  70. * BRIN handler function: return IndexAmRoutine with access method parameters
  71. * and callbacks.
  72. */
  73. Datum
  74. brinhandler(PG_FUNCTION_ARGS)
  75. {
  76. IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
  77. amroutine->amstrategies = 0;
  78. amroutine->amsupport = BRIN_LAST_OPTIONAL_PROCNUM;
  79. amroutine->amcanorder = false;
  80. amroutine->amcanorderbyop = false;
  81. amroutine->amcanbackward = false;
  82. amroutine->amcanunique = false;
  83. amroutine->amcanmulticol = true;
  84. amroutine->amoptionalkey = true;
  85. amroutine->amsearcharray = false;
  86. amroutine->amsearchnulls = true;
  87. amroutine->amstorage = true;
  88. amroutine->amclusterable = false;
  89. amroutine->ampredlocks = false;
  90. amroutine->amcanparallel = false;
  91. amroutine->amcaninclude = false;
  92. amroutine->amkeytype = InvalidOid;
  93. amroutine->ambuild = brinbuild;
  94. amroutine->ambuildempty = brinbuildempty;
  95. amroutine->aminsert = brininsert;
  96. amroutine->ambulkdelete = brinbulkdelete;
  97. amroutine->amvacuumcleanup = brinvacuumcleanup;
  98. amroutine->amcanreturn = NULL;
  99. amroutine->amcostestimate = brincostestimate;
  100. amroutine->amoptions = brinoptions;
  101. amroutine->amproperty = NULL;
  102. amroutine->amvalidate = brinvalidate;
  103. amroutine->ambeginscan = brinbeginscan;
  104. amroutine->amrescan = brinrescan;
  105. amroutine->amgettuple = NULL;
  106. amroutine->amgetbitmap = bringetbitmap;
  107. amroutine->amendscan = brinendscan;
  108. amroutine->ammarkpos = NULL;
  109. amroutine->amrestrpos = NULL;
  110. amroutine->amestimateparallelscan = NULL;
  111. amroutine->aminitparallelscan = NULL;
  112. amroutine->amparallelrescan = NULL;
  113. PG_RETURN_POINTER(amroutine);
  114. }
  115. /*
  116. * A tuple in the heap is being inserted. To keep a brin index up to date,
  117. * we need to obtain the relevant index tuple and compare its stored values
  118. * with those of the new tuple. If the tuple values are not consistent with
  119. * the summary tuple, we need to update the index tuple.
  120. *
  121. * If autosummarization is enabled, check if we need to summarize the previous
  122. * page range.
  123. *
  124. * If the range is not currently summarized (i.e. the revmap returns NULL for
  125. * it), there's nothing to do for this tuple.
  126. */
  127. bool
  128. brininsert(Relation idxRel, Datum *values, bool *nulls,
  129. ItemPointer heaptid, Relation heapRel,
  130. IndexUniqueCheck checkUnique,
  131. IndexInfo *indexInfo)
  132. {
  133. BlockNumber pagesPerRange;
  134. BlockNumber origHeapBlk;
  135. BlockNumber heapBlk;
  136. BrinDesc *bdesc = (BrinDesc *) indexInfo->ii_AmCache;
  137. BrinRevmap *revmap;
  138. Buffer buf = InvalidBuffer;
  139. MemoryContext tupcxt = NULL;
  140. MemoryContext oldcxt = CurrentMemoryContext;
  141. bool autosummarize = BrinGetAutoSummarize(idxRel);
  142. revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL);
  143. /*
  144. * origHeapBlk is the block number where the insertion occurred. heapBlk
  145. * is the first block in the corresponding page range.
  146. */
  147. origHeapBlk = ItemPointerGetBlockNumber(heaptid);
  148. heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;
  149. for (;;)
  150. {
  151. bool need_insert = false;
  152. OffsetNumber off;
  153. BrinTuple *brtup;
  154. BrinMemTuple *dtup;
  155. int keyno;
  156. CHECK_FOR_INTERRUPTS();
  157. /*
  158. * If auto-summarization is enabled and we just inserted the first
  159. * tuple into the first block of a new non-first page range, request a
  160. * summarization run of the previous range.
  161. */
  162. if (autosummarize &&
  163. heapBlk > 0 &&
  164. heapBlk == origHeapBlk &&
  165. ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber)
  166. {
  167. BlockNumber lastPageRange = heapBlk - 1;
  168. BrinTuple *lastPageTuple;
  169. lastPageTuple =
  170. brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off,
  171. NULL, BUFFER_LOCK_SHARE, NULL);
  172. if (!lastPageTuple)
  173. {
  174. bool recorded;
  175. recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange,
  176. RelationGetRelid(idxRel),
  177. lastPageRange);
  178. if (!recorded)
  179. ereport(LOG,
  180. (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
  181. errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded",
  182. RelationGetRelationName(idxRel),
  183. lastPageRange)));
  184. }
  185. else
  186. LockBuffer(buf, BUFFER_LOCK_UNLOCK);
  187. }
  188. brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off,
  189. NULL, BUFFER_LOCK_SHARE, NULL);
  190. /* if range is unsummarized, there's nothing to do */
  191. if (!brtup)
  192. break;
  193. /* First time through in this statement? */
  194. if (bdesc == NULL)
  195. {
  196. MemoryContextSwitchTo(indexInfo->ii_Context);
  197. bdesc = brin_build_desc(idxRel);
  198. indexInfo->ii_AmCache = (void *) bdesc;
  199. MemoryContextSwitchTo(oldcxt);
  200. }
  201. /* First time through in this brininsert call? */
  202. if (tupcxt == NULL)
  203. {
  204. tupcxt = AllocSetContextCreate(CurrentMemoryContext,
  205. "brininsert cxt",
  206. ALLOCSET_DEFAULT_SIZES);
  207. MemoryContextSwitchTo(tupcxt);
  208. }
  209. dtup = brin_deform_tuple(bdesc, brtup, NULL);
  210. /*
  211. * Compare the key values of the new tuple to the stored index values;
  212. * our deformed tuple will get updated if the new tuple doesn't fit
  213. * the original range (note this means we can't break out of the loop
  214. * early). Make a note of whether this happens, so that we know to
  215. * insert the modified tuple later.
  216. */
  217. for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
  218. {
  219. Datum result;
  220. BrinValues *bval;
  221. FmgrInfo *addValue;
  222. bval = &dtup->bt_columns[keyno];
  223. addValue = index_getprocinfo(idxRel, keyno + 1,
  224. BRIN_PROCNUM_ADDVALUE);
  225. result = FunctionCall4Coll(addValue,
  226. idxRel->rd_indcollation[keyno],
  227. PointerGetDatum(bdesc),
  228. PointerGetDatum(bval),
  229. values[keyno],
  230. nulls[keyno]);
  231. /* if that returned true, we need to insert the updated tuple */
  232. need_insert |= DatumGetBool(result);
  233. }
  234. if (!need_insert)
  235. {
  236. /*
  237. * The tuple is consistent with the new values, so there's nothing
  238. * to do.
  239. */
  240. LockBuffer(buf, BUFFER_LOCK_UNLOCK);
  241. }
  242. else
  243. {
  244. Page page = BufferGetPage(buf);
  245. ItemId lp = PageGetItemId(page, off);
  246. Size origsz;
  247. BrinTuple *origtup;
  248. Size newsz;
  249. BrinTuple *newtup;
  250. bool samepage;
  251. /*
  252. * Make a copy of the old tuple, so that we can compare it after
  253. * re-acquiring the lock.
  254. */
  255. origsz = ItemIdGetLength(lp);
  256. origtup = brin_copy_tuple(brtup, origsz, NULL, NULL);
  257. /*
  258. * Before releasing the lock, check if we can attempt a same-page
  259. * update. Another process could insert a tuple concurrently in
  260. * the same page though, so downstream we must be prepared to cope
  261. * if this turns out to not be possible after all.
  262. */
  263. newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
  264. samepage = brin_can_do_samepage_update(buf, origsz, newsz);
  265. LockBuffer(buf, BUFFER_LOCK_UNLOCK);
  266. /*
  267. * Try to update the tuple. If this doesn't work for whatever
  268. * reason, we need to restart from the top; the revmap might be
  269. * pointing at a different tuple for this block now, so we need to
  270. * recompute to ensure both our new heap tuple and the other
  271. * inserter's are covered by the combined tuple. It might be that
  272. * we don't need to update at all.
  273. */
  274. if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
  275. buf, off, origtup, origsz, newtup, newsz,
  276. samepage))
  277. {
  278. /* no luck; start over */
  279. MemoryContextResetAndDeleteChildren(tupcxt);
  280. continue;
  281. }
  282. }
  283. /* success! */
  284. break;
  285. }
  286. brinRevmapTerminate(revmap);
  287. if (BufferIsValid(buf))
  288. ReleaseBuffer(buf);
  289. MemoryContextSwitchTo(oldcxt);
  290. if (tupcxt != NULL)
  291. MemoryContextDelete(tupcxt);
  292. return false;
  293. }
  294. /*
  295. * Initialize state for a BRIN index scan.
  296. *
  297. * We read the metapage here to determine the pages-per-range number that this
  298. * index was built with. Note that since this cannot be changed while we're
  299. * holding lock on index, it's not necessary to recompute it during brinrescan.
  300. */
  301. IndexScanDesc
  302. brinbeginscan(Relation r, int nkeys, int norderbys)
  303. {
  304. IndexScanDesc scan;
  305. BrinOpaque *opaque;
  306. scan = RelationGetIndexScan(r, nkeys, norderbys);
  307. opaque = (BrinOpaque *) palloc(sizeof(BrinOpaque));
  308. opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange,
  309. scan->xs_snapshot);
  310. opaque->bo_bdesc = brin_build_desc(r);
  311. scan->opaque = opaque;
  312. return scan;
  313. }
  314. /*
  315. * Execute the index scan.
  316. *
  317. * This works by reading index TIDs from the revmap, and obtaining the index
  318. * tuples pointed to by them; the summary values in the index tuples are
  319. * compared to the scan keys. We return into the TID bitmap all the pages in
  320. * ranges corresponding to index tuples that match the scan keys.
  321. *
  322. * If a TID from the revmap is read as InvalidTID, we know that range is
  323. * unsummarized. Pages in those ranges need to be returned regardless of scan
  324. * keys.
  325. */
  326. int64
  327. bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
  328. {
  329. Relation idxRel = scan->indexRelation;
  330. Buffer buf = InvalidBuffer;
  331. BrinDesc *bdesc;
  332. Oid heapOid;
  333. Relation heapRel;
  334. BrinOpaque *opaque;
  335. BlockNumber nblocks;
  336. BlockNumber heapBlk;
  337. int totalpages = 0;
  338. FmgrInfo *consistentFn;
  339. MemoryContext oldcxt;
  340. MemoryContext perRangeCxt;
  341. BrinMemTuple *dtup;
  342. BrinTuple *btup = NULL;
  343. Size btupsz = 0;
  344. opaque = (BrinOpaque *) scan->opaque;
  345. bdesc = opaque->bo_bdesc;
  346. pgstat_count_index_scan(idxRel);
  347. /*
  348. * We need to know the size of the table so that we know how long to
  349. * iterate on the revmap.
  350. */
  351. heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
  352. heapRel = heap_open(heapOid, AccessShareLock);
  353. nblocks = RelationGetNumberOfBlocks(heapRel);
  354. heap_close(heapRel, AccessShareLock);
  355. /*
  356. * Make room for the consistent support procedures of indexed columns. We
  357. * don't look them up here; we do that lazily the first time we see a scan
  358. * key reference each of them. We rely on zeroing fn_oid to InvalidOid.
  359. */
  360. consistentFn = palloc0(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts);
  361. /* allocate an initial in-memory tuple, out of the per-range memcxt */
  362. dtup = brin_new_memtuple(bdesc);
  363. /*
  364. * Setup and use a per-range memory context, which is reset every time we
  365. * loop below. This avoids having to free the tuples within the loop.
  366. */
  367. perRangeCxt = AllocSetContextCreate(CurrentMemoryContext,
  368. "bringetbitmap cxt",
  369. ALLOCSET_DEFAULT_SIZES);
  370. oldcxt = MemoryContextSwitchTo(perRangeCxt);
  371. /*
  372. * Now scan the revmap. We start by querying for heap page 0,
  373. * incrementing by the number of pages per range; this gives us a full
  374. * view of the table.
  375. */
  376. for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
  377. {
  378. bool addrange;
  379. bool gottuple = false;
  380. BrinTuple *tup;
  381. OffsetNumber off;
  382. Size size;
  383. CHECK_FOR_INTERRUPTS();
  384. MemoryContextResetAndDeleteChildren(perRangeCxt);
  385. tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,
  386. &off, &size, BUFFER_LOCK_SHARE,
  387. scan->xs_snapshot);
  388. if (tup)
  389. {
  390. gottuple = true;
  391. btup = brin_copy_tuple(tup, size, btup, &btupsz);
  392. LockBuffer(buf, BUFFER_LOCK_UNLOCK);
  393. }
  394. /*
  395. * For page ranges with no indexed tuple, we must return the whole
  396. * range; otherwise, compare it to the scan keys.
  397. */
  398. if (!gottuple)
  399. {
  400. addrange = true;
  401. }
  402. else
  403. {
  404. dtup = brin_deform_tuple(bdesc, btup, dtup);
  405. if (dtup->bt_placeholder)
  406. {
  407. /*
  408. * Placeholder tuples are always returned, regardless of the
  409. * values stored in them.
  410. */
  411. addrange = true;
  412. }
  413. else
  414. {
  415. int keyno;
  416. /*
  417. * Compare scan keys with summary values stored for the range.
  418. * If scan keys are matched, the page range must be added to
  419. * the bitmap. We initially assume the range needs to be
  420. * added; in particular this serves the case where there are
  421. * no keys.
  422. */
  423. addrange = true;
  424. for (keyno = 0; keyno < scan->numberOfKeys; keyno++)
  425. {
  426. ScanKey key = &scan->keyData[keyno];
  427. AttrNumber keyattno = key->sk_attno;
  428. BrinValues *bval = &dtup->bt_columns[keyattno - 1];
  429. Datum add;
  430. /*
  431. * The collation of the scan key must match the collation
  432. * used in the index column (but only if the search is not
  433. * IS NULL/ IS NOT NULL). Otherwise we shouldn't be using
  434. * this index ...
  435. */
  436. Assert((key->sk_flags & SK_ISNULL) ||
  437. (key->sk_collation ==
  438. TupleDescAttr(bdesc->bd_tupdesc,
  439. keyattno - 1)->attcollation));
  440. /* First time this column? look up consistent function */
  441. if (consistentFn[keyattno - 1].fn_oid == InvalidOid)
  442. {
  443. FmgrInfo *tmp;
  444. tmp = index_getprocinfo(idxRel, keyattno,
  445. BRIN_PROCNUM_CONSISTENT);
  446. fmgr_info_copy(&consistentFn[keyattno - 1], tmp,
  447. CurrentMemoryContext);
  448. }
  449. /*
  450. * Check whether the scan key is consistent with the page
  451. * range values; if so, have the pages in the range added
  452. * to the output bitmap.
  453. *
  454. * When there are multiple scan keys, failure to meet the
  455. * criteria for a single one of them is enough to discard
  456. * the range as a whole, so break out of the loop as soon
  457. * as a false return value is obtained.
  458. */
  459. add = FunctionCall3Coll(&consistentFn[keyattno - 1],
  460. key->sk_collation,
  461. PointerGetDatum(bdesc),
  462. PointerGetDatum(bval),
  463. PointerGetDatum(key));
  464. addrange = DatumGetBool(add);
  465. if (!addrange)
  466. break;
  467. }
  468. }
  469. }
  470. /* add the pages in the range to the output bitmap, if needed */
  471. if (addrange)
  472. {
  473. BlockNumber pageno;
  474. for (pageno = heapBlk;
  475. pageno <= heapBlk + opaque->bo_pagesPerRange - 1;
  476. pageno++)
  477. {
  478. MemoryContextSwitchTo(oldcxt);
  479. tbm_add_page(tbm, pageno);
  480. totalpages++;
  481. MemoryContextSwitchTo(perRangeCxt);
  482. }
  483. }
  484. }
  485. MemoryContextSwitchTo(oldcxt);
  486. MemoryContextDelete(perRangeCxt);
  487. if (buf != InvalidBuffer)
  488. ReleaseBuffer(buf);
  489. /*
  490. * XXX We have an approximation of the number of *pages* that our scan
  491. * returns, but we don't have a precise idea of the number of heap tuples
  492. * involved.
  493. */
  494. return totalpages * 10;
  495. }
  496. /*
  497. * Re-initialize state for a BRIN index scan
  498. */
  499. void
  500. brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
  501. ScanKey orderbys, int norderbys)
  502. {
  503. /*
  504. * Other index AMs preprocess the scan keys at this point, or sometime
  505. * early during the scan; this lets them optimize by removing redundant
  506. * keys, or doing early returns when they are impossible to satisfy; see
  507. * _bt_preprocess_keys for an example. Something like that could be added
  508. * here someday, too.
  509. */
  510. if (scankey && scan->numberOfKeys > 0)
  511. memmove(scan->keyData, scankey,
  512. scan->numberOfKeys * sizeof(ScanKeyData));
  513. }
  514. /*
  515. * Close down a BRIN index scan
  516. */
  517. void
  518. brinendscan(IndexScanDesc scan)
  519. {
  520. BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
  521. brinRevmapTerminate(opaque->bo_rmAccess);
  522. brin_free_desc(opaque->bo_bdesc);
  523. pfree(opaque);
  524. }
  525. /*
  526. * Per-heap-tuple callback for IndexBuildHeapScan.
  527. *
  528. * Note we don't worry about the page range at the end of the table here; it is
  529. * present in the build state struct after we're called the last time, but not
  530. * inserted into the index. Caller must ensure to do so, if appropriate.
  531. */
  532. static void
  533. brinbuildCallback(Relation index,
  534. HeapTuple htup,
  535. Datum *values,
  536. bool *isnull,
  537. bool tupleIsAlive,
  538. void *brstate)
  539. {
  540. BrinBuildState *state = (BrinBuildState *) brstate;
  541. BlockNumber thisblock;
  542. int i;
  543. thisblock = ItemPointerGetBlockNumber(&htup->t_self);
  544. /*
  545. * If we're in a block that belongs to a future range, summarize what
  546. * we've got and start afresh. Note the scan might have skipped many
  547. * pages, if they were devoid of live tuples; make sure to insert index
  548. * tuples for those too.
  549. */
  550. while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
  551. {
  552. BRIN_elog((DEBUG2,
  553. "brinbuildCallback: completed a range: %u--%u",
  554. state->bs_currRangeStart,
  555. state->bs_currRangeStart + state->bs_pagesPerRange));
  556. /* create the index tuple and insert it */
  557. form_and_insert_tuple(state);
  558. /* set state to correspond to the next range */
  559. state->bs_currRangeStart += state->bs_pagesPerRange;
  560. /* re-initialize state for it */
  561. brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
  562. }
  563. /* Accumulate the current tuple into the running state */
  564. for (i = 0; i < state->bs_bdesc->bd_tupdesc->natts; i++)
  565. {
  566. FmgrInfo *addValue;
  567. BrinValues *col;
  568. Form_pg_attribute attr = TupleDescAttr(state->bs_bdesc->bd_tupdesc, i);
  569. col = &state->bs_dtuple->bt_columns[i];
  570. addValue = index_getprocinfo(index, i + 1,
  571. BRIN_PROCNUM_ADDVALUE);
  572. /*
  573. * Update dtuple state, if and as necessary.
  574. */
  575. FunctionCall4Coll(addValue,
  576. attr->attcollation,
  577. PointerGetDatum(state->bs_bdesc),
  578. PointerGetDatum(col),
  579. values[i], isnull[i]);
  580. }
  581. }
  582. /*
  583. * brinbuild() -- build a new BRIN index.
  584. */
  585. IndexBuildResult *
  586. brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
  587. {
  588. IndexBuildResult *result;
  589. double reltuples;
  590. double idxtuples;
  591. BrinRevmap *revmap;
  592. BrinBuildState *state;
  593. Buffer meta;
  594. BlockNumber pagesPerRange;
  595. /*
  596. * We expect to be called exactly once for any index relation.
  597. */
  598. if (RelationGetNumberOfBlocks(index) != 0)
  599. elog(ERROR, "index \"%s\" already contains data",
  600. RelationGetRelationName(index));
  601. /*
  602. * Critical section not required, because on error the creation of the
  603. * whole relation will be rolled back.
  604. */
  605. meta = ReadBuffer(index, P_NEW);
  606. Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);
  607. LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE);
  608. brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
  609. BRIN_CURRENT_VERSION);
  610. MarkBufferDirty(meta);
  611. if (RelationNeedsWAL(index))
  612. {
  613. xl_brin_createidx xlrec;
  614. XLogRecPtr recptr;
  615. Page page;
  616. xlrec.version = BRIN_CURRENT_VERSION;
  617. xlrec.pagesPerRange = BrinGetPagesPerRange(index);
  618. XLogBeginInsert();
  619. XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx);
  620. XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD);
  621. recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX);
  622. page = BufferGetPage(meta);
  623. PageSetLSN(page, recptr);
  624. }
  625. UnlockReleaseBuffer(meta);
  626. /*
  627. * Initialize our state, including the deformed tuple state.
  628. */
  629. revmap = brinRevmapInitialize(index, &pagesPerRange, NULL);
  630. state = initialize_brin_buildstate(index, revmap, pagesPerRange);
  631. /*
  632. * Now scan the relation. No syncscan allowed here because we want the
  633. * heap blocks in physical order.
  634. */
  635. reltuples = IndexBuildHeapScan(heap, index, indexInfo, false,
  636. brinbuildCallback, (void *) state, NULL);
  637. /* process the final batch */
  638. form_and_insert_tuple(state);
  639. /* release resources */
  640. idxtuples = state->bs_numtuples;
  641. brinRevmapTerminate(state->bs_rmAccess);
  642. terminate_brin_buildstate(state);
  643. /*
  644. * Return statistics
  645. */
  646. result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
  647. result->heap_tuples = reltuples;
  648. result->index_tuples = idxtuples;
  649. return result;
  650. }
  651. void
  652. brinbuildempty(Relation index)
  653. {
  654. Buffer metabuf;
  655. /* An empty BRIN index has a metapage only. */
  656. metabuf =
  657. ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
  658. LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
  659. /* Initialize and xlog metabuffer. */
  660. START_CRIT_SECTION();
  661. brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),
  662. BRIN_CURRENT_VERSION);
  663. MarkBufferDirty(metabuf);
  664. log_newpage_buffer(metabuf, true);
  665. END_CRIT_SECTION();
  666. UnlockReleaseBuffer(metabuf);
  667. }
  668. /*
  669. * brinbulkdelete
  670. * Since there are no per-heap-tuple index tuples in BRIN indexes,
  671. * there's not a lot we can do here.
  672. *
  673. * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
  674. * tuple is deleted), meaning the need to re-run summarization on the affected
  675. * range. Would need to add an extra flag in brintuples for that.
  676. */
  677. IndexBulkDeleteResult *
  678. brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
  679. IndexBulkDeleteCallback callback, void *callback_state)
  680. {
  681. /* allocate stats if first time through, else re-use existing struct */
  682. if (stats == NULL)
  683. stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
  684. return stats;
  685. }
  686. /*
  687. * This routine is in charge of "vacuuming" a BRIN index: we just summarize
  688. * ranges that are currently unsummarized.
  689. */
  690. IndexBulkDeleteResult *
  691. brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
  692. {
  693. Relation heapRel;
  694. /* No-op in ANALYZE ONLY mode */
  695. if (info->analyze_only)
  696. return stats;
  697. if (!stats)
  698. stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
  699. stats->num_pages = RelationGetNumberOfBlocks(info->index);
  700. /* rest of stats is initialized by zeroing */
  701. heapRel = heap_open(IndexGetRelation(RelationGetRelid(info->index), false),
  702. AccessShareLock);
  703. brin_vacuum_scan(info->index, info->strategy);
  704. brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false,
  705. &stats->num_index_tuples, &stats->num_index_tuples);
  706. heap_close(heapRel, AccessShareLock);
  707. return stats;
  708. }
  709. /*
  710. * reloptions processor for BRIN indexes
  711. */
  712. bytea *
  713. brinoptions(Datum reloptions, bool validate)
  714. {
  715. relopt_value *options;
  716. BrinOptions *rdopts;
  717. int numoptions;
  718. static const relopt_parse_elt tab[] = {
  719. {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},
  720. {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}
  721. };
  722. options = parseRelOptions(reloptions, validate, RELOPT_KIND_BRIN,
  723. &numoptions);
  724. /* if none set, we're done */
  725. if (numoptions == 0)
  726. return NULL;
  727. rdopts = allocateReloptStruct(sizeof(BrinOptions), options, numoptions);
  728. fillRelOptions((void *) rdopts, sizeof(BrinOptions), options, numoptions,
  729. validate, tab, lengthof(tab));
  730. pfree(options);
  731. return (bytea *) rdopts;
  732. }
  733. /*
  734. * SQL-callable function to scan through an index and summarize all ranges
  735. * that are not currently summarized.
  736. */
  737. Datum
  738. brin_summarize_new_values(PG_FUNCTION_ARGS)
  739. {
  740. Datum relation = PG_GETARG_DATUM(0);
  741. return DirectFunctionCall2(brin_summarize_range,
  742. relation,
  743. Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES));
  744. }
/*
 * SQL-callable function to summarize the indicated page range, if not already
 * summarized.  If the second argument is BRIN_ALL_BLOCKRANGES, all
 * unsummarized ranges are summarized.
 *
 * Returns (as int32) the number of page ranges that were newly summarized.
 */
Datum
brin_summarize_range(PG_FUNCTION_ARGS)
{
	Oid			indexoid = PG_GETARG_OID(0);
	int64		heapBlk64 = PG_GETARG_INT64(1);
	BlockNumber heapBlk;
	Oid			heapoid;
	Relation	indexRel;
	Relation	heapRel;
	double		numSummarized = 0;

	/* Summarization modifies the index, so it cannot run on a standby */
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("BRIN control functions cannot be executed during recovery.")));

	/*
	 * Validate the int64 argument before narrowing it to BlockNumber; note
	 * that BRIN_ALL_BLOCKRANGES itself is an accepted (sentinel) value.
	 */
	if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0)
	{
		char	   *blk = psprintf(INT64_FORMAT, heapBlk64);

		ereport(ERROR,
				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
				 errmsg("block number out of range: %s", blk)));
	}
	heapBlk = (BlockNumber) heapBlk64;

	/*
	 * We must lock table before index to avoid deadlocks.  However, if the
	 * passed indexoid isn't an index then IndexGetRelation() will fail.
	 * Rather than emitting a not-very-helpful error message, postpone
	 * complaining, expecting that the is-it-an-index test below will fail.
	 */
	heapoid = IndexGetRelation(indexoid, true);
	if (OidIsValid(heapoid))
		heapRel = heap_open(heapoid, ShareUpdateExclusiveLock);
	else
		heapRel = NULL;

	indexRel = index_open(indexoid, ShareUpdateExclusiveLock);

	/* Must be a BRIN index */
	if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
		indexRel->rd_rel->relam != BRIN_AM_OID)
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("\"%s\" is not a BRIN index",
						RelationGetRelationName(indexRel))));

	/* User must own the index (comparable to privileges needed for VACUUM) */
	if (!pg_class_ownercheck(indexoid, GetUserId()))
		aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
					   RelationGetRelationName(indexRel));

	/*
	 * Since we did the IndexGetRelation call above without any lock, it's
	 * barely possible that a race against an index drop/recreation could have
	 * netted us the wrong table.  Recheck.
	 */
	if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_TABLE),
				 errmsg("could not open parent table of index %s",
						RelationGetRelationName(indexRel))));

	/* OK, do it */
	brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL);

	/* release locks and report how many ranges we summarized */
	relation_close(indexRel, ShareUpdateExclusiveLock);
	relation_close(heapRel, ShareUpdateExclusiveLock);

	PG_RETURN_INT32((int32) numSummarized);
}
/*
 * SQL-callable interface to mark a range as no longer summarized
 *
 * Removes the revmap entry (and thereby the summary tuple) for the page
 * range containing the given heap block, if one exists.  Returns void.
 */
Datum
brin_desummarize_range(PG_FUNCTION_ARGS)
{
	Oid			indexoid = PG_GETARG_OID(0);
	int64		heapBlk64 = PG_GETARG_INT64(1);
	BlockNumber heapBlk;
	Oid			heapoid;
	Relation	heapRel;
	Relation	indexRel;
	bool		done;

	/* Desummarization modifies the index, so it cannot run on a standby */
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("BRIN control functions cannot be executed during recovery.")));

	/*
	 * Validate the int64 argument before narrowing it to BlockNumber; unlike
	 * brin_summarize_range(), no sentinel is accepted here, so anything
	 * beyond MaxBlockNumber is rejected.
	 */
	if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0)
	{
		char	   *blk = psprintf(INT64_FORMAT, heapBlk64);

		ereport(ERROR,
				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
				 errmsg("block number out of range: %s", blk)));
	}
	heapBlk = (BlockNumber) heapBlk64;

	/*
	 * We must lock table before index to avoid deadlocks.  However, if the
	 * passed indexoid isn't an index then IndexGetRelation() will fail.
	 * Rather than emitting a not-very-helpful error message, postpone
	 * complaining, expecting that the is-it-an-index test below will fail.
	 */
	heapoid = IndexGetRelation(indexoid, true);
	if (OidIsValid(heapoid))
		heapRel = heap_open(heapoid, ShareUpdateExclusiveLock);
	else
		heapRel = NULL;

	indexRel = index_open(indexoid, ShareUpdateExclusiveLock);

	/* Must be a BRIN index */
	if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
		indexRel->rd_rel->relam != BRIN_AM_OID)
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("\"%s\" is not a BRIN index",
						RelationGetRelationName(indexRel))));

	/* User must own the index (comparable to privileges needed for VACUUM) */
	if (!pg_class_ownercheck(indexoid, GetUserId()))
		aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
					   RelationGetRelationName(indexRel));

	/*
	 * Since we did the IndexGetRelation call above without any lock, it's
	 * barely possible that a race against an index drop/recreation could have
	 * netted us the wrong table.  Recheck.
	 */
	if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_TABLE),
				 errmsg("could not open parent table of index %s",
						RelationGetRelationName(indexRel))));

	/*
	 * the revmap does the hard work; retry until it reports the operation
	 * complete (it may return false and require another pass)
	 */
	do
	{
		done = brinRevmapDesummarizeRange(indexRel, heapBlk);
	}
	while (!done);

	relation_close(indexRel, ShareUpdateExclusiveLock);
	relation_close(heapRel, ShareUpdateExclusiveLock);

	PG_RETURN_VOID();
}
  881. /*
  882. * Build a BrinDesc used to create or scan a BRIN index
  883. */
  884. BrinDesc *
  885. brin_build_desc(Relation rel)
  886. {
  887. BrinOpcInfo **opcinfo;
  888. BrinDesc *bdesc;
  889. TupleDesc tupdesc;
  890. int totalstored = 0;
  891. int keyno;
  892. long totalsize;
  893. MemoryContext cxt;
  894. MemoryContext oldcxt;
  895. cxt = AllocSetContextCreate(CurrentMemoryContext,
  896. "brin desc cxt",
  897. ALLOCSET_SMALL_SIZES);
  898. oldcxt = MemoryContextSwitchTo(cxt);
  899. tupdesc = RelationGetDescr(rel);
  900. /*
  901. * Obtain BrinOpcInfo for each indexed column. While at it, accumulate
  902. * the number of columns stored, since the number is opclass-defined.
  903. */
  904. opcinfo = (BrinOpcInfo **) palloc(sizeof(BrinOpcInfo *) * tupdesc->natts);
  905. for (keyno = 0; keyno < tupdesc->natts; keyno++)
  906. {
  907. FmgrInfo *opcInfoFn;
  908. Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno);
  909. opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO);
  910. opcinfo[keyno] = (BrinOpcInfo *)
  911. DatumGetPointer(FunctionCall1(opcInfoFn, attr->atttypid));
  912. totalstored += opcinfo[keyno]->oi_nstored;
  913. }
  914. /* Allocate our result struct and fill it in */
  915. totalsize = offsetof(BrinDesc, bd_info) +
  916. sizeof(BrinOpcInfo *) * tupdesc->natts;
  917. bdesc = palloc(totalsize);
  918. bdesc->bd_context = cxt;
  919. bdesc->bd_index = rel;
  920. bdesc->bd_tupdesc = tupdesc;
  921. bdesc->bd_disktdesc = NULL; /* generated lazily */
  922. bdesc->bd_totalstored = totalstored;
  923. for (keyno = 0; keyno < tupdesc->natts; keyno++)
  924. bdesc->bd_info[keyno] = opcinfo[keyno];
  925. pfree(opcinfo);
  926. MemoryContextSwitchTo(oldcxt);
  927. return bdesc;
  928. }
  929. void
  930. brin_free_desc(BrinDesc *bdesc)
  931. {
  932. /* make sure the tupdesc is still valid */
  933. Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
  934. /* no need for retail pfree */
  935. MemoryContextDelete(bdesc->bd_context);
  936. }
  937. /*
  938. * Fetch index's statistical data into *stats
  939. */
  940. void
  941. brinGetStats(Relation index, BrinStatsData *stats)
  942. {
  943. Buffer metabuffer;
  944. Page metapage;
  945. BrinMetaPageData *metadata;
  946. metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
  947. LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
  948. metapage = BufferGetPage(metabuffer);
  949. metadata = (BrinMetaPageData *) PageGetContents(metapage);
  950. stats->pagesPerRange = metadata->pagesPerRange;
  951. stats->revmapNumPages = metadata->lastRevmapPage - 1;
  952. UnlockReleaseBuffer(metabuffer);
  953. }
  954. /*
  955. * Initialize a BrinBuildState appropriate to create tuples on the given index.
  956. */
  957. static BrinBuildState *
  958. initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
  959. BlockNumber pagesPerRange)
  960. {
  961. BrinBuildState *state;
  962. state = palloc(sizeof(BrinBuildState));
  963. state->bs_irel = idxRel;
  964. state->bs_numtuples = 0;
  965. state->bs_currentInsertBuf = InvalidBuffer;
  966. state->bs_pagesPerRange = pagesPerRange;
  967. state->bs_currRangeStart = 0;
  968. state->bs_rmAccess = revmap;
  969. state->bs_bdesc = brin_build_desc(idxRel);
  970. state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
  971. brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
  972. return state;
  973. }
  974. /*
  975. * Release resources associated with a BrinBuildState.
  976. */
  977. static void
  978. terminate_brin_buildstate(BrinBuildState *state)
  979. {
  980. /*
  981. * Release the last index buffer used. We might as well ensure that
  982. * whatever free space remains in that page is available in FSM, too.
  983. */
  984. if (!BufferIsInvalid(state->bs_currentInsertBuf))
  985. {
  986. Page page;
  987. Size freespace;
  988. BlockNumber blk;
  989. page = BufferGetPage(state->bs_currentInsertBuf);
  990. freespace = PageGetFreeSpace(page);
  991. blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
  992. ReleaseBuffer(state->bs_currentInsertBuf);
  993. RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
  994. FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
  995. }
  996. brin_free_desc(state->bs_bdesc);
  997. pfree(state->bs_dtuple);
  998. pfree(state);
  999. }
/*
 * On the given BRIN index, summarize the heap page range that corresponds
 * to the heap block number given.
 *
 * This routine can run in parallel with insertions into the heap.  To avoid
 * missing those values from the summary tuple, we first insert a placeholder
 * index tuple into the index, then execute the heap scan; transactions
 * concurrent with the scan update the placeholder tuple.  After the scan, we
 * union the placeholder tuple with the one computed by this routine.  The
 * update of the index value happens in a loop, so that if somebody updates
 * the placeholder tuple after we read it, we detect the case and try again.
 * This ensures that the concurrently inserted tuples are not lost.
 *
 * A further corner case is this routine being asked to summarize the partial
 * range at the end of the table.  heapNumBlks is the (possibly outdated)
 * table size; if we notice that the requested range lies beyond that size,
 * we re-compute the table size after inserting the placeholder tuple, to
 * avoid missing pages that were appended recently.
 */
static void
summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
				BlockNumber heapBlk, BlockNumber heapNumBlks)
{
	Buffer		phbuf;
	BrinTuple  *phtup;
	Size		phsz;
	OffsetNumber offset;
	BlockNumber scanNumBlks;

	/*
	 * Insert the placeholder tuple
	 */
	phbuf = InvalidBuffer;
	phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
	offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
						   state->bs_rmAccess, &phbuf,
						   heapBlk, phtup, phsz);

	/*
	 * Compute range end.  We hold ShareUpdateExclusive lock on table, so it
	 * cannot shrink concurrently (but it can grow).
	 */
	Assert(heapBlk % state->bs_pagesPerRange == 0);
	if (heapBlk + state->bs_pagesPerRange > heapNumBlks)
	{
		/*
		 * If we're asked to scan what we believe to be the final range on the
		 * table (i.e. a range that might be partial) we need to recompute our
		 * idea of what the latest page is after inserting the placeholder
		 * tuple.  Anyone that grows the table later will update the
		 * placeholder tuple, so it doesn't matter that we won't scan these
		 * pages ourselves.  Careful: the table might have been extended
		 * beyond the current range, so clamp our result.
		 *
		 * Fortunately, this should occur infrequently.
		 */
		scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk,
						  state->bs_pagesPerRange);
	}
	else
	{
		/* Easy case: range is known to be complete */
		scanNumBlks = state->bs_pagesPerRange;
	}

	/*
	 * Execute the partial heap scan covering the heap blocks in the specified
	 * page range, summarizing the heap tuples in it.  This scan stops just
	 * short of brinbuildCallback creating the new index entry.
	 *
	 * Note that it is critical we use the "any visible" mode of
	 * IndexBuildHeapRangeScan here: otherwise, we would miss tuples inserted
	 * by transactions that are still in progress, among other corner cases.
	 */
	state->bs_currRangeStart = heapBlk;
	IndexBuildHeapRangeScan(heapRel, state->bs_irel, indexInfo, false, true,
							heapBlk, scanNumBlks,
							brinbuildCallback, (void *) state, NULL);

	/*
	 * Now we update the values obtained by the scan with the placeholder
	 * tuple.  We do this in a loop which only terminates if we're able to
	 * update the placeholder tuple successfully; if we are not, this means
	 * somebody else modified the placeholder tuple after we read it.
	 */
	for (;;)
	{
		BrinTuple  *newtup;
		Size		newsize;
		bool		didupdate;
		bool		samepage;

		CHECK_FOR_INTERRUPTS();

		/*
		 * Form a new on-disk tuple from the in-memory summary and try to
		 * replace the placeholder with it.
		 */
		newtup = brin_form_tuple(state->bs_bdesc,
								 heapBlk, state->bs_dtuple, &newsize);
		samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
		didupdate =
			brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
						  state->bs_rmAccess, heapBlk, phbuf, offset,
						  phtup, phsz, newtup, newsize, samepage);
		brin_free_tuple(phtup);
		brin_free_tuple(newtup);

		/* If the update succeeded, we're done. */
		if (didupdate)
			break;

		/*
		 * If the update didn't work, it might be because somebody updated the
		 * placeholder tuple concurrently.  Extract the new version, union it
		 * with the values we have from the scan, and start over.  (There are
		 * other reasons for the update to fail, but it's simple to treat them
		 * the same.)
		 */
		phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
										 &offset, &phsz, BUFFER_LOCK_SHARE,
										 NULL);
		/* the placeholder tuple must exist */
		if (phtup == NULL)
			elog(ERROR, "missing placeholder tuple");
		/* copy it before releasing the buffer lock, so we can read it freely */
		phtup = brin_copy_tuple(phtup, phsz, NULL, NULL);
		LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);

		/* merge it into the tuple from the heap scan */
		union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
	}

	/* drop the pin on the page holding the (now updated) summary tuple */
	ReleaseBuffer(phbuf);
}
/*
 * Summarize page ranges that are not already summarized.  If pageRange is
 * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the
 * page range containing the given heap page number is scanned.
 * If include_partial is true, then the partial range at the end of the table
 * is summarized, otherwise not.
 *
 * For each new index tuple inserted, *numSummarized (if not NULL) is
 * incremented; for each existing tuple, *numExisting (if not NULL) is
 * incremented.
 */
static void
brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
			  bool include_partial, double *numSummarized, double *numExisting)
{
	BrinRevmap *revmap;
	BrinBuildState *state = NULL;	/* created lazily, on first unsummarized range */
	IndexInfo  *indexInfo = NULL;	/* ditto */
	BlockNumber heapNumBlocks;
	BlockNumber pagesPerRange;
	Buffer		buf;
	BlockNumber startBlk;

	revmap = brinRevmapInitialize(index, &pagesPerRange, NULL);

	/* determine range of pages to process */
	heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
	if (pageRange == BRIN_ALL_BLOCKRANGES)
		startBlk = 0;
	else
	{
		/* round down to the start of the range containing pageRange */
		startBlk = (pageRange / pagesPerRange) * pagesPerRange;
		heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange);
	}
	if (startBlk > heapNumBlocks)
	{
		/* Nothing to do if start point is beyond end of table */
		brinRevmapTerminate(revmap);
		return;
	}

	/*
	 * Scan the revmap to find unsummarized items.
	 */
	buf = InvalidBuffer;
	for (; startBlk < heapNumBlocks; startBlk += pagesPerRange)
	{
		BrinTuple  *tup;
		OffsetNumber off;

		/*
		 * Unless requested to summarize even a partial range, go away now if
		 * we think the next range is partial.  Caller would pass true when it
		 * is typically run once bulk data loading is done
		 * (brin_summarize_new_values), and false when it is typically the
		 * result of arbitrarily-scheduled maintenance command (vacuuming).
		 */
		if (!include_partial &&
			(startBlk + pagesPerRange > heapNumBlocks))
			break;

		CHECK_FOR_INTERRUPTS();

		/* returns the tuple with its buffer share-locked, or NULL if absent */
		tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL,
									   BUFFER_LOCK_SHARE, NULL);
		if (tup == NULL)
		{
			/* no revmap entry for this heap range. Summarize it. */
			if (state == NULL)
			{
				/* first time through */
				Assert(!indexInfo);
				state = initialize_brin_buildstate(index, revmap,
												   pagesPerRange);
				indexInfo = BuildIndexInfo(index);
			}
			summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks);

			/* and re-initialize state for the next range */
			brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);

			if (numSummarized)
				*numSummarized += 1.0;
		}
		else
		{
			/* range already summarized; just count it and drop the lock */
			if (numExisting)
				*numExisting += 1.0;
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		}
	}

	/* release the pin still held from brinGetTupleForHeapBlock, if any */
	if (BufferIsValid(buf))
		ReleaseBuffer(buf);

	/* free resources */
	brinRevmapTerminate(revmap);
	if (state)
	{
		terminate_brin_buildstate(state);
		pfree(indexInfo);
	}
}
  1216. /*
  1217. * Given a deformed tuple in the build state, convert it into the on-disk
  1218. * format and insert it into the index, making the revmap point to it.
  1219. */
  1220. static void
  1221. form_and_insert_tuple(BrinBuildState *state)
  1222. {
  1223. BrinTuple *tup;
  1224. Size size;
  1225. tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
  1226. state->bs_dtuple, &size);
  1227. brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
  1228. &state->bs_currentInsertBuf, state->bs_currRangeStart,
  1229. tup, size);
  1230. state->bs_numtuples++;
  1231. pfree(tup);
  1232. }
  1233. /*
  1234. * Given two deformed tuples, adjust the first one so that it's consistent
  1235. * with the summary values in both.
  1236. */
  1237. static void
  1238. union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
  1239. {
  1240. int keyno;
  1241. BrinMemTuple *db;
  1242. MemoryContext cxt;
  1243. MemoryContext oldcxt;
  1244. /* Use our own memory context to avoid retail pfree */
  1245. cxt = AllocSetContextCreate(CurrentMemoryContext,
  1246. "brin union",
  1247. ALLOCSET_DEFAULT_SIZES);
  1248. oldcxt = MemoryContextSwitchTo(cxt);
  1249. db = brin_deform_tuple(bdesc, b, NULL);
  1250. MemoryContextSwitchTo(oldcxt);
  1251. for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
  1252. {
  1253. FmgrInfo *unionFn;
  1254. BrinValues *col_a = &a->bt_columns[keyno];
  1255. BrinValues *col_b = &db->bt_columns[keyno];
  1256. unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,
  1257. BRIN_PROCNUM_UNION);
  1258. FunctionCall3Coll(unionFn,
  1259. bdesc->bd_index->rd_indcollation[keyno],
  1260. PointerGetDatum(bdesc),
  1261. PointerGetDatum(col_a),
  1262. PointerGetDatum(col_b));
  1263. }
  1264. MemoryContextDelete(cxt);
  1265. }
  1266. /*
  1267. * brin_vacuum_scan
  1268. * Do a complete scan of the index during VACUUM.
  1269. *
  1270. * This routine scans the complete index looking for uncatalogued index pages,
  1271. * i.e. those that might have been lost due to a crash after index extension
  1272. * and such.
  1273. */
  1274. static void
  1275. brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
  1276. {
  1277. BlockNumber nblocks;
  1278. BlockNumber blkno;
  1279. /*
  1280. * Scan the index in physical order, and clean up any possible mess in
  1281. * each page.
  1282. */
  1283. nblocks = RelationGetNumberOfBlocks(idxrel);
  1284. for (blkno = 0; blkno < nblocks; blkno++)
  1285. {
  1286. Buffer buf;
  1287. CHECK_FOR_INTERRUPTS();
  1288. buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno,
  1289. RBM_NORMAL, strategy);
  1290. brin_page_cleanup(idxrel, buf);
  1291. ReleaseBuffer(buf);
  1292. }
  1293. /*
  1294. * Update all upper pages in the index's FSM, as well. This ensures not
  1295. * only that we propagate leaf-page FSM updates made by brin_page_cleanup,
  1296. * but also that any pre-existing damage or out-of-dateness is repaired.
  1297. */
  1298. FreeSpaceMapVacuum(idxrel);
  1299. }