#define MaxFSMRequestSize MaxHeapTupleSize
#define MaxHeapTupleSize (BLCKSZ - MAXALIGN(SizeOfPageHeaderData + sizeof(ItemIdData)))
#define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES)
#define FSM_CATEGORIES 256
//塊大小為8K則FSM_CAT_STEP = 32
#define SlotsPerFSMPage LeafNodesPerPage
#define LeafNodesPerPage (NodesPerPage - NonLeafNodesPerPage) = 8156 - 4095 = 4061
#define NodesPerPage (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \offsetof(FSMPageData, fp_nodes)) = 8192 - 32 - 4 = 8156
#define NonLeafNodesPerPage (BLCKSZ / 2 - 1) = 4095
/** Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,* and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,* 216 is the smallest number that satisfies X^4 >= 2^32-1. In practice,* this means that 4096 bytes is the smallest BLCKSZ that we can get away* with a 3-level tree, and 512 is the smallest we support.* 存儲在磁盤上的樹深度.* 我們需要為2^32 - 1個塊定位尋址,1626是可以滿足X^3 >= 2^32 - 1的最小數字.* 另外,216是可以滿足X^4 >= 2^32 - 1的最小數字.* 在實踐中,這意味著4096字節是三層數可以支撐的最小BLCKSZ,512是最小可支持的.*/
#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
/** The internal FSM routines work on a logical addressing scheme. Each* level of the tree can be thought of as a separately addressable file.* 內部的FSM處理過程工作在一個邏輯地址scheme上.* 樹的每一個層次都可以認為是一個獨立的地址文件.*/
typedef struct
{//層次int level; /* level *///該層次內的頁編號int logpageno; /* page number within the level */
} FSMAddress;
/* Address of the root page. */
//根頁地址
static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
/** Structure of a FSM page. See src/backend/storage/freespace/README for* details.* FSM page數據結構.詳細可參看src/backend/storage/freespace/README.*/
typedef struct
{/** fsm_search_avail() tries to spread the load of multiple backends by* returning different pages to different backends in a round-robin* fashion. fp_next_slot points to the next slot to be returned (assuming* there's enough space on it for the request). It's defined as an int,* because it's updated without an exclusive lock. uint16 would be more* appropriate, but int is more likely to be atomically* fetchable/storable.* fsm_search_avail()函數嘗試通過在一輪循環中返回不同的頁面到不同的后臺進程,* 從而分散在后臺進程上分散負載.* 該字段因為無需獨占鎖,因此定義為整型.* unit16可能會更合適,但整型看起來更適合于原子提取和存儲.*/int fp_next_slot;/** fp_nodes contains the binary tree, stored in array. The first* NonLeafNodesPerPage elements are upper nodes, and the following* LeafNodesPerPage elements are leaf nodes. Unused nodes are zero.* fp_nodes以數組的形式存儲二叉樹.* 第一個NonLeafNodesPerPage元素是上一層的節點,接下來的LeafNodesPerPage元素是葉子節點.* 未使用的節點為0.*/uint8 fp_nodes[FLEXIBLE_ARRAY_MEMBER];
} FSMPageData;
typedef FSMPageData *FSMPage;
FSMLocalMap 對于小表,不需要創建FSM來存儲空間信息,使用本地的內存映射信息.
/* Either already tried, or beyond the end of the relation */
//已嘗試或者已在表的末尾之后
#define FSM_LOCAL_NOT_AVAIL 0x00
/* Available to try */
//可用于嘗試
#define FSM_LOCAL_AVAIL 0x01
/** For small relations, we don't create FSM to save space, instead we use* local in-memory map of pages to try. To locate free space, we simply try* pages directly without knowing ahead of time how much free space they have.* 對于小表,不需要創建FSM來存儲空間信息,使用本地的內存映射信息.* 為了定位空閑空間,我們不需要知道他們有多少空閑空間而是直接簡單的對page進行嘗試.** Note that this map is used to the find the block with required free space* for any given relation. We clear this map when we have found a block with* enough free space, when we extend the relation, or on transaction abort.* See src/backend/storage/freespace/README for further details.* 注意這個map用于搜索給定表的請求空閑空間.* 在找到有足夠空閑空間的block/擴展了relation/在事務回滾時,則清除這個map的信息.* 詳細可查看src/backend/storage/freespace/README.*/
typedef struct
{BlockNumber nblocks;//塊數uint8 map[HEAP_FSM_CREATION_THRESHOLD];//數組
} FSMLocalMap;
static FSMLocalMap fsm_local_map =
{0,{FSM_LOCAL_NOT_AVAIL}
};
#define FSM_LOCAL_MAP_EXISTS (fsm_local_map.nblocks > 0)
通用例程 包括獲取左子節點/右子節點/父節點等
/* Macros to navigate the tree within a page. Root has index zero. */
//在page中遍歷樹.Root編號為0
#define leftchild(x) (2 * (x) + 1)
#define rightchild(x) (2 * (x) + 2)
#define parentof(x) (((x) - 1) / 2)
/** Find right neighbor of x, wrapping around within the level* 搜索x右邊的鄰居,如需要在同一個層次上需回卷*/
static int
rightneighbor(int x)
{/** Move right. This might wrap around, stepping to the leftmost node at* the next level.* 移到右邊.這可能會引起回卷,調到下一個層次最左邊的節點上.*/x++;/** Check if we stepped to the leftmost node at next level, and correct if* so. The leftmost nodes at each level are numbered x = 2^level - 1, so* check if (x + 1) is a power of two, using a standard* twos-complement-arithmetic trick.* 檢查是否跳轉到下一個層次最左邊的節點上,如是則修正x.* 每一個層次上最左邊的節點編號為x = 2^level - 1,* 因此檢查(x+1)是否為2的冪,使用標準的twos-complement-arithmetic技巧即可.*/if (((x + 1) & x) == 0)//有符號整型,全1為0x = parentof(x);return x;
}
/** Returns the physical block number of a FSM page* 返回FSM page的物理塊號*/
/*
計算公式:
To find the physical block # corresponding to leaf page n, we need to
count the number of leaf and upper-level pages preceding page n.
This turns out to be
y = n + (n / F + 1) + (n / F^2 + 1) + ... + 1
where F is the fanout . The first term n is the number
of preceding leaf pages, the second term is the number of pages at level 1,
and so forth.
*/
static BlockNumber
fsm_logical_to_physical(FSMAddress addr)
{BlockNumber pages;//塊號int leafno;//頁號int l;//臨時變量/** Calculate the logical page number of the first leaf page below the* given page.* 在給定的page下,計算第一個葉子頁面的邏輯頁號*/leafno = addr.logpageno;for (l = 0; l < addr.level; l++)leafno *= SlotsPerFSMPage;/* Count upper level nodes required to address the leaf page *///統計用于定位葉子頁面的上層節點數pages = 0;for (l = 0; l < FSM_TREE_DEPTH; l++){pages += leafno + 1;leafno /= SlotsPerFSMPage;}/** If the page we were asked for wasn't at the bottom level, subtract the* additional lower level pages we counted above.* 如果請求的頁面不在底層,減去上面技術的額外的底層頁面數.*/pages -= addr.level;/* Turn the page count into 0-based block number *///計數從0開始(減一)return pages - 1;
}/** Return the FSM location corresponding to given heap block.* 返回給定堆block的FSM位置.*/
//addr = fsm_get_location(oldPage, &slot);
static FSMAddress
fsm_get_location(BlockNumber heapblk, uint16 *slot)
{FSMAddress addr;addr.level = FSM_BOTTOM_LEVEL;//#define SlotsPerFSMPage LeafNodesPerPage//#define LeafNodesPerPage (NodesPerPage - NonLeafNodesPerPage) = 8156 - 4095 = 4061//#define NodesPerPage (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \offsetof(FSMPageData, fp_nodes)) = 8192 - 32 - 4 = 8156//#define NonLeafNodesPerPage (BLCKSZ / 2 - 1) = 4095addr.logpageno = heapblk / SlotsPerFSMPage;*slot = heapblk % SlotsPerFSMPage;return addr;
}
二、源碼解讀
fsm_search函數搜索FSM,找到有足夠空閑空間(min_cat)的堆page.
/** Search the tree for a heap page with at least min_cat of free space* 搜索FSM,找到有足夠空閑空間(min_cat)的堆page*/
//return fsm_search(rel, search_cat);
static BlockNumber
fsm_search(Relation rel, uint8 min_cat)
{int restarts = 0;FSMAddress addr = FSM_ROOT_ADDRESS;for (;;){//--------- 循環int slot;Buffer buf;uint8 max_avail = 0;/* Read the FSM page. *///讀取FSM pagebuf = fsm_readbuf(rel, addr, false);/* Search within the page *///頁內搜索if (BufferIsValid(buf)){LockBuffer(buf, BUFFER_LOCK_SHARE);//搜索可用的slotslot = fsm_search_avail(buf, min_cat,(addr.level == FSM_BOTTOM_LEVEL),false);if (slot == -1)//獲取最大可用空間max_avail = fsm_get_max_avail(BufferGetPage(buf));UnlockReleaseBuffer(buf);}else//buffer無效,則設置為-1slot = -1;if (slot != -1){/** Descend the tree, or return the found block if we're at the* bottom.* 如在樹的底部,則返回找到的塊.*/if (addr.level == FSM_BOTTOM_LEVEL)return fsm_get_heap_blk(addr, slot);addr = fsm_get_child(addr, slot);}else if (addr.level == FSM_ROOT_LEVEL){/** At the root, failure means there's no page with enough free* space in the FSM. Give up.* 處于根節點,失敗意味著FSM中沒有足夠空閑空間的頁面存在,放棄.*/return InvalidBlockNumber;}else{uint16 parentslot;FSMAddress parent;/** At lower level, failure can happen if the value in the upper-* level node didn't reflect the value on the lower page. Update* the upper node, to avoid falling into the same trap again, and* start over.* 在低層上,如果上層節點沒有反映更低層頁面的值則會出現失敗.* 更新高層節點,避免重復掉入同一個陷阱,重新開始.** There's a race condition here, if another backend updates this* page right after we release it, and gets the lock on the parent* page before us. We'll then update the parent page with the now* stale information we had. It's OK, because it should happen* rarely, and will be fixed by the next vacuum.* 在我們釋放后,另外的后臺進程更新這個頁面同時在我們之前鎖定了父節點的話,會存在條件競爭.* 然后我們使用現有已知穩定的信息更新父頁面.* 如OK,因為這種很少出現,那么會在下一個vacuum中修復此問題.*/parent = fsm_get_parent(addr, &parentslot);fsm_set_and_search(rel, parent, parentslot, max_avail, 0);/** If the upper pages are badly out of date, we might need to loop* quite a few times, updating them as we go. Any inconsistencies* should eventually be corrected and the loop should end. Looping* indefinitely is nevertheless scary, so provide an emergency* valve.* 如果上層頁面過舊,可能需要循環很多次,更新上層頁面信息.* 不一致性會被周期性的糾正,循環會停止.* 但無限循環是很可怕的,因此設置閾值,超過此閾值則退出循環.*/if (restarts++ > 10000)return InvalidBlockNumber;/* Start search all over from the root *///從root開始搜索addr = FSM_ROOT_ADDRESS;}}
}
三、跟蹤分析
N/A
四、參考資料
PG Source Code Database Cluster, Databases, and Tables free space map README