/*
 * fs/logfs/logfs_abi.h
 *
 * As should be obvious for Linux kernel code, license is GPLv2
 *
 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
 *
 * Public header for logfs.
 */
#ifndef FS_LOGFS_LOGFS_ABI_H
#define FS_LOGFS_LOGFS_ABI_H

/* For out-of-kernel compiles */
#ifndef BUILD_BUG_ON
#define BUILD_BUG_ON(condition) /**/
#endif

/*
 * SIZE_CHECK - pin the size of struct @type to @size bytes.
 *
 * Expands to a static inline function check_<type>() whose only statement
 * is BUILD_BUG_ON(sizeof(struct type) != size).  With the empty fallback
 * BUILD_BUG_ON above (out-of-kernel compiles) the check is a no-op.
 */
#define SIZE_CHECK(type, size)					\
static inline void check_##type(void)				\
{								\
	BUILD_BUG_ON(sizeof(struct type) != (size));		\
}

/*
 * Throughout the logfs code, we're constantly dealing with blocks at
 * various positions or offsets.  To remove confusion, we strictly
 * distinguish between a "position" - the logical position within a
 * file and an "offset" - the physical location within the device.
 *
 * Any usage of the term offset for a logical location or position for
 * a physical one is a bug and should get fixed.
 */

/*
 * Blocks are allocated in one of several segments depending on their
 * level.  The following levels are used:
 *  0	- regular data block
 *  1	- i1 indirect blocks
 *  2	- i2 indirect blocks
 *  3	- i3 indirect blocks
 *  4	- i4 indirect blocks
 *  5	- i5 indirect blocks
 *  6	- ifile data blocks
 *  7	- ifile i1 indirect blocks
 *  8	- ifile i2 indirect blocks
 *  9	- ifile i3 indirect blocks
 * 10	- ifile i4 indirect blocks
 * 11	- ifile i5 indirect blocks
 * Potential levels to be used in the future:
 * 12	- gc recycled blocks, long-lived data
 * 13	- replacement blocks, short-lived data
 *
 * Levels 1-11 are necessary for robust gc operations and help separate
 * short-lived metadata from longer-lived file data.  In the future,
 * file data should get separated into several segments based on simple
 * heuristics.  Old data recycled during gc operation is expected to be
 * long-lived.  New data is of uncertain life expectancy.  New data
 * used to replace older blocks in existing files is expected to be
 * short-lived.
 */


/* Magic numbers.  64bit for superblock, 32bit for statfs f_type */
#define LOGFS_MAGIC		0x7a3a8e5cb9d5bf67ull
#define LOGFS_MAGIC_U32		0xc97e8168u

/*
 * Various blocksize related macros.  Blocksize is currently fixed at 4KiB.
 * Sooner or later that should become configurable and the macros replaced
 * by something superblock-dependent.  Pointers in indirect blocks are and
 * will remain 64bit.
 *
 * LOGFS_BLOCKSIZE	- self-explaining
 * LOGFS_BLOCK_FACTOR	- number of pointers per indirect block
 * LOGFS_BLOCK_BITS	- log2 of LOGFS_BLOCK_FACTOR, used for shifts
 *
 * The pointer size is taken from __be64, the type block pointers have in
 * the on-medium structures below.
 */
#define LOGFS_BLOCKSIZE		(4096ull)
#define LOGFS_BLOCK_FACTOR	(LOGFS_BLOCKSIZE / sizeof(__be64))
#define LOGFS_BLOCK_BITS	(9)

/*
 * Number of blocks at various levels of indirection.  There are 16 direct
 * block pointers plus a single indirect pointer.
 */
#define I0_BLOCKS		(16)
#define I1_BLOCKS		LOGFS_BLOCK_FACTOR
#define I2_BLOCKS		(LOGFS_BLOCK_FACTOR * I1_BLOCKS)
#define I3_BLOCKS		(LOGFS_BLOCK_FACTOR * I2_BLOCKS)
#define I4_BLOCKS		(LOGFS_BLOCK_FACTOR * I3_BLOCKS)
#define I5_BLOCKS		(LOGFS_BLOCK_FACTOR * I4_BLOCKS)

#define INDIRECT_INDEX		I0_BLOCKS
#define LOGFS_EMBEDDED_FIELDS	(I0_BLOCKS + 1)

/*
 * Sizes at which files require another level of indirection.  Files smaller
 * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
 * similar to ext2 fast symlinks.
 *
 * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
 * direct pointers, else through the 1x indirect pointer and so forth.
 */
#define LOGFS_EMBEDDED_SIZE	(LOGFS_EMBEDDED_FIELDS * sizeof(__be64))
#define LOGFS_I0_SIZE		(I0_BLOCKS * LOGFS_BLOCKSIZE)
#define LOGFS_I1_SIZE		(I1_BLOCKS * LOGFS_BLOCKSIZE)
#define LOGFS_I2_SIZE		(I2_BLOCKS * LOGFS_BLOCKSIZE)
#define LOGFS_I3_SIZE		(I3_BLOCKS * LOGFS_BLOCKSIZE)
#define LOGFS_I4_SIZE		(I4_BLOCKS * LOGFS_BLOCKSIZE)
#define LOGFS_I5_SIZE		(I5_BLOCKS * LOGFS_BLOCKSIZE)

/*
 * Each indirect block pointer must have this flag set, if all block pointers
 * behind it are set, i.e. there is no hole hidden in the shadow of this
 * indirect block pointer.
 *
 * pure_ofs() strips the flag from a pointer value.  The argument is
 * parenthesized so that expressions with low-precedence operators, e.g.
 * pure_ofs(a | b), are masked as a whole.
 */
#define LOGFS_FULLY_POPULATED	(1ULL << 63)
#define pure_ofs(ofs)		((ofs) & ~LOGFS_FULLY_POPULATED)

/*
 * LogFS needs to separate data into levels.  Each level is defined as the
 * maximal possible distance from the master inode (inode of the inode file).
 * Data blocks reside on level 0, 1x indirect block on level 1, etc.
 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
 * This effort is necessary to guarantee garbage collection to always make
 * progress.
 *
 * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
 * LOGFS_MAX_LEVELS is one more for the actual data level of a file.  It is
 * the maximal number of levels for one file.
 * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
 * effectively stacked on top of each other.
 */
#define LOGFS_MAX_INDIRECT	(5)
#define LOGFS_MAX_LEVELS	(LOGFS_MAX_INDIRECT + 1)
#define LOGFS_NO_AREAS		(2 * LOGFS_MAX_LEVELS)

/* Maximum size of filenames */
#define LOGFS_MAX_NAMELEN	(255)

/* Number of segments in the primary journal. */
#define LOGFS_JOURNAL_SEGS	(16)

/* Maximum number of free/erased/etc. segments in journal entries */
#define MAX_CACHED_SEGS		(64)


/*
 * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
 * LOGFS_SEGMENT_HEADERSIZE is the size of the header of each segment,
 * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
 * its header,
 * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
 * its segment header and the padded space at the end when no further objects
 * fit.
 */
#define LOGFS_OBJECT_HEADERSIZE		(0x1c)
#define LOGFS_SEGMENT_HEADERSIZE	(0x18)
#define LOGFS_MAX_OBJECTSIZE	(LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
#define LOGFS_SEGMENT_RESERVE	\
	(LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)

/*
 * Segment types:
 * SEG_SUPER	- superblock segment
 * SEG_JOURNAL	- journal segment
 * SEG_OSTORE	- object store (ostore) segment
 */
enum {
	SEG_SUPER	= 0x01,
	SEG_JOURNAL	= 0x02,
	SEG_OSTORE	= 0x03,
};

/**
 * struct logfs_segment_header - per-segment header in the ostore
 *
 * @crc:			crc32 of header (there is no data)
 * @pad:			unused, must be 0
 * @type:			segment type, see above
 * @level:			GC level for all objects in this segment
 * @segno:			segment number
 * @ec:				erase count for this segment
 * @gec:			global erase count at time of writing
 */
struct logfs_segment_header {
	__be32	crc;
	__be16	pad;
	__u8	type;
	__u8	level;
	__be32	segno;
	__be32	ec;
	__be64	gec;
};

SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);

#define LOGFS_FEATURES_INCOMPAT		(0ull)
#define LOGFS_FEATURES_RO_COMPAT	(0ull)
#define LOGFS_FEATURES_COMPAT		(0ull)

/**
 * struct logfs_disk_super - on-medium superblock
 *
 * @ds_sh:			segment header preceding the superblock proper
 * @ds_magic:			magic number, must equal LOGFS_MAGIC
 * @ds_crc:			crc32 of structure starting with the next field
 * @ds_ifile_levels:		maximum number of levels for ifile
 * @ds_iblock_levels:		maximum number of levels for regular files
 * @ds_data_levels:		number of separate levels for data
 * @ds_segment_shift:		log2 of segment size
 * @ds_block_shift:		log2 of block size
 * @ds_write_shift:		log2 of write size
 * @pad0:			reserved, must be 0
 * @ds_filesystem_size:		size of the filesystem
 *				(NOTE(review): presumably in bytes - confirm)
 * @ds_segment_size:		size of a segment
 *				(NOTE(review): presumably in bytes - confirm)
 * @ds_bad_seg_reserve:		number of segments reserved to handle bad blocks
 * @ds_feature_incompat:	incompatible filesystem features
 * @ds_feature_ro_compat:	read-only compatible filesystem features
 * @ds_feature_compat:		compatible filesystem features
 * @ds_feature_flags:		flags
 * @ds_root_reserve:		bytes reserved for the superuser
 * @ds_speed_reserve:		bytes reserved to speed up GC
 * @ds_journal_seg:		segments used by primary journal
 * @ds_super_ofs:		two superblock offsets
 *				(NOTE(review): presumably the device offsets of
 *				both superblock copies - confirm)
 * @pad3:			reserved, must be 0
 *
 * Contains only read-only fields.  Read-write fields like the amount of used
 * space is tracked in the dynamic superblock, which is stored in the journal.
 */
struct logfs_disk_super {
	struct logfs_segment_header ds_sh;
	__be64	ds_magic;

	__be32	ds_crc;
	__u8	ds_ifile_levels;
	__u8	ds_iblock_levels;
	__u8	ds_data_levels;
	__u8	ds_segment_shift;
	__u8	ds_block_shift;
	__u8	ds_write_shift;
	__u8	pad0[6];

	__be64	ds_filesystem_size;
	__be32	ds_segment_size;
	__be32	ds_bad_seg_reserve;

	__be64	ds_feature_incompat;
	__be64	ds_feature_ro_compat;

	__be64	ds_feature_compat;
	__be64	ds_feature_flags;

	__be64	ds_root_reserve;
	__be64	ds_speed_reserve;

	__be32	ds_journal_seg[LOGFS_JOURNAL_SEGS];

	__be64	ds_super_ofs[2];
	__be64	pad3[8];
};

SIZE_CHECK(logfs_disk_super, 256);

/*
 * Object types:
 * OBJ_BLOCK	- Data or indirect block
 * OBJ_INODE	- Inode
 * OBJ_DENTRY	- Dentry
 */
enum {
	OBJ_BLOCK	= 0x04,
	OBJ_INODE	= 0x05,
	OBJ_DENTRY	= 0x06,
};

/**
 * struct logfs_object_header - per-object header in the ostore
 *
 * @crc:			crc32 of header, excluding data_crc
 * @len:			length of data
 * @type:			object type, see above
 * @compr:			compression type
 * @ino:			inode number
 * @bix:			block index
 * @data_crc:			crc32 of payload
 */
struct logfs_object_header {
	__be32	crc;
	__be16	len;
	__u8	type;
	__u8	compr;
	__be64	ino;
	__be64	bix;
	__be32	data_crc;
} __attribute__((packed));

SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);

/*
 * Reserved inode numbers:
 * LOGFS_INO_MAPPING	- NOTE(review): purpose not visible in this header
 * LOGFS_INO_MASTER	- master inode (for inode file)
 * LOGFS_INO_ROOT	- root directory
 * LOGFS_INO_SEGFILE	- per-segment used bytes and erase count
 * LOGFS_RESERVED_INOS	- presumably the number of reserved inode numbers,
 *			  i.e. the first one available for regular use
 */
enum {
	LOGFS_INO_MAPPING	= 0x00,
	LOGFS_INO_MASTER	= 0x01,
	LOGFS_INO_ROOT		= 0x02,
	LOGFS_INO_SEGFILE	= 0x03,
	LOGFS_RESERVED_INOS	= 0x10,
};

/*
 * Inode flags.  High bits should never be written to the medium.  They are
 * reserved for in-memory usage.
 * Low bits should either remain in sync with the corresponding FS_*_FL or
 * reuse slots that obviously don't make sense for logfs.
 *
 * LOGFS_IF_COMPRESSED	Same bit as FS_COMPR_FL
 * LOGFS_IF_DIRTY	Inode must be written back
 * LOGFS_IF_ZOMBIE	Inode has been deleted
 * LOGFS_IF_STILLBORN	-ENOSPC happened when creating inode
 */
#define LOGFS_IF_COMPRESSED	0x00000004 /* == FS_COMPR_FL */
#define LOGFS_IF_DIRTY		0x20000000
#define LOGFS_IF_ZOMBIE		0x40000000
#define LOGFS_IF_STILLBORN	0x80000000

/* Flags available to chattr */
#define LOGFS_FL_USER_VISIBLE		(LOGFS_IF_COMPRESSED)
#define LOGFS_FL_USER_MODIFIABLE	(LOGFS_IF_COMPRESSED)
/* Flags inherited from parent directory on file/directory creation */
#define LOGFS_FL_INHERITED		(LOGFS_IF_COMPRESSED)

/**
 * struct logfs_disk_inode - on-medium inode
 *
 * @di_mode:			file mode
 * @di_height:			height
 *				(NOTE(review): presumably of the indirect block
 *				tree - confirm)
 * @di_pad:			reserved, must be 0
 * @di_flags:			inode flags, see above
 * @di_uid:			user id
 * @di_gid:			group id
 * @di_ctime:			change time
 * @di_mtime:			modify time
 * @di_atime:			access time
 * @di_refcount:		reference count (aka nlink or link count)
 * @di_generation:		inode generation, for nfs
 * @di_used_bytes:		number of bytes used
 * @di_size:			file size
 * @di_data:			data pointers
 */
struct logfs_disk_inode {
	__be16	di_mode;
	__u8	di_height;
	__u8	di_pad;
	__be32	di_flags;
	__be32	di_uid;
	__be32	di_gid;

	__be64	di_ctime;
	__be64	di_mtime;

	__be64	di_atime;
	__be32	di_refcount;
	__be32	di_generation;

	__be64	di_used_bytes;
	__be64	di_size;

	__be64	di_data[LOGFS_EMBEDDED_FIELDS];
};

SIZE_CHECK(logfs_disk_inode, 200);

/*
 * Positions of selected inode fields, counted in units of sizeof(__be64)
 * from the start of struct logfs_disk_inode.
 */
#define INODE_POINTER_OFS \
	(offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
#define INODE_USED_OFS \
	(offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
#define INODE_SIZE_OFS \
	(offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
#define INODE_HEIGHT_OFS	(0)

/**
 * struct logfs_disk_dentry - on-medium dentry structure
 *
 * @ino:			inode number
 * @namelen:			length of file name
 * @type:			file type, identical to bits 12..15 of mode
 * @name:			file name
 */
/* FIXME: add 6 bytes of padding to remove the __packed */
struct logfs_disk_dentry {
	__be64	ino;
	__be16	namelen;
	__u8	type;
	__u8	name[LOGFS_MAX_NAMELEN];
} __attribute__((packed));

SIZE_CHECK(logfs_disk_dentry, 266);

#define RESERVED		0xffffffff
#define BADSEG			0xffffffff
/**
 * struct logfs_segment_entry - segment file entry
 *
 * @ec_level:			erase count and level
 * @valid:			number of valid bytes
 *
 * Segment file contains one entry for every segment.  ec_level contains the
 * erasecount in the upper 28 bits and the level in the lower 4 bits.  An
 * ec_level of BADSEG (-1) identifies bad segments.  valid contains the number
 * of valid bytes or RESERVED (-1 again) if the segment is used for either the
 * superblock or the journal, or when the segment is bad.
 */
struct logfs_segment_entry {
	__be32	ec_level;
	__be32	valid;
};

SIZE_CHECK(logfs_segment_entry, 8);

/**
 * struct logfs_journal_header - header for journal entries (JEs)
 *
 * @h_crc:			crc32 of journal entry
 * @h_len:			length of compressed journal entry,
 *				not including header
 * @h_datalen:			length of uncompressed data
 * @h_type:			JE type
 * @h_compr:			compression type
 * @h_pad:			reserved
 */
struct logfs_journal_header {
	__be32	h_crc;
	__be16	h_len;
	__be16	h_datalen;
	__be16	h_type;
	__u8	h_compr;
	__u8	h_pad[5];
};

SIZE_CHECK(logfs_journal_header, 16);

/*
 * Life expectancy of data.
 * VIM_DEFAULT		- default vim
 * VIM_SEGFILE		- for segment file only - very short-living
 *
 * A further class, VIM_GC (GC'd data - likely long-living), has been
 * described here but has no enumerator so far.
 */
enum logfs_vim {
	VIM_DEFAULT	= 0,
	VIM_SEGFILE	= 1,
};

/**
 * struct logfs_je_area - wbuf header
 *
 * @segno:			segment number of area
 * @used_bytes:			number of bytes already used
 * @gc_level:			GC level
 * @vim:			life expectancy of data
 *
 * "Areas" are segments currently being used for writing.  There is at least
 * one area per GC level.  Several may be used to separate long-living from
 * short-living data.  If an area with unknown vim is encountered, it can
 * simply be closed.
 * The write buffer immediately follows this header.
 */
struct logfs_je_area {
	__be32	segno;
	__be32	used_bytes;
	__u8	gc_level;
	__u8	vim;
} __attribute__((packed));

SIZE_CHECK(logfs_je_area, 10);

#define MAX_JOURNAL_HEADER \
	(sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))

/**
 * struct logfs_je_dynsb - dynamic superblock
 *
 * @ds_gec:			global erase count
 * @ds_sweeper:			current position of GC "sweeper"
 * @ds_rename_dir:		source directory ino (see dir.c documentation)
 * @ds_rename_pos:		position of source dd (see dir.c documentation)
 * @ds_victim_ino:		victims of incomplete dir operation (see dir.c)
 * @ds_victim_parent:		parent inode of victim (see dir.c)
 * @ds_used_bytes:		number of used bytes
 * @ds_generation:		generation
 *				(NOTE(review): exact meaning not visible in
 *				this header - confirm against users)
 * @pad:			padding up to 64 bytes
 */
struct logfs_je_dynsb {
	__be64	ds_gec;
	__be64	ds_sweeper;

	__be64	ds_rename_dir;
	__be64	ds_rename_pos;

	__be64	ds_victim_ino;
	__be64	ds_victim_parent; /* XXX */

	__be64	ds_used_bytes;
	__be32	ds_generation;
	__be32	pad;
};

SIZE_CHECK(logfs_je_dynsb, 64);

/**
 * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
 *
 * @da_size:			size of inode file
 * @da_last_ino:		last created inode
 * @da_used_bytes:		number of bytes used
 * @da_height:			height
 *				(NOTE(review): presumably of the inode file's
 *				indirect block tree - confirm)
 * @pad:			padding, keeps da_data 8-byte aligned
 * @da_data:			data pointers
 */
struct logfs_je_anchor {
	__be64	da_size;
	__be64	da_last_ino;

	__be64	da_used_bytes;
	__u8	da_height;
	__u8	pad[7];

	__be64	da_data[LOGFS_EMBEDDED_FIELDS];
};

SIZE_CHECK(logfs_je_anchor, 168);

/**
 * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
 *
 * @so_segment:			segments used for 2nd journal
 *
 * Length of the array is given by h_len field in the header.
 */
struct logfs_je_spillout {
	__be64	so_segment[0];
};

SIZE_CHECK(logfs_je_spillout, 0);

/**
 * struct logfs_je_journal_ec - erase counts for all journal segments
 *
 * @ec:				erase count
 *
 * Length of the array is given by h_len field in the header.
 */
struct logfs_je_journal_ec {
	__be32	ec[0];
};

SIZE_CHECK(logfs_je_journal_ec, 0);

/**
 * struct logfs_je_free_segments - list of free segments with erase count
 *
 * @segno:			segment number
 * @ec:				erase count
 */
struct logfs_je_free_segments {
	__be32	segno;
	__be32	ec;
};

SIZE_CHECK(logfs_je_free_segments, 8);

/**
 * struct logfs_seg_alias - list of segment aliases
 *
 * @old_segno:			old segment number
 * @new_segno:			new segment number
 */
struct logfs_seg_alias {
	__be32	old_segno;
	__be32	new_segno;
};

SIZE_CHECK(logfs_seg_alias, 8);

/**
 * struct logfs_obj_alias - list of object aliases
 *
 * @ino:			inode number
 * @bix:			block index
 * @val:			NOTE(review): presumably the aliased value
 *				(block pointer) - confirm against users
 * @level:			level
 * @pad:			padding
 * @child_no:			NOTE(review): presumably the index of the
 *				affected child pointer - confirm against users
 */
struct logfs_obj_alias {
	__be64	ino;
	__be64	bix;
	__be64	val;
	__u8	level;
	__u8	pad[5];
	__be16	child_no;
};

SIZE_CHECK(logfs_obj_alias, 32);

/*
 * Compression types.
 *
 * COMPR_NONE	- uncompressed
 * COMPR_ZLIB	- compressed with zlib
 */
enum {
	COMPR_NONE	= 0,
	COMPR_ZLIB	= 1,
};

/*
 * Journal entries come in groups of 16.  First group contains unique
 * entries, next groups contain one entry per level
 *
 * JE_FIRST	- smallest possible journal entry number
 *
 * JEG_BASE	- base group, containing unique entries
 * JE_COMMIT	- commit entry, validates all previous entries
 * JE_DYNSB	- dynamic superblock, anything that ought to be in the
 *		  superblock but cannot because it is read-write data
 * JE_ANCHOR	- anchor aka master inode aka inode file's inode
 * JE_ERASECOUNT - erasecounts for all journal segments
 * JE_SPILLOUT	- unused
 * JE_OBJ_ALIAS	- object aliases
 * JE_AREA	- area description
 *
 * JE_LAST	- largest possible journal entry number
 */
enum {
	JE_FIRST	= 0x01,

	JEG_BASE	= 0x00,
	JE_COMMIT	= 0x02,
	JE_DYNSB	= 0x03,
	JE_ANCHOR	= 0x04,
	JE_ERASECOUNT	= 0x05,
	JE_SPILLOUT	= 0x06,
	JE_OBJ_ALIAS	= 0x0d,
	JE_AREA		= 0x0e,

	JE_LAST		= 0x0e,
};

#endif