Filesystem Management

Filesystem Abstractions

Filesystem Abstractions - in memory


Filesystem Abstractions - on storage


Simple filesystem example





Filesystem Operations

Mounting a filesystem

Opening a file

Querying file attributes

Reading data from a file

Writing data to a file

Closing a file


Directories are special files which contain one or more dentries.

Creating a file

Deleting a file

Virtual File System


Superblock Operations

  • fill_super
  • put_super
  • write_super
  • read_inode
  • write_inode
  • evict_inode
  • statfs
  • remount_fs

Inode Operations

  • create
  • lookup
  • link
  • unlink
  • symlink
  • mkdir
  • rmdir
  • rename
  • readlink
  • follow_link
  • put_link
  • ...

The Inode Cache

The Dentry Cache

The Page Cache

struct address_space

/**
 * struct address_space - Contents of a cacheable, mappable object.
 * @host: Owner, either the inode or the block_device.
 * @i_pages: Cached pages.
 * @gfp_mask: Memory allocation flags to use for allocating pages.
 * @i_mmap_writable: Number of VM_SHARED mappings.
 * @nr_thps: Number of THPs in the pagecache (non-shmem only).
 * @i_mmap: Tree of private and shared mappings.
 * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
 * @nrpages: Number of page entries, protected by the i_pages lock.
 * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
 * @writeback_index: Writeback starts here.
 * @a_ops: Methods.
 * @flags: Error bits and flags (AS_*).
 * @wb_err: The most recent error which has occurred.
 * @private_lock: For use by the owner of the address_space.
 * @private_list: For use by the owner of the address_space.
 * @private_data: For use by the owner of the address_space.
 */
/*
 * Contents of one cacheable, mappable object. The owner is either an
 * inode or a block_device and is reachable through the host pointer.
 */
struct address_space {
  /* Owner of this mapping and the pages cached for it. */
  struct inode            *host;
  struct xarray           i_pages;
  /* Memory allocation flags to use when allocating pages. */
  gfp_t                   gfp_mask;
  /* Number of VM_SHARED mappings. */
  atomic_t                i_mmap_writable;
  /* Number of THPs in the page cache; only counted for non-shmem files. */
  atomic_t                nr_thps;
  /*
   * Tree of private and shared mappings. i_mmap_rwsem protects both
   * i_mmap and i_mmap_writable.
   */
  struct rb_root_cached   i_mmap;
  struct rw_semaphore     i_mmap_rwsem;
  /*
   * nrpages counts page entries, nrexceptional counts shadow or DAX
   * entries. Both are protected by the i_pages lock.
   */
  unsigned long           nrpages;
  unsigned long           nrexceptional;
  /* Writeback starts at this page offset. */
  pgoff_t                 writeback_index;
  /* Method table for this mapping. */
  const struct address_space_operations *a_ops;
  /* Error bits and flags (AS_*). */
  unsigned long           flags;
  /* The most recent error which has occurred. */
  errseq_t                wb_err;
  /* The three private_* members are for use by the owner of the mapping. */
  spinlock_t              private_lock;
  struct list_head        private_list;
  void                    *private_data;
  /*
   * The attribute below requests an alignment of at least sizeof(long).
   * NOTE(review): __randomize_layout is defined outside this excerpt;
   * presumably it opts the struct into layout randomization -- confirm.
   */
} __attribute__((aligned(sizeof(long)))) __randomize_layout;

struct address_space_operations {
  int (*writepage)(struct page *page, struct writeback_control *wbc);
  int (*readpage)(struct file *, struct page *);

  /* Write back some dirty pages from this mapping. */
  int (*writepages)(struct address_space *, struct writeback_control *);

  /* Set a page dirty.  Return true if this dirtied it */
  int (*set_page_dirty)(struct page *page);

   * Reads in the requested pages. Unlike ->readpage(), this is
   * PURELY used for read-ahead!.
  int (*readpages)(struct file *filp, struct address_space *mapping,
                  struct list_head *pages, unsigned nr_pages);
  void (*readahead)(struct readahead_control *);

  int (*write_begin)(struct file *, struct address_space *mapping,
                          loff_t pos, unsigned len, unsigned flags,
                          struct page **pagep, void **fsdata);
  int (*write_end)(struct file *, struct address_space *mapping,
                          loff_t pos, unsigned len, unsigned copied,
                          struct page *page, void *fsdata);

  /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
  sector_t (*bmap)(struct address_space *, sector_t);
  void (*invalidatepage) (struct page *, unsigned int, unsigned int);
  int (*releasepage) (struct page *, gfp_t);
  void (*freepage)(struct page *);
  ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
   * migrate the contents of a page to the specified target. If
   * migrate_mode is MIGRATE_ASYNC, it must not block.
  int (*migratepage) (struct address_space *,
                  struct page *, struct page *, enum migrate_mode);
  bool (*isolate_page)(struct page *, isolate_mode_t);
  void (*putback_page)(struct page *);
  int (*launder_page) (struct page *);
  int (*is_partially_uptodate) (struct page *, unsigned long,
                                  unsigned long);
  void (*is_dirty_writeback) (struct page *, bool *, bool *);
  int (*error_remove_page)(struct address_space *, struct page *);

  /* swapfile support */
  int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
                          sector_t *span);
  void (*swap_deactivate)(struct file *file);

Reading data

/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
 * be returned when no data can be read without waiting for I/O requests
 * to complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
 * requests shall be made for the read or for readahead.  When no data
 * can be read, -EAGAIN shall be returned.  When readahead would be
 * triggered, a partial, possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)

/*
 * Generic "read page" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
 * Reads the page asynchronously --- the unlock_buffer() and
 * set/clear_buffer_uptodate() functions propagate buffer state into the
 * page struct once IO has completed.
 */
int block_read_full_page(struct page *page, get_block_t *get_block)