forked from cloudius-systems/osv
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implementation of a read-only file system for OSv and modifications to …
…the build system and VFS to allow using the new file system as the root file system. There are a few reasons why this is an important addition to OSv: 1. A read-only root file system allows the user to force the hypervisor to not allow writing to disk. This means that you can boot multiple instances off of a single image file. A use case for this feature is something like Redis, allowing you to only have to maintain a single image file. 2. An immutable image will always have the same hash signature. This will allow the user of the image to verify that the image file has not been tampered with, even after the image has been run. 3. From Nadav Har'El: "it allows building an OSv image *without* running it, which solves all sorts of nasty corner cases in OSv's current scripts/build, when it is not desired to *run* the image during build. For example, cross-compilation, building on a VM without efficient nested virtualization, etc." The file system itself is titled MFS (originally standing for My File System, but the name is not very important). It has a limited set of features, but the reduced feature set allows the implementation and the layout of the blocks to be very simple. Currently MFS supports regular files/directories and symbolic links. File and directory names are limited to 63 characters and link paths are limited to 511 characters. The layout of the file system is as follows: Block 0: The superblock. This stores identifying information about the instance of MFS as well as the Block ID for Inode #1. Block 1 - N: The next set of blocks store files, directory entries, and links. *Everything* in MFS is stored sequentially, meaning that there will be no fragmentation. Also, files can be referenced by knowing their starting block id as well as their length. Block N - End: The last set of blocks stores the inodes, in sequential order. Due to being ordered, it is possible to know the location of any inode given only its number. 
Based on conversations with Nadav, I went back and forth about implementing a caching mechanism in the MFS driver. After some testing, I opted to not include a caching mechanism for the following reasons: 1. The block I/O system (fs/vfs/vfs_bio.h) already has a cache built in when using the bread function. 2. Through testing with the redis-memonly image, I found that over 96% (1132/1162) of file reads are first time reads of a block, meaning that it could not have been cached (without a much more complex pre-caching system). Of blocks that were read multiple times, the block i/o system had an 80% cache hit rate (31/39). This seems to be sufficient for the time being. Tests were run with MFS as the root file system and multiple runs of redis-benchmark against a single running image. The build system and OSv modifications were extremely minimal. An option was added to scripts/build titled rootfs that allows specifying mfs or zfs to be used (with zfs as the default). Once OSv is running, it will attempt to mount the root file system as MFS every time. Upon failure (in the case of ZFS being the root file system), it will fall back to trying to mount the file system as ZFS. Currently, I do not believe it is possible to have both ZFS and MFS running in the same image (zpool is not included in MFS images) but this should be implemented in the future. Signed-off-by: James Root <[email protected]>
- Loading branch information
Showing
11 changed files
with
1,119 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
/* | ||
* Copyright (c) 2015 Carnegie Mellon University. | ||
* All Rights Reserved. | ||
* | ||
* THIS SOFTWARE IS PROVIDED "AS IS," WITH NO WARRANTIES WHATSOEVER. CARNEGIE | ||
* MELLON UNIVERSITY EXPRESSLY DISCLAIMS TO THE FULLEST EXTENT PERMITTEDBY LAW | ||
* ALL EXPRESS, IMPLIED, AND STATUTORY WARRANTIES, INCLUDING, WITHOUT | ||
* LIMITATION, THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR | ||
* PURPOSE, AND NON-INFRINGEMENT OF PROPRIETARY RIGHTS. | ||
* | ||
* Released under a modified BSD license. For full terms, please see mfs.txt in | ||
* the licenses folder or contact [email protected]. | ||
* | ||
* DM-0002621 | ||
* | ||
* Based on https://github.com/jdroot/mfs | ||
*/ | ||
|
||
#ifndef __INCLUDE_MFS_H__ | ||
#define __INCLUDE_MFS_H__ | ||
|
||
#include <osv/vnode.h> | ||
#include <osv/mount.h> | ||
#include <osv/dentry.h> | ||
#include <osv/prex.h> | ||
#include <osv/buf.h> | ||
|
||
/* On-disk format version this driver accepts (compared against the superblock at mount). */
#define MFS_VERSION 1
/* Magic number the superblock must carry for the volume to be treated as MFS. */
#define MFS_MAGIC 0xDEADBEEF
/* Longest file name, not counting the terminating '\0'. */
#define MFS_FILENAME_MAXLEN 63
/* Inode numbers start at 1; inode 1 is the root directory. */
#define MFS_ROOT_INODE_NUMBER 1

/* Size in bytes of the superblock structure, and the block that holds it. */
#define MFS_SUPERBLOCK_SIZE (sizeof(struct mfs_super_block))
#define MFS_SUPERBLOCK_BLOCK 0

/*
 * Inode addressing helpers.  `bs` is the block size in bytes and `i` is a
 * zero-based inode index (inode number minus one).
 *
 * MFS_INODES_PER_BLOCK(bs) evaluates to 0 when `bs` is smaller than one
 * inode; callers must rule that out before using MFS_INODE_BLOCK or
 * MFS_INODE_OFFSET, which divide by it.
 */
#define MFS_INODE_SIZE ((uint64_t)sizeof(struct mfs_inode))
#define MFS_INODES_PER_BLOCK(bs) ((bs) / MFS_INODE_SIZE)
#define MFS_INODE_BLOCK(bs, i) ((i) / MFS_INODES_PER_BLOCK(bs))
#define MFS_INODE_OFFSET(bs, i) ((i) % MFS_INODES_PER_BLOCK(bs))

/*
 * Directory record addressing helpers, same conventions as the inode
 * helpers above: `bs` is the block size in bytes, `i` a zero-based record
 * index.  MFS_RECORD_SIZE is fully parenthesized so it expands safely in
 * any expression context, matching MFS_INODE_SIZE.
 */
#define MFS_RECORD_SIZE ((uint64_t)sizeof(struct mfs_dir_record))
#define MFS_RECORDS_PER_BLOCK(bs) ((bs) / MFS_RECORD_SIZE)
#define MFS_RECORD_BLOCK(bs, i) ((i) / MFS_RECORDS_PER_BLOCK(bs))
#define MFS_RECORD_OFFSET(bs, i) ((i) % MFS_RECORDS_PER_BLOCK(bs))

/* NOTE(review): not referenced by the MFS sources visible here - confirm whether it is still needed. */
#define MFS_CACHE_SIZE 1024

/*
 * Debug tracing.  Change the `#if 0` to `#if 1` to route print() to
 * kprintf(); otherwise print() expands to nothing and its arguments are
 * not evaluated.
 */
#if 0
#define print(...) kprintf(__VA_ARGS__)
#else
#define print(...)
#endif
|
||
/* Operation tables of the MFS driver; defined in the driver's source files. */
extern struct vfsops mfs_vfsops;   /* mount-level operations */
extern struct vnops mfs_vnops;     /* per-vnode operations */
|
||
/*
 * Superblock as read from block MFS_SUPERBLOCK_BLOCK of the device.
 */
struct mfs_super_block {
    uint64_t magic;         /* must equal MFS_MAGIC */
    uint64_t version;       /* must equal MFS_VERSION */
    uint64_t block_size;    /* block size used by the inode/record macros */
    uint64_t inodes_block;  /* block number where the inode table starts */
};
|
||
|
||
/*
 * One inode as stored in the inode table.  The size field is shared: it is
 * read as file_size for regular files and is also reachable as
 * dir_children_count (presumably the entry count of a directory - confirm
 * against the vnode operations).
 */
struct mfs_inode {
    mode_t   mode;               /* file type and permission bits (S_ISDIR etc.) */
    uint64_t inode_no;           /* this inode's own number */
    uint64_t data_block_number;  /* NOTE(review): presumably first block of the data - confirm */
    union {
        uint64_t file_size;
        uint64_t dir_children_count;
    };
};
|
||
struct mfs_dir_record { | ||
// Add one for \0 | ||
char filename[MFS_FILENAME_MAXLEN + 1]; | ||
uint64_t inode_no; | ||
}; | ||
|
||
// Per-mount driver state.
//
// FIXME: The code is set up so that a cache can be added fairly quickly if
// needed, but the underlying bread function is already cached. To add a
// cache, its structure would be added here, and mfs_cache_read and
// mfs_cache_write would then use it.
struct mfs {
    struct mfs_super_block *sb;  // copy of the on-disk superblock
};
|
||
/* Read inode `inode_no` from `dev`; returns a newly allocated copy, or a null pointer on read error. */
struct mfs_inode *mfs_get_inode(struct mfs *mfs,
                                struct device *dev,
                                uint64_t inode_no);

/* Attach `inode` to `vnode` and fill in the vnode's inode number, type, mode and size. */
void mfs_set_vnode(struct vnode* vnode, struct mfs_inode *inode);

/* Read block `blkid` of `device` into `*bh`; returns non-zero on error. */
int mfs_cache_read(struct mfs *mfs,
                   struct device *device,
                   uint64_t blkid,
                   struct buf **bh);

/* Release a buffer obtained from mfs_cache_read. */
void mfs_cache_release(struct mfs *mfs, struct buf *bh);
|
||
#endif | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
/* | ||
* Copyright (c) 2015 Carnegie Mellon University. | ||
* All Rights Reserved. | ||
* | ||
* THIS SOFTWARE IS PROVIDED "AS IS," WITH NO WARRANTIES WHATSOEVER. CARNEGIE | ||
* MELLON UNIVERSITY EXPRESSLY DISCLAIMS TO THE FULLEST EXTENT PERMITTEDBY LAW | ||
* ALL EXPRESS, IMPLIED, AND STATUTORY WARRANTIES, INCLUDING, WITHOUT | ||
* LIMITATION, THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR | ||
* PURPOSE, AND NON-INFRINGEMENT OF PROPRIETARY RIGHTS. | ||
* | ||
* Released under a modified BSD license. For full terms, please see mfs.txt in | ||
* the licenses folder or contact [email protected]. | ||
* | ||
* DM-0002621 | ||
* | ||
* Based on https://github.com/jdroot/mfs | ||
*/ | ||
|
||
#include "mfs.hh" | ||
|
||
int mfs_cache_read(struct mfs *mfs, struct device *device, uint64_t blkid, struct buf **bh) { | ||
return bread(device, blkid, bh); | ||
} | ||
|
||
/*
 * Give back a buffer previously obtained with mfs_cache_read.
 * Simply forwards to brelse(); `mfs` is accepted but not used.
 */
void mfs_cache_release(struct mfs *mfs, struct buf *bh) {
    brelse(bh);
    return;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
/* | ||
* Copyright (c) 2015 Carnegie Mellon University. | ||
* All Rights Reserved. | ||
* | ||
* THIS SOFTWARE IS PROVIDED "AS IS," WITH NO WARRANTIES WHATSOEVER. CARNEGIE | ||
* MELLON UNIVERSITY EXPRESSLY DISCLAIMS TO THE FULLEST EXTENT PERMITTEDBY LAW | ||
* ALL EXPRESS, IMPLIED, AND STATUTORY WARRANTIES, INCLUDING, WITHOUT | ||
* LIMITATION, THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR | ||
* PURPOSE, AND NON-INFRINGEMENT OF PROPRIETARY RIGHTS. | ||
* | ||
* Released under a modified BSD license. For full terms, please see mfs.txt in | ||
* the licenses folder or contact [email protected]. | ||
* | ||
* DM-0002621 | ||
* | ||
* Based on https://github.com/jdroot/mfs | ||
*/ | ||
|
||
#include "mfs.hh" | ||
|
||
#include <stdio.h> | ||
#include <sys/types.h> | ||
#include <osv/device.h> | ||
#include <osv/buf.h> | ||
#include <osv/debug.h> | ||
|
||
struct mfs_inode *mfs_get_inode(struct mfs *mfs, struct device *dev, uint64_t inode_no) { | ||
struct mfs_super_block *sb = mfs->sb; | ||
struct mfs_inode *inode = nullptr; | ||
struct mfs_inode *rv = nullptr; | ||
struct buf *bh = nullptr; | ||
|
||
uint64_t i = inode_no - 1; | ||
int error = -1; | ||
uint64_t inode_block = sb->inodes_block; | ||
uint64_t inode_offset = 0; | ||
|
||
inode_block += MFS_INODE_BLOCK(sb->block_size, i); | ||
inode_offset = MFS_INODE_OFFSET(sb->block_size, i); | ||
|
||
print("[mfs] looking for inode %llu in block %llu\n", inode_no, inode_block); | ||
|
||
error = mfs_cache_read(mfs, dev, inode_block, &bh); | ||
if (error) { | ||
kprintf("[mfs] Error reading block [%llu]\n", inode_block); | ||
return nullptr; | ||
} | ||
|
||
inode = (struct mfs_inode *)bh->b_data; | ||
inode += inode_offset; | ||
|
||
print("[mfs] got inode_no = %llu\n", inode->inode_no); | ||
|
||
// Assert is somewhat dangerous here, but if this assert fails the filesystem | ||
// has been corrupted somehow. | ||
assert(inode->inode_no == inode_no); | ||
|
||
rv = new mfs_inode; | ||
memcpy(rv, inode, sizeof(struct mfs_inode)); | ||
|
||
mfs_cache_release(mfs, bh); | ||
|
||
return rv; | ||
} | ||
|
||
void mfs_set_vnode(struct vnode* vnode, struct mfs_inode *inode) { | ||
off_t size = 0; | ||
if (vnode == nullptr || inode == nullptr) { | ||
return; | ||
} | ||
|
||
vnode->v_data = inode; | ||
vnode->v_ino = inode->inode_no; | ||
|
||
// Set type | ||
if (S_ISDIR(inode->mode)) { | ||
size = MFS_INODE_SIZE; | ||
vnode->v_type = VDIR; | ||
} else if (S_ISREG(inode->mode)) { | ||
size = inode->file_size; | ||
vnode->v_type = VREG; | ||
} else if (S_ISLNK(inode->mode)) { | ||
size = 512; // Max size | ||
vnode->v_type = VLNK; | ||
} | ||
|
||
vnode->v_mode = inode->mode; | ||
vnode->v_size = size; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
/* | ||
* Copyright (c) 2015 Carnegie Mellon University. | ||
* All Rights Reserved. | ||
* | ||
* THIS SOFTWARE IS PROVIDED "AS IS," WITH NO WARRANTIES WHATSOEVER. CARNEGIE | ||
* MELLON UNIVERSITY EXPRESSLY DISCLAIMS TO THE FULLEST EXTENT PERMITTEDBY LAW | ||
* ALL EXPRESS, IMPLIED, AND STATUTORY WARRANTIES, INCLUDING, WITHOUT | ||
* LIMITATION, THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR | ||
* PURPOSE, AND NON-INFRINGEMENT OF PROPRIETARY RIGHTS. | ||
* | ||
* Released under a modified BSD license. For full terms, please see mfs.txt in | ||
* the licenses folder or contact [email protected]. | ||
* | ||
* DM-0002621 | ||
* | ||
* Based on https://github.com/jdroot/mfs | ||
*/ | ||
|
||
#include "mfs.hh" | ||
#include <stdio.h> | ||
#include <sys/types.h> | ||
#include <osv/device.h> | ||
#include <osv/debug.h> | ||
|
||
/* Forward declarations of the mount-level operations placed in mfs_vfsops. */
static int
mfs_mount(struct mount *mp, const char *dev, int flags, const void *data);
static int
mfs_sync(struct mount *mp);
static int
mfs_statfs(struct mount *mp, struct statfs *statp);
static int
mfs_unmount(struct mount *mp, int flags);
|
||
/*
 * NOTE(review): neither macro is referenced in the part of this file visible
 * here (the mfs_vfsops table spells out the vget cast itself), and the
 * ramfs_ prefix suggests they were carried over from another driver -
 * confirm and remove.
 */
#define ramfs_vget   ((vfsop_vget_t)vfs_nullop)
#define ramfs_statfs ((vfsop_statfs_t)vfs_nullop)
|
||
/*
 * Mount-level operation table for MFS.  Entries are positional; vget is
 * filled with vfs_nullop because the driver does not provide its own.
 */
struct vfsops mfs_vfsops = {
    mfs_mount,                      /* mount:   open device, read superblock */
    mfs_unmount,                    /* unmount: close device, free state */
    mfs_sync,                       /* sync:    always returns 0 */
    ((vfsop_vget_t)vfs_nullop),     /* vget:    not implemented */
    mfs_statfs,                     /* statfs:  file system statistics */
    &mfs_vnops,                     /* vnops:   per-vnode operation table */
};
|
||
static int | ||
mfs_mount(struct mount *mp, const char *dev, int flags, const void *data) { | ||
struct device *device; | ||
struct buf *bh = nullptr; | ||
struct mfs *mfs = new struct mfs; | ||
struct mfs_super_block *sb = nullptr; | ||
struct mfs_inode *root_inode = nullptr; | ||
int error = -1; | ||
|
||
error = device_open(dev + 5, DO_RDWR, &device); | ||
if (error) { | ||
kprintf("[mfs] Error opening device!\n"); | ||
return error; | ||
} | ||
|
||
|
||
error = mfs_cache_read(mfs, device, MFS_SUPERBLOCK_BLOCK, &bh); | ||
if (error) { | ||
kprintf("[mfs] Error reading mfs superblock\n"); | ||
device_close(device); | ||
delete mfs; | ||
return error; | ||
} | ||
|
||
// We see if the file system is MFS, if not, return error and close everything | ||
sb = (struct mfs_super_block*)bh->b_data; | ||
if (sb->magic != MFS_MAGIC) { | ||
print("[mfs] Error magics do not match!\n"); | ||
print("[mfs] Expecting %016llX but got %016llX\n", MFS_MAGIC, sb->magic); | ||
mfs_cache_release(mfs, bh); | ||
device_close(device); | ||
delete mfs; | ||
return -1; // TODO: Proper error code | ||
} | ||
|
||
if (sb->version != MFS_VERSION) { | ||
kprintf("[mfs] Found mfs volume but incompatible version!\n"); | ||
kprintf("[mfs] Expecting %llu but found %llu\n", MFS_VERSION, sb->version); | ||
mfs_cache_release(mfs, bh); | ||
device_close(device); | ||
delete mfs; | ||
return -1; | ||
} | ||
|
||
print("[mfs] Got superblock version: 0x%016llX\n", sb->version); | ||
print("[mfs] Got magic: 0x%016llX\n", sb->magic); | ||
print("[mfs] Got block size: 0x%016llX\n", sb->block_size); | ||
print("[mfs] Got inode block: 0x%016llX\n", sb->inodes_block); | ||
|
||
// Since we have found MFS, we can copy the superblock now | ||
sb = new mfs_super_block; | ||
memcpy(sb, bh->b_data, MFS_SUPERBLOCK_SIZE); | ||
mfs_cache_release(mfs, bh); | ||
|
||
mfs->sb = sb; | ||
|
||
// Save a reference to our superblock | ||
mp->m_data = mfs; | ||
mp->m_dev = device; | ||
|
||
root_inode = mfs_get_inode(mfs, device, MFS_ROOT_INODE_NUMBER); | ||
|
||
mfs_set_vnode(mp->m_root->d_vnode, root_inode); | ||
|
||
return 0; | ||
} | ||
|
||
/* Sync operation for the vfsops table: does no work and always reports success. */
static int
mfs_sync(struct mount *mp) {
    (void)mp;
    return 0;
}
|
||
static int mfs_statfs(struct mount *mp, struct statfs *statp) { | ||
struct mfs *mfs = (struct mfs*)mp->m_data; | ||
struct mfs_super_block *sb = mfs->sb; | ||
|
||
statp->f_bsize = sb->block_size; | ||
|
||
// Total blocks, unknown... | ||
statp->f_blocks = sb->inodes_block; | ||
// Read only. 0 blocks free | ||
statp->f_bfree = 0; | ||
statp->f_bavail = 0; | ||
|
||
statp->f_ffree = 0; | ||
statp->f_files = sb->inodes_block; //Needs to be inode count | ||
|
||
statp->f_namelen = MFS_FILENAME_MAXLEN; | ||
|
||
return 0; | ||
} | ||
|
||
static int | ||
mfs_unmount(struct mount *mp, int flags) { | ||
struct mfs *mfs = (struct mfs*)mp->m_data; | ||
struct mfs_super_block *sb = mfs->sb; | ||
struct device *dev = mp->m_dev; | ||
|
||
device_close(dev); | ||
delete sb; | ||
delete mfs; | ||
|
||
return 0; | ||
} |
Oops, something went wrong.