[Tux3] Two kinds of atomic commit

Previous thread: [Tux3] forward logging and NVRAM by timothy norman huber on Sunday, July 27, 2008 - 5:10 pm. (7 messages)

Next thread: Re: [Tux3] Comparison to Hammer fs design by Daniel Phillips on Monday, July 28, 2008 - 2:39 am. (3 messages)
To: <tux3@...>
Date: Tuesday, November 11, 2008 - 3:17 am

# HG changeset patch
# User OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
# Date 1226387681 -32400
# Node ID 6b615de4d62bea052bb57474e754233f911edf1e
# Parent dab895e2e896189f1764a7ec3330b6473757138e
Use tuxtime() to update timestamp

And this moves time functions to tux3.h to use those without including
inode.c

diff -r dab895e2e896 -r 6b615de4d62b user/dir.c
--- a/user/dir.c Mon Nov 10 20:15:37 2008 -0800
+++ b/user/dir.c Tue Nov 11 16:14:41 2008 +0900
@@ -194,7 +194,7 @@
memcpy(entry->name, name, len);
entry->inum = cpu_to_le32(inum);
entry->type = ext2_type_by_mode[(mode & S_IFMT) >> STAT_SHIFT];
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = tuxtime();
mark_inode_dirty(dir);
offset = (void *)entry - buffer->data;
brelse_dirty(buffer);
diff -r dab895e2e896 -r 6b615de4d62b user/inode.c
--- a/user/inode.c Mon Nov 10 20:15:37 2008 -0800
+++ b/user/inode.c Tue Nov 11 16:14:41 2008 +0900
@@ -16,26 +16,6 @@
#define filemap_included
#include "filemap.c"
#undef main
-
-#include <sys/time.h>
-#include <time.h>
-
-fixed32 tuxtime(void)
-{
- struct timeval now;
- gettimeofday(&now, NULL);
- return tuxtimeval(now.tv_sec, now.tv_usec);
-}
-
-unsigned millionths(fixed32 val)
-{
- return (((val & 0xffffffff) * 1000000) + 0x80000000) >> 32;
-}
-
-u32 high32(fixed32 val)
-{
- return val >> 32;
-}

struct inode *new_inode(SB, inum_t inum)
{
diff -r dab895e2e896 -r 6b615de4d62b user/tux3.h
--- a/user/tux3.h Mon Nov 10 20:15:37 2008 -0800
+++ b/user/tux3.h Tue Nov 11 16:14:41 2008 +0900
@@ -7,6 +7,8 @@
#include <string.h>
#include <inttypes.h>
#include <byteswap.h>
+#include <sys/time.h>
+#include <time.h>
#include "err.h"
#include "trace.h"
#include "buffer.h"
@@ -245,6 +247,23 @@
return ((u64)sec << 32) + ((u64)usec << 32) / 1000000;
}

+static inline fixed32 tuxtime(void)
+{
+ stru...

To: <tux3@...>
Cc: Matthew Dillon <dillon@...>, <kernel@...>
Date: Monday, July 28, 2008 - 1:46 am

Here I describe two slightly different methods that Tux3 will use to
implement atomic commit of data and metadata. Both methods combine
logical and physical forward logging to perform atomic commits
efficiently. The discussion below assumes we are updating a btree leaf,
for example to make room for more inode data or to add pointers to a
file index. The same techniques apply equally well to all structures
in the filesystem.

1) The Update a Clone Method

Instead of directly modifying a disk block corresponding to a btree
leaf, we allocate a new block for the leaf and copy the contents of
the original block to the new block, only in the buffer cache (no copy
is performed on disk). We can now alter the new block at will and
flush it out to disk without affecting the on-disk btree at all. But we
have not yet linked the new block into the btree. We could accomplish
that by performing a similar clone update recursively up to the root of
the tree, which creates a new tree root. The whole chain of new blocks
would then be flushed out to disk and a pointer to the new root stored
at some predictable location so it can be found later. This is the
"phase tree" method that I invented for Tux2, and is commonly called
"copy on write" these days. It could also be called the "sue me" method
because NetApp likes to sue those such as Sun who implement it.

Fortunately, there is a better way to do it that I invented recently and
which I have never heard of anyone using before. We modify only the
leaf node of the btree by cloning and record the pointer to the clone
only in the cached btree index block, not on disk. To be able to
reconstruct the cached version of the index node after a crash, we log a
logical change record to the disk that says "write this leaf into that
btree index node".

We make whatever changes we need to the clone of the leaf node, then
construct a two block transaction consisting of the clone and a commit
block. The commit block points at the new leaf node and also ca...

To: <tux3@...>
Date: Tuesday, November 11, 2008 - 3:19 am

# HG changeset patch
# User OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
# Date 1226387746 -32400
# Node ID d680e7bc61353f6c4de5ef24e30b4f5d8245150d
# Parent 6b615de4d62bea052bb57474e754233f911edf1e
Convert more timestamp to high resolution in tux3fuse

diff -r 6b615de4d62b -r d680e7bc6135 user/tux3fuse.c
--- a/user/tux3fuse.c Tue Nov 11 16:14:41 2008 +0900
+++ b/user/tux3fuse.c Tue Nov 11 16:15:46 2008 +0900
@@ -84,9 +84,24 @@
.attr = {
.st_ino = inode->inum,
.st_mode = inode->i_mode,
+#if 1
+ .st_atim = {
+ .tv_sec = high32(inode->i_atime),
+ .tv_nsec = millionths(inode->i_atime) * 1000,
+ },
+ .st_mtim = {
+ .tv_sec = high32(inode->i_mtime),
+ .tv_nsec = millionths(inode->i_mtime) * 1000,
+ },
+ .st_ctim = {
+ .tv_sec = high32(inode->i_ctime),
+ .tv_nsec = millionths(inode->i_ctime) * 1000,
+ },
+#else
.st_atime = high32(inode->i_atime),
.st_mtime = high32(inode->i_mtime),
.st_ctime = high32(inode->i_ctime),
+#endif
.st_size = inode->i_size,
.st_uid = inode->i_uid,
.st_gid = inode->i_gid,
@@ -169,9 +184,24 @@
.attr = {
.st_ino = inode->inum,
.st_mode = inode->i_mode,
+#if 1
+ .st_atim = {
+ .tv_sec = high32(inode->i_atime),
+ .tv_nsec = millionths(inode->i_atime) * 1000,
+ },
+ .st_mtim = {
+ .tv_sec = high32(inode->i_mtime),
+ .tv_nsec = millionths(inode->i_mtime) * 1000,
+ },
+ .st_ctim = {
+ .tv_sec = high32(inode->i_ctime),
+ .tv_nsec = millionths(inode->i_ctime) * 1000,
+ },
+#else
.st_atime = high32(inode->i_atime),
.st_mtime = high32(inode->i_mtime),
.st_ctime = high32(inode->i_ctime),
+#endif
.st_size = inode->i_size,
.st_uid = inode->i_uid,
.st_gid = inode->i_gid,

--
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

_______________________________...

To: <tux3@...>
Date: Tuesday, November 11, 2008 - 3:19 am

# HG changeset patch
# User OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
# Date 1226387771 -32400
# Node ID e50b06dfcea7f1d56578e12e91d50e5de47983cc
# Parent d680e7bc61353f6c4de5ef24e30b4f5d8245150d
Add billionths() and use it, instead of millionths()

diff -r d680e7bc6135 -r e50b06dfcea7 user/tux3.h
--- a/user/tux3.h Tue Nov 11 16:15:46 2008 +0900
+++ b/user/tux3.h Tue Nov 11 16:16:11 2008 +0900
@@ -254,9 +254,9 @@
return tuxtimeval(now.tv_sec, now.tv_usec);
}

-static inline unsigned millionths(fixed32 val)
+static inline unsigned billionths(fixed32 val)
{
- return (((val & 0xffffffff) * 1000000) + 0x80000000) >> 32;
+ return ((((val & 0xffffffff) * 1000000) + 0x80000000) >> 32) * 1000;
}

static inline u32 high32(fixed32 val)
diff -r d680e7bc6135 -r e50b06dfcea7 user/tux3fuse.c
--- a/user/tux3fuse.c Tue Nov 11 16:15:46 2008 +0900
+++ b/user/tux3fuse.c Tue Nov 11 16:16:11 2008 +0900
@@ -87,15 +87,15 @@
#if 1
.st_atim = {
.tv_sec = high32(inode->i_atime),
- .tv_nsec = millionths(inode->i_atime) * 1000,
+ .tv_nsec = billionths(inode->i_atime),
},
.st_mtim = {
.tv_sec = high32(inode->i_mtime),
- .tv_nsec = millionths(inode->i_mtime) * 1000,
+ .tv_nsec = billionths(inode->i_mtime),
},
.st_ctim = {
.tv_sec = high32(inode->i_ctime),
- .tv_nsec = millionths(inode->i_ctime) * 1000,
+ .tv_nsec = billionths(inode->i_ctime),
},
#else
.st_atime = high32(inode->i_atime),
@@ -187,15 +187,15 @@
#if 1
.st_atim = {
.tv_sec = high32(inode->i_atime),
- .tv_nsec = millionths(inode->i_atime) * 1000,
+ .tv_nsec = billionths(inode->i_atime),
},
.st_mtim = {
.tv_sec = high32(inode->i_mtime),
- .tv_nsec = millionths(inode->i_mtime) * 1000,
+ .tv_nsec = billionths(inode->i_mtime),
},
.st_ctim = {
.tv_sec = high32(inode->i_ctime),
- .tv_nsec ...

Previous thread: [Tux3] forward logging and NVRAM by timothy norman huber on Sunday, July 27, 2008 - 5:10 pm. (7 messages)

Next thread: Re: [Tux3] Comparison to Hammer fs design by Daniel Phillips on Monday, July 28, 2008 - 2:39 am. (3 messages)