Index: linux-2.6.22-rc4-kamikaze1/fs/ext4/super.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/ext4/super.c
+++ linux-2.6.22-rc4-kamikaze1/fs/ext4/super.c
@@ -439,6 +439,8 @@ static void ext4_put_super (struct super
 	struct ext4_super_block *es = sbi->s_es;
 	int i;
 
+	ext4_wb_release(sb);
+	ext4_reserve_release(sb);
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
 	jbd2_journal_destroy(sbi->s_journal);
@@ -505,6 +507,13 @@ static struct inode *ext4_alloc_inode(st
 	ei->i_block_alloc_info = NULL;
 	ei->vfs_inode.i_version = 1;
 	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
+
+	/* FIXME: these wb-related fields could be initialized once */
+	ei->i_blocks_reserved = 0;
+	ei->i_md_reserved = 0;
+	atomic_set(&ei->i_wb_writers, 0);
+	spin_lock_init(&ei->i_wb_reserved_lock);
+
 	return &ei->vfs_inode;
 }
 
@@ -725,7 +734,8 @@ enum {
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-	Opt_grpquota, Opt_extents,
+	Opt_grpquota, Opt_extents, Opt_noextents, Opt_delayed_alloc,
+	Opt_nodelayed_alloc,
 };
 
 static match_table_t tokens = {
@@ -776,6 +786,9 @@ static match_table_t tokens = {
 	{Opt_usrquota, "usrquota"},
 	{Opt_barrier, "barrier=%u"},
 	{Opt_extents, "extents"},
+	{Opt_noextents, "noextents"},
+	{Opt_delayed_alloc, "delalloc"},
+	{Opt_nodelayed_alloc, "nodelalloc"},
 	{Opt_err, NULL},
 	{Opt_resize, "resize"},
 };
@@ -1090,6 +1103,12 @@ clear_qf_name:
 			else
 				clear_opt(sbi->s_mount_opt, BARRIER);
 			break;
+		case Opt_delayed_alloc:
+			set_opt(sbi->s_mount_opt, DELAYED_ALLOC);
+			break;
+		case Opt_nodelayed_alloc:
+			clear_opt(sbi->s_mount_opt, DELAYED_ALLOC);
+			break;
 		case Opt_ignore:
 			break;
 		case Opt_resize:
@@ -1111,6 +1130,9 @@ clear_qf_name:
 		case Opt_extents:
 			set_opt (sbi->s_mount_opt, EXTENTS);
 			break;
+		case Opt_noextents:
+			clear_opt (sbi->s_mount_opt, EXTENTS);
+			break;
 		default:
 			printk (KERN_ERR
 				"EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1542,6 +1564,12 @@ static int ext4_fill_super (struct super
 
 	set_opt(sbi->s_mount_opt, RESERVATION);
 
+	/*
+	 * turn on extents feature by default in ext4 filesystem
+	 * Use -o noextents to turn it off
+	 */
+	set_opt (sbi->s_mount_opt, EXTENTS);
+
 	if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
 			    NULL, 0))
 		goto failed_mount;
@@ -1632,6 +1660,8 @@ static int ext4_fill_super (struct super
 				sbi->s_inode_size);
 			goto failed_mount;
 		}
+		if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
+			sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
 	}
 	sbi->s_frag_size = EXT4_MIN_FRAG_SIZE <<
 				   le32_to_cpu(es->s_log_frag_size);
@@ -1848,6 +1878,32 @@ static int ext4_fill_super (struct super
 	}
 
 	ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+
+	/* determine the minimum size of new large inodes, if present */
+	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
+		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
+						     EXT4_GOOD_OLD_INODE_SIZE;
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				       EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) {
+			if (sbi->s_want_extra_isize <
+			    le16_to_cpu(es->s_want_extra_isize))
+				sbi->s_want_extra_isize =
+					le16_to_cpu(es->s_want_extra_isize);
+			if (sbi->s_want_extra_isize <
+			    le16_to_cpu(es->s_min_extra_isize))
+				sbi->s_want_extra_isize =
+					le16_to_cpu(es->s_min_extra_isize);
+		}
+	}
+	/* If the wanted extra space cannot fit, fall back to the struct size */
+	if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
+							sbi->s_inode_size) {
+		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
+						       EXT4_GOOD_OLD_INODE_SIZE;
+		printk(KERN_INFO "EXT4-fs: required extra inode space not "
+			"available.\n");
+	}
+
 	/*
 	 * akpm: core read_super() calls in here with the superblock locked.
 	 * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -1868,6 +1924,8 @@ static int ext4_fill_super (struct super
 		"writeback");
 
 	ext4_ext_init(sb);
+	ext4_reserve_init(sb);
+	ext4_wb_init(sb);
 
 	lock_kernel();
 	return 0;
Index: linux-2.6.22-rc4-kamikaze1/fs/ext4/inode.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/ext4/inode.c
+++ linux-2.6.22-rc4-kamikaze1/fs/ext4/inode.c
@@ -726,7 +726,7 @@ static int ext4_splice_branch(handle_t *
 
 	/* We are done with atomic stuff, now do the rest of housekeeping */
 
-	inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 
 	/* had we spliced it onto indirect block? */
@@ -942,7 +942,7 @@ out:
 
 #define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
 
-static int ext4_get_block(struct inode *inode, sector_t iblock,
+int ext4_get_block(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create)
 {
 	handle_t *handle = ext4_journal_current_handle();
@@ -1741,9 +1741,34 @@ static const struct address_space_operat
 	.releasepage	= ext4_releasepage,
 };
 
+static int ext4_wb_set_page_dirty(struct page *page)
+{
+	return __set_page_dirty_nobuffers(page);	/* pure delegation, no ext4-specific work */
+}
+
+static const struct address_space_operations ext4_writeback_da_aops = { /* delalloc aops, chosen in ext4_set_aops() */
+	.readpage	= ext4_readpage,
+	.readpages	= ext4_readpages,
+	.writepage	= ext4_wb_writepage,
+	.writepages	= ext4_wb_writepages,
+	.sync_page	= block_sync_page,
+	.prepare_write	= ext4_wb_prepare_write,
+	.commit_write	= ext4_wb_commit_write,
+	.bmap		= ext4_bmap,
+	.invalidatepage	= ext4_wb_invalidatepage,
+	.releasepage	= ext4_wb_releasepage,
+	.set_page_dirty	= ext4_wb_set_page_dirty,
+	.direct_IO	= ext4_direct_IO,
+};
+
 void ext4_set_aops(struct inode *inode)
 {
-	if (ext4_should_order_data(inode))
+	if (S_ISREG(inode->i_mode) &&
+			(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+			test_opt(inode->i_sb, EXTENTS) &&
+			test_opt(inode->i_sb, DELAYED_ALLOC))
+		inode->i_mapping->a_ops = &ext4_writeback_da_aops;
+	else if (ext4_should_order_data(inode))
 		inode->i_mapping->a_ops = &ext4_ordered_aops;
 	else if (ext4_should_writeback_data(inode))
 		inode->i_mapping->a_ops = &ext4_writeback_aops;
@@ -1768,6 +1793,11 @@ int ext4_block_truncate_page(handle_t *h
 	int err = 0;
 	void *kaddr;
 
+	if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+			test_opt(inode->i_sb, EXTENTS) &&
+			test_opt(inode->i_sb, DELAYED_ALLOC))
+		return ext4_wb_block_truncate_page(handle, page, mapping, from);
+
 	blocksize = inode->i_sb->s_blocksize;
 	length = blocksize - (offset & (blocksize - 1));
 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -2375,7 +2405,7 @@ do_indirects:
 	ext4_discard_reservation(inode);
 
 	mutex_unlock(&ei->truncate_mutex);
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 
 	/*
@@ -2583,6 +2613,25 @@ void ext4_set_inode_flags(struct inode *
 		inode->i_flags |= S_DIRSYNC;
 }
 
+/* Mirror the VFS i_flags bits (S_*) back into EXT4_I(inode)->i_flags (EXT4_*_FL) */
+void ext4_get_inode_flags(struct ext4_inode_info *ei)
+{
+	unsigned int flags = ei->vfs_inode.i_flags;
+
+	ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
+			EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); /* clear all five, re-derive below */
+	if (flags & S_SYNC)
+		ei->i_flags |= EXT4_SYNC_FL;
+	if (flags & S_APPEND)
+		ei->i_flags |= EXT4_APPEND_FL;
+	if (flags & S_IMMUTABLE)
+		ei->i_flags |= EXT4_IMMUTABLE_FL;
+	if (flags & S_NOATIME)
+		ei->i_flags |= EXT4_NOATIME_FL;
+	if (flags & S_DIRSYNC)
+		ei->i_flags |= EXT4_DIRSYNC_FL;
+}
+
 void ext4_read_inode(struct inode * inode)
 {
 	struct ext4_iloc iloc;
@@ -2610,10 +2659,6 @@ void ext4_read_inode(struct inode * inod
 	}
 	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
 	inode->i_size = le32_to_cpu(raw_inode->i_size);
-	inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
-	inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
-	inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
-	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
 
 	ei->i_state = 0;
 	ei->i_dir_start_lookup = 0;
@@ -2689,6 +2734,18 @@ void ext4_read_inode(struct inode * inod
 	} else
 		ei->i_extra_isize = 0;
 
+	EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
+	EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
+	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
+	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
+
+	ei->i_fs_version = le32_to_cpu(raw_inode->i_disk_version);
+	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+			ei->i_fs_version |=
+			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+	}
+
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &ext4_file_inode_operations;
 		inode->i_fop = &ext4_file_operations;
@@ -2742,6 +2799,7 @@ static int ext4_do_update_inode(handle_t
 	if (ei->i_state & EXT4_STATE_NEW)
 		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
 
+	ext4_get_inode_flags(ei);
 	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
 	if(!(test_opt(inode->i_sb, NO_UID32))) {
 		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
@@ -2769,9 +2827,12 @@ static int ext4_do_update_inode(handle_t
 	}
 	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
 	raw_inode->i_size = cpu_to_le32(ei->i_disksize);
-	raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
-	raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
-	raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+
+	EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+	EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
+	EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
+
 	raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
 	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
@@ -2828,8 +2889,14 @@ static int ext4_do_update_inode(handle_t
 	} else for (block = 0; block < EXT4_N_BLOCKS; block++)
 		raw_inode->i_block[block] = ei->i_data[block];
 
-	if (ei->i_extra_isize)
+	raw_inode->i_disk_version = cpu_to_le32(ei->i_fs_version);
+	if (ei->i_extra_isize) {
+		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) {
+			raw_inode->i_version_hi =
+				cpu_to_le32(ei->i_fs_version >> 32);
+		}
 		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+	}
 
 	BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
 	rc = ext4_journal_dirty_metadata(handle, bh);
@@ -3080,6 +3147,40 @@ ext4_reserve_inode_write(handle_t *handl
 }
 
 /*
+ * Expand an inode by new_extra_isize bytes.
+ * Returns 0 on success or negative error number on failure.
+ */
+int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize,
+			    struct ext4_iloc iloc, handle_t *handle)
+{
+	struct ext4_inode *raw_inode;
+	struct ext4_xattr_ibody_header *header;
+	/* iloc is taken by value; only its raw inode buffer is used below */
+
+	if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) {
+		return 0;	/* already at least this large: nothing to do */
+	}
+
+	raw_inode = ext4_raw_inode(&iloc);
+
+	header = IHDR(inode, raw_inode);
+	/* header is only inspected for the xattr magic in the test below */
+
+	/* No extended attributes present */
+	if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
+		header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+		memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
+			new_extra_isize);
+		EXT4_I(inode)->i_extra_isize = new_extra_isize;
+		return 0;
+	}
+
+	/* try to expand with EA present */
+	return ext4_expand_extra_isize_ea(inode, new_extra_isize,
+						raw_inode, handle);
+}
+
+/*
  * What we do here is to mark the in-core inode as clean with respect to inode
  * dirtiness (it may still be data-dirty).
  * This means that the in-core inode may be reaped by prune_icache
@@ -3103,10 +3204,32 @@ ext4_reserve_inode_write(handle_t *handl
 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 {
 	struct ext4_iloc iloc;
-	int err;
+	int err, ret;
+	static int expand_message;
 
 	might_sleep();
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (!err && EXT4_I(inode)->i_extra_isize <
+	    EXT4_SB(inode->i_sb)->s_want_extra_isize &&
+	    !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
+		/* iloc is valid here (reserve succeeded). We need extra buffer
+		 * credits since we may write into EA block with this same handle */
+		if ((jbd2_journal_extend(handle,
+			     EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
+			ret = ext4_expand_extra_isize(inode,
+					EXT4_SB(inode->i_sb)->s_want_extra_isize,
+					iloc, handle);
+			if (ret) {
+				EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+				if (!expand_message) {
+					ext4_warning(inode->i_sb, __FUNCTION__,
+					"Unable to expand inode %lu. Delete some"
+					" EAs or run e2fsck.", inode->i_ino);
+					expand_message = 1;
+				}
+			}
+		}
+	}
 	if (!err)
 		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
 	return err;
Index: linux-2.6.22-rc4-kamikaze1/fs/ext4/ioctl.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/ext4/ioctl.c
+++ linux-2.6.22-rc4-kamikaze1/fs/ext4/ioctl.c
@@ -28,6 +28,7 @@ int ext4_ioctl (struct inode * inode, st
 
 	switch (cmd) {
 	case EXT4_IOC_GETFLAGS:
+ 		ext4_get_inode_flags(ei);
 		flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
 		return put_user(flags, (int __user *) arg);
 	case EXT4_IOC_SETFLAGS: {
@@ -96,7 +97,7 @@ int ext4_ioctl (struct inode * inode, st
 		ei->i_flags = flags;
 
 		ext4_set_inode_flags(inode);
-		inode->i_ctime = CURRENT_TIME_SEC;
+		inode->i_ctime = ext4_current_time(inode);
 
 		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
 flags_err:
@@ -133,7 +134,7 @@ flags_err:
 			return PTR_ERR(handle);
 		err = ext4_reserve_inode_write(handle, inode, &iloc);
 		if (err == 0) {
-			inode->i_ctime = CURRENT_TIME_SEC;
+			inode->i_ctime = ext4_current_time(inode);
 			inode->i_generation = generation;
 			err = ext4_mark_iloc_dirty(handle, inode, &iloc);
 		}
Index: linux-2.6.22-rc4-kamikaze1/include/linux/ext4_fs.h
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/include/linux/ext4_fs.h
+++ linux-2.6.22-rc4-kamikaze1/include/linux/ext4_fs.h
@@ -71,7 +71,7 @@
 /*
  * Maximal count of links to a file
  */
-#define EXT4_LINK_MAX		32000
+#define EXT4_LINK_MAX		65000
 
 /*
  * Macro-instructions used to manage several block sizes
@@ -102,6 +102,7 @@
 				 EXT4_GOOD_OLD_FIRST_INO : \
 				 (s)->s_first_ino)
 #endif
+#define EXT4_BLOCK_ALIGN(size, blkbits)		ALIGN((size),(1 << (blkbits)))
 
 /*
  * Macro-instructions used to manage fragments
@@ -201,6 +202,8 @@ struct ext4_group_desc
 #define EXT4_STATE_JDATA		0x00000001 /* journaled data exists */
 #define EXT4_STATE_NEW			0x00000002 /* inode is newly created */
 #define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
+#define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
+#define EXT4_STATE_BLOCKS_RESERVED	0x00000010 /* blocks reserved */
 
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
@@ -225,6 +228,11 @@ struct ext4_new_group_data {
 	__u32 free_blocks_count;
 };
 
+/*
+ * Following is used by preallocation code to tell get_blocks() that we
+ * want uninitialized extents.
+ */
+#define EXT4_CREATE_UNINITIALIZED_EXT		2
 
 /*
  * ioctl commands
@@ -282,7 +290,7 @@ struct ext4_inode {
 	__le16	i_uid;		/* Low 16 bits of Owner Uid */
 	__le32	i_size;		/* Size in bytes */
 	__le32	i_atime;	/* Access time */
-	__le32	i_ctime;	/* Creation time */
+	__le32	i_ctime;	/* Inode Change time */
 	__le32	i_mtime;	/* Modification time */
 	__le32	i_dtime;	/* Deletion Time */
 	__le16	i_gid;		/* Low 16 bits of Group Id */
@@ -291,7 +299,7 @@ struct ext4_inode {
 	__le32	i_flags;	/* File flags */
 	union {
 		struct {
-			__u32  l_i_reserved1;
+			__u32  l_i_version;
 		} linux1;
 		struct {
 			__u32  h_i_translator;
@@ -331,10 +339,76 @@ struct ext4_inode {
 	} osd2;				/* OS dependent 2 */
 	__le16	i_extra_isize;
 	__le16	i_pad1;
+	__le32  i_ctime_extra;  /* extra Change time      (nsec << 2 | epoch) */
+	__le32  i_mtime_extra;  /* extra Modification time(nsec << 2 | epoch) */
+	__le32  i_atime_extra;  /* extra Access time      (nsec << 2 | epoch) */
+	__le32  i_crtime;       /* File Creation time */
+	__le32  i_crtime_extra; /* extra File Creation time (nsec << 2 | epoch) */
+	__le32   i_version_hi;   /* high 32 bits for 64-bit version */
 };
 
 #define i_size_high	i_dir_acl
 
+#define EXT4_EPOCH_BITS 2
+#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
+#define EXT4_NSEC_MASK  (~0UL << EXT4_EPOCH_BITS)
+
+#define EXT4_FITS_IN_INODE(ext4_inode, einode, field)	\
+	((offsetof(typeof(*(ext4_inode)), field) +	\
+	  sizeof((ext4_inode)->field))			\
+	<= (EXT4_GOOD_OLD_INODE_SIZE +			\
+	    (einode)->i_extra_isize))	/* true iff field ends within 128 + i_extra_isize bytes */
+
+static inline __le32 ext4_encode_extra_time(struct timespec *time)
+{	/* layout: bits 0-1 = tv_sec bits 32-33 (epoch), bits 2-31 = tv_nsec */
+	return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
+			   (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
+			   ((time->tv_nsec << 2) & EXT4_NSEC_MASK));
+}
+
+static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) {
+       if (sizeof(time->tv_sec) > 4)	/* epoch bits are ORed in, so tv_sec must already hold the low 32 bits */
+	       time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
+			       << 32;
+       time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2;	/* nsec sits above the 2 epoch bits */
+}
+
+#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode)			       \
+do {	/* seconds are always stored; the _extra word only if it fits */      \
+	(raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec);	       \
+	if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra))     \
+		(raw_inode)->xtime ## _extra =				       \
+				ext4_encode_extra_time(&(inode)->xtime);       \
+} while (0)
+
+#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode)			       \
+do {	/* both the base field and its _extra word are stored only if they fit */ \
+	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))		       \
+		(raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec);      \
+	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))	       \
+		(raw_inode)->xtime ## _extra =				       \
+				ext4_encode_extra_time(&(einode)->xtime);      \
+} while (0)
+
+#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode)			       \
+do {	/* (signed): sign-extend so pre-1970 stamps stay negative */	       \
+	(inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime);       \
+	if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra))     \
+		ext4_decode_extra_time(&(inode)->xtime,			       \
+				       (raw_inode)->xtime ## _extra);	       \
+} while (0)
+
+#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)			       \
+do {	/* each part is read only if it fits; (signed) sign-extends seconds */ \
+	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))		       \
+		(einode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \
+	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))	       \
+		ext4_decode_extra_time(&(einode)->xtime,		       \
+				       (raw_inode)->xtime ## _extra);	       \
+} while (0)
+
+#define i_disk_version osd1.linux1.l_i_version
+
 #if defined(__KERNEL__) || defined(__linux__)
 #define i_reserved1	osd1.linux1.l_i_reserved1
 #define i_frag		osd2.linux2.l_i_frag
@@ -400,6 +474,7 @@ struct ext4_inode {
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
 #define EXT4_MOUNT_EXTENTS		0x400000 /* Extents support */
+#define EXT4_MOUNT_DELAYED_ALLOC	0x1000000/* Delayed allocation support */
 
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -533,6 +608,13 @@ static inline struct ext4_inode_info *EX
 	return container_of(inode, struct ext4_inode_info, vfs_inode);
 }
 
+static inline struct timespec ext4_current_time(struct inode *inode)
+{	/* sub-second s_time_gran: use current_fs_time(); otherwise whole seconds */
+	return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
+		current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
+}
+
+
 static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 {
 	return ino == EXT4_ROOT_INO ||
@@ -603,6 +685,8 @@ static inline int ext4_valid_inum(struct
 #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001
 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE	0x0002
 #define EXT4_FEATURE_RO_COMPAT_BTREE_DIR	0x0004
+#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
+#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -620,6 +704,8 @@ static inline int ext4_valid_inum(struct
 					 EXT4_FEATURE_INCOMPAT_64BIT)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_DIR_NLINK| \
+					 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE| \
 					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
 
 /*
@@ -815,6 +901,10 @@ extern struct ext4_group_desc * ext4_get
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
 extern void ext4_init_block_alloc_info(struct inode *);
 extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv);
+int ext4_reserve_init(struct super_block *sb);
+void ext4_reserve_release(struct super_block *sb);
+void ext4_release_blocks(struct super_block *sb, int blocks);
+int ext4_reserve_blocks(struct super_block *sb, int blocks);
 
 /* dir.c */
 extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -862,6 +952,7 @@ extern int ext4_change_inode_journal_fla
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern void ext4_truncate (struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
+extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
@@ -983,6 +1074,8 @@ extern int ext4_ext_get_blocks(handle_t 
 extern void ext4_ext_truncate(struct inode *, struct page *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
+extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
+			  loff_t len);
 static inline int
 ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 			unsigned long max_blocks, struct buffer_head *bh,
@@ -996,6 +1089,18 @@ ext4_get_blocks_wrap(handle_t *handle, s
 }
 
 
+/* writeback.c */
+extern int ext4_wb_writepages(struct address_space *, struct writeback_control *);
+extern int ext4_wb_prepare_write(struct file *file, struct page *page,
+			      unsigned from, unsigned to);
+extern int ext4_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
+extern int ext4_wb_writepage(struct page *, struct writeback_control *);
+extern void ext4_wb_invalidatepage(struct page *, unsigned long);
+extern int ext4_wb_releasepage(struct page *, gfp_t);
+extern int ext4_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
+extern void ext4_wb_init(struct super_block *);
+extern void ext4_wb_release(struct super_block *);
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _LINUX_EXT4_FS_H */
Index: linux-2.6.22-rc4-kamikaze1/fs/ext4/extents.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/ext4/extents.c
+++ linux-2.6.22-rc4-kamikaze1/fs/ext4/extents.c
@@ -91,36 +91,6 @@ static void ext4_idx_store_pblock(struct
 	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
 }
 
-static int ext4_ext_check_header(const char *function, struct inode *inode,
-				struct ext4_extent_header *eh)
-{
-	const char *error_msg = NULL;
-
-	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
-		error_msg = "invalid magic";
-		goto corrupted;
-	}
-	if (unlikely(eh->eh_max == 0)) {
-		error_msg = "invalid eh_max";
-		goto corrupted;
-	}
-	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
-		error_msg = "invalid eh_entries";
-		goto corrupted;
-	}
-	return 0;
-
-corrupted:
-	ext4_error(inode->i_sb, function,
-			"bad header in inode #%lu: %s - magic %x, "
-			"entries %u, max %u, depth %u",
-			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
-			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
-			le16_to_cpu(eh->eh_depth));
-
-	return -EIO;
-}
-
 static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
 {
 	int err;
@@ -269,6 +239,70 @@ static int ext4_ext_space_root_idx(struc
 	return size;
 }
 
+static inline int
+ext4_ext_max_entries(struct inode *inode, int depth)
+{
+	/* depth counts upward from the leaves, so level 0 holds extents */
+	const int at_leaf = (depth == 0);
+	/* the node at ext_depth() is the root header stored in the inode */
+	const int at_root = (depth == ext_depth(inode));
+
+	if (at_root && at_leaf)
+		return ext4_ext_space_root(inode);
+	if (at_root)
+		return ext4_ext_space_root_idx(inode);
+	if (at_leaf)
+		return ext4_ext_space_block(inode);
+
+	/* neither root nor leaf: an index node outside the inode */
+	/* (the result is an entry count, compared against eh_max by the caller) */
+	return ext4_ext_space_block_idx(inode);
+}
+
+static int __ext4_ext_check_header(const char *function, struct inode *inode,
+					struct ext4_extent_header *eh,
+					int depth)
+{	/* validates eh against the expected depth; 0 if sane, -EIO after ext4_error() */
+	const char *error_msg = NULL;
+	int max = 0;	/* stays 0 in the report if we bail out before computing it */
+
+	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
+		error_msg = "invalid magic";
+		goto corrupted;
+	}
+	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
+		error_msg = "unexpected eh_depth";
+		goto corrupted;
+	}
+	if (unlikely(eh->eh_max == 0)) {
+		error_msg = "invalid eh_max";
+		goto corrupted;
+	}
+	max = ext4_ext_max_entries(inode, depth);	/* upper bound for a node at this depth */
+	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
+		error_msg = "too large eh_max";
+		goto corrupted;
+	}
+	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
+		error_msg = "invalid eh_entries";
+		goto corrupted;
+	}
+	return 0;
+
+corrupted:
+	ext4_error(inode->i_sb, function,
+			"bad header in inode #%lu: %s - magic %x, "
+			"entries %u, max %u(%u), depth %u(%u)",
+			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
+			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
+			max, le16_to_cpu(eh->eh_depth), depth);	/* parenthesised values are the expected ones */
+
+	return -EIO;
+}
+
+#define ext4_ext_check_header(inode,eh,depth)	\
+	__ext4_ext_check_header(__FUNCTION__,inode,eh,depth)
+
 #ifdef EXT_DEBUG
 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 {
@@ -282,7 +316,7 @@ static void ext4_ext_show_path(struct in
 		} else if (path->p_ext) {
 			ext_debug("  %d:%d:%llu ",
 				  le32_to_cpu(path->p_ext->ee_block),
-				  le16_to_cpu(path->p_ext->ee_len),
+				  ext4_ext_get_actual_len(path->p_ext),
 				  ext_pblock(path->p_ext));
 		} else
 			ext_debug("  []");
@@ -305,7 +339,7 @@ static void ext4_ext_show_leaf(struct in
 
 	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
 		ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
-			  le16_to_cpu(ex->ee_len), ext_pblock(ex));
+			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
 	}
 	ext_debug("\n");
 }
@@ -329,6 +363,7 @@ static void ext4_ext_drop_refs(struct ex
 /*
  * ext4_ext_binsearch_idx:
  * binary search for the closest index of the given block
+ * the header must be checked before calling this
  */
 static void
 ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block)
@@ -336,9 +371,6 @@ ext4_ext_binsearch_idx(struct inode *ino
 	struct ext4_extent_header *eh = path->p_hdr;
 	struct ext4_extent_idx *r, *l, *m;
 
-	BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
-	BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
-	BUG_ON(le16_to_cpu(eh->eh_entries) <= 0);
 
 	ext_debug("binsearch for %d(idx):  ", block);
 
@@ -388,6 +420,7 @@ ext4_ext_binsearch_idx(struct inode *ino
 /*
  * ext4_ext_binsearch:
  * binary search for closest extent of the given block
+ * the header must be checked before calling this
  */
 static void
 ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
@@ -395,9 +428,6 @@ ext4_ext_binsearch(struct inode *inode, 
 	struct ext4_extent_header *eh = path->p_hdr;
 	struct ext4_extent *r, *l, *m;
 
-	BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
-	BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
-
 	if (eh->eh_entries == 0) {
 		/*
 		 * this leaf is empty:
@@ -425,7 +455,7 @@ ext4_ext_binsearch(struct inode *inode, 
 	ext_debug("  -> %d:%llu:%d ",
 			le32_to_cpu(path->p_ext->ee_block),
 			ext_pblock(path->p_ext),
-			le16_to_cpu(path->p_ext->ee_len));
+			ext4_ext_get_actual_len(path->p_ext));
 
 #ifdef CHECK_BINSEARCH
 	{
@@ -468,11 +498,10 @@ ext4_ext_find_extent(struct inode *inode
 	short int depth, i, ppos = 0, alloc = 0;
 
 	eh = ext_inode_hdr(inode);
-	BUG_ON(eh == NULL);
-	if (ext4_ext_check_header(__FUNCTION__, inode, eh))
+	i = depth = ext_depth(inode);
+	if (ext4_ext_check_header(inode, eh, depth))
 		return ERR_PTR(-EIO);
 
-	i = depth = ext_depth(inode);
 
 	/* account possible depth increase */
 	if (!path) {
@@ -488,6 +517,7 @@ ext4_ext_find_extent(struct inode *inode
 	while (i) {
 		ext_debug("depth %d: num %d, max %d\n",
 			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
+
 		ext4_ext_binsearch_idx(inode, path + ppos, block);
 		path[ppos].p_block = idx_pblock(path[ppos].p_idx);
 		path[ppos].p_depth = i;
@@ -504,7 +534,7 @@ ext4_ext_find_extent(struct inode *inode
 		path[ppos].p_hdr = eh;
 		i--;
 
-		if (ext4_ext_check_header(__FUNCTION__, inode, eh))
+		if (ext4_ext_check_header(inode, eh, i))
 			goto err;
 	}
 
@@ -513,9 +543,6 @@ ext4_ext_find_extent(struct inode *inode
 	path[ppos].p_ext = NULL;
 	path[ppos].p_idx = NULL;
 
-	if (ext4_ext_check_header(__FUNCTION__, inode, eh))
-		goto err;
-
 	/* find extent */
 	ext4_ext_binsearch(inode, path + ppos, block);
 
@@ -686,7 +713,7 @@ static int ext4_ext_split(handle_t *hand
 		ext_debug("move %d:%llu:%d in new leaf %llu\n",
 				le32_to_cpu(path[depth].p_ext->ee_block),
 				ext_pblock(path[depth].p_ext),
-				le16_to_cpu(path[depth].p_ext->ee_len),
+				ext4_ext_get_actual_len(path[depth].p_ext),
 				newblock);
 		/*memmove(ex++, path[depth].p_ext++,
 				sizeof(struct ext4_extent));
@@ -1106,7 +1133,19 @@ static int
 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 				struct ext4_extent *ex2)
 {
-	if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) !=
+	unsigned short ext1_ee_len, ext2_ee_len;
+
+	/*
+	 * Make sure that either both extents are uninitialized, or
+	 * both are _not_.
+	 */
+	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
+		return 0;
+
+	ext1_ee_len = ext4_ext_get_actual_len(ex1);
+	ext2_ee_len = ext4_ext_get_actual_len(ex2);
+
+	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
 			le32_to_cpu(ex2->ee_block))
 		return 0;
 
@@ -1115,19 +1154,67 @@ ext4_can_extents_be_merged(struct inode 
 	 * as an RO_COMPAT feature, refuse to merge to extents if
 	 * this can result in the top bit of ee_len being set.
 	 */
-	if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN)
+	if (ext1_ee_len + ext2_ee_len > EXT_MAX_LEN)
 		return 0;
 #ifdef AGGRESSIVE_TEST
 	if (le16_to_cpu(ex1->ee_len) >= 4)
 		return 0;
 #endif
 
-	if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2))
+	if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
 		return 1;
 	return 0;
 }
 
 /*
+ * This function tries to merge the "ex" extent to the next extent in the tree.
+ * It always tries to merge towards right. If you want to merge towards
+ * left, pass "ex - 1" as argument instead of "ex".
+ * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
+ * 1 if they got merged.
+ */
+int ext4_ext_try_to_merge(struct inode *inode,
+			  struct ext4_ext_path *path,
+			  struct ext4_extent *ex)
+{
+	struct ext4_extent_header *eh;
+	unsigned int depth, len;
+	int merge_done = 0;
+	int uninitialized = 0;	/* set once, never reset inside the loop */
+
+	depth = ext_depth(inode);
+	BUG_ON(path[depth].p_hdr == NULL);
+	eh = path[depth].p_hdr;	/* header found at path[ext_depth(inode)] */
+
+	while (ex < EXT_LAST_EXTENT(eh))
+	{
+		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
+			break;
+		/* merge with next extent! */
+		if (ext4_ext_is_uninitialized(ex))
+			uninitialized = 1;
+		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+				+ ext4_ext_get_actual_len(ex + 1));	/* plain sum; marker re-applied below */
+		if (uninitialized)
+			ext4_ext_mark_uninitialized(ex);
+
+		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
+			len = (EXT_LAST_EXTENT(eh) - ex - 1)
+				* sizeof(struct ext4_extent);
+			memmove(ex + 1, ex + 2, len);	/* slide the tail down over the absorbed ex + 1 */
+		}
+		eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries) - 1);
+		merge_done = 1;
+		WARN_ON(eh->eh_entries == 0);
+		if (!eh->eh_entries)
+			ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
+			   "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
+	}	/* loop again: ex may also merge with its new right neighbour */
+
+	return merge_done;
+}
+
+/*
  * check if a portion of the "newext" extent overlaps with an
  * existing extent.
  *
@@ -1144,7 +1231,7 @@ unsigned int ext4_ext_check_overlap(stru
 	unsigned int ret = 0;
 
 	b1 = le32_to_cpu(newext->ee_block);
-	len1 = le16_to_cpu(newext->ee_len);
+	len1 = ext4_ext_get_actual_len(newext);
 	depth = ext_depth(inode);
 	if (!path[depth].p_ext)
 		goto out;
@@ -1191,8 +1278,9 @@ int ext4_ext_insert_extent(handle_t *han
 	struct ext4_extent *nearex; /* nearest extent */
 	struct ext4_ext_path *npath = NULL;
 	int depth, len, err, next;
+	unsigned uninitialized = 0;
 
-	BUG_ON(newext->ee_len == 0);
+	BUG_ON(ext4_ext_get_actual_len(newext) == 0);
 	depth = ext_depth(inode);
 	ex = path[depth].p_ext;
 	BUG_ON(path[depth].p_hdr == NULL);
@@ -1200,14 +1288,24 @@ int ext4_ext_insert_extent(handle_t *han
 	/* try to insert block into found extent and return */
 	if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
 		ext_debug("append %d block to %d:%d (from %llu)\n",
-				le16_to_cpu(newext->ee_len),
+				ext4_ext_get_actual_len(newext),
 				le32_to_cpu(ex->ee_block),
-				le16_to_cpu(ex->ee_len), ext_pblock(ex));
+				ext4_ext_get_actual_len(ex), ext_pblock(ex));
 		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
 			return err;
-		ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
-					 + le16_to_cpu(newext->ee_len));
+
+		/*
+		 * ext4_can_extents_be_merged should have checked that either
+		 * both extents are uninitialized, or both aren't. Thus we
+		 * need to check only one of them here.
+		 */
+		if (ext4_ext_is_uninitialized(ex))
+			uninitialized = 1;
+		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+					+ ext4_ext_get_actual_len(newext));
+		if (uninitialized)
+			ext4_ext_mark_uninitialized(ex);
 		eh = path[depth].p_hdr;
 		nearex = ex;
 		goto merge;
@@ -1263,7 +1361,7 @@ has_space:
 		ext_debug("first extent in the leaf: %d:%llu:%d\n",
 				le32_to_cpu(newext->ee_block),
 				ext_pblock(newext),
-				le16_to_cpu(newext->ee_len));
+				ext4_ext_get_actual_len(newext));
 		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
 	} else if (le32_to_cpu(newext->ee_block)
 			   > le32_to_cpu(nearex->ee_block)) {
@@ -1276,7 +1374,7 @@ has_space:
 					"move %d from 0x%p to 0x%p\n",
 					le32_to_cpu(newext->ee_block),
 					ext_pblock(newext),
-					le16_to_cpu(newext->ee_len),
+					ext4_ext_get_actual_len(newext),
 					nearex, len, nearex + 1, nearex + 2);
 			memmove(nearex + 2, nearex + 1, len);
 		}
@@ -1289,7 +1387,7 @@ has_space:
 				"move %d from 0x%p to 0x%p\n",
 				le32_to_cpu(newext->ee_block),
 				ext_pblock(newext),
-				le16_to_cpu(newext->ee_len),
+				ext4_ext_get_actual_len(newext),
 				nearex, len, nearex + 1, nearex + 2);
 		memmove(nearex + 1, nearex, len);
 		path[depth].p_ext = nearex;
@@ -1304,20 +1402,7 @@ has_space:
 
 merge:
 	/* try to merge extents to the right */
-	while (nearex < EXT_LAST_EXTENT(eh)) {
-		if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
-			break;
-		/* merge with next extent! */
-		nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len)
-					     + le16_to_cpu(nearex[1].ee_len));
-		if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
-			len = (EXT_LAST_EXTENT(eh) - nearex - 1)
-					* sizeof(struct ext4_extent);
-			memmove(nearex + 1, nearex + 2, len);
-		}
-		eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
-		BUG_ON(eh->eh_entries == 0);
-	}
+	ext4_ext_try_to_merge(inode, path, nearex);
 
 	/* try to merge extents to the left */
 
@@ -1379,8 +1464,8 @@ int ext4_ext_walk_space(struct inode *in
 			end = le32_to_cpu(ex->ee_block);
 			if (block + num < end)
 				end = block + num;
-		} else if (block >=
-			     le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) {
+		} else if (block >= le32_to_cpu(ex->ee_block)
+					+ ext4_ext_get_actual_len(ex)) {
 			/* need to allocate space after found extent */
 			start = block;
 			end = block + num;
@@ -1392,7 +1477,8 @@ int ext4_ext_walk_space(struct inode *in
 			 * by found extent
 			 */
 			start = block;
-			end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len);
+			end = le32_to_cpu(ex->ee_block)
+				+ ext4_ext_get_actual_len(ex);
 			if (block + num < end)
 				end = block + num;
 			exists = 1;
@@ -1408,7 +1494,7 @@ int ext4_ext_walk_space(struct inode *in
 			cbex.ec_type = EXT4_EXT_CACHE_GAP;
 		} else {
 			cbex.ec_block = le32_to_cpu(ex->ee_block);
-			cbex.ec_len = le16_to_cpu(ex->ee_len);
+			cbex.ec_len = ext4_ext_get_actual_len(ex);
 			cbex.ec_start = ext_pblock(ex);
 			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
 		}
@@ -1481,15 +1567,15 @@ ext4_ext_put_gap_in_cache(struct inode *
 		ext_debug("cache gap(before): %lu [%lu:%lu]",
 				(unsigned long) block,
 				(unsigned long) le32_to_cpu(ex->ee_block),
-				(unsigned long) le16_to_cpu(ex->ee_len));
+			        (unsigned long) ext4_ext_get_actual_len(ex));
 	} else if (block >= le32_to_cpu(ex->ee_block)
-			    + le16_to_cpu(ex->ee_len)) {
+		            + ext4_ext_get_actual_len(ex)) {
 		lblock = le32_to_cpu(ex->ee_block)
-			 + le16_to_cpu(ex->ee_len);
+		         + ext4_ext_get_actual_len(ex);
 		len = ext4_ext_next_allocated_block(path);
 		ext_debug("cache gap(after): [%lu:%lu] %lu",
 				(unsigned long) le32_to_cpu(ex->ee_block),
-				(unsigned long) le16_to_cpu(ex->ee_len),
+			        (unsigned long) ext4_ext_get_actual_len(ex),
 				(unsigned long) block);
 		BUG_ON(len == lblock);
 		len = len - lblock;
@@ -1619,12 +1705,12 @@ static int ext4_remove_blocks(handle_t *
 				unsigned long from, unsigned long to)
 {
 	struct buffer_head *bh;
+	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
 	int i;
 
 #ifdef EXTENTS_STATS
 	{
 		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-		unsigned short ee_len =  le16_to_cpu(ex->ee_len);
 		spin_lock(&sbi->s_ext_stats_lock);
 		sbi->s_ext_blocks += ee_len;
 		sbi->s_ext_extents++;
@@ -1638,12 +1724,12 @@ static int ext4_remove_blocks(handle_t *
 	}
 #endif
 	if (from >= le32_to_cpu(ex->ee_block)
-	    && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		/* tail removal */
 		unsigned long num;
 		ext4_fsblk_t start;
-		num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from;
-		start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num;
+		num = le32_to_cpu(ex->ee_block) + ee_len - from;
+		start = ext_pblock(ex) + ee_len - num;
 		ext_debug("free last %lu blocks starting %llu\n", num, start);
 		for (i = 0; i < num; i++) {
 			bh = sb_find_get_block(inode->i_sb, start + i);
@@ -1651,12 +1737,12 @@ static int ext4_remove_blocks(handle_t *
 		}
 		ext4_free_blocks(handle, inode, start, num);
 	} else if (from == le32_to_cpu(ex->ee_block)
-		   && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		printk("strange request: removal %lu-%lu from %u:%u\n",
-		       from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+			from, to, le32_to_cpu(ex->ee_block), ee_len);
 	} else {
 		printk("strange request: removal(2) %lu-%lu from %u:%u\n",
-		       from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+			from, to, le32_to_cpu(ex->ee_block), ee_len);
 	}
 	return 0;
 }
@@ -1671,21 +1757,23 @@ ext4_ext_rm_leaf(handle_t *handle, struc
 	unsigned a, b, block, num;
 	unsigned long ex_ee_block;
 	unsigned short ex_ee_len;
+	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
 
+	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %lu in leaf\n", start);
 	if (!path[depth].p_hdr)
 		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
 	eh = path[depth].p_hdr;
 	BUG_ON(eh == NULL);
-	BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
-	BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
 
 	/* find where to start removing */
 	ex = EXT_LAST_EXTENT(eh);
 
 	ex_ee_block = le32_to_cpu(ex->ee_block);
-	ex_ee_len = le16_to_cpu(ex->ee_len);
+	if (ext4_ext_is_uninitialized(ex))
+		uninitialized = 1;
+	ex_ee_len = ext4_ext_get_actual_len(ex);
 
 	while (ex >= EXT_FIRST_EXTENT(eh) &&
 			ex_ee_block + ex_ee_len > start) {
@@ -1753,6 +1841,8 @@ ext4_ext_rm_leaf(handle_t *handle, struc
 
 		ex->ee_block = cpu_to_le32(block);
 		ex->ee_len = cpu_to_le16(num);
+		if (uninitialized)
+			ext4_ext_mark_uninitialized(ex);
 
 		err = ext4_ext_dirty(handle, inode, path + depth);
 		if (err)
@@ -1762,7 +1852,7 @@ ext4_ext_rm_leaf(handle_t *handle, struc
 				ext_pblock(ex));
 		ex--;
 		ex_ee_block = le32_to_cpu(ex->ee_block);
-		ex_ee_len = le16_to_cpu(ex->ee_len);
+		ex_ee_len = ext4_ext_get_actual_len(ex);
 	}
 
 	if (correct_index && eh->eh_entries)
@@ -1825,7 +1915,7 @@ int ext4_ext_remove_space(struct inode *
 		return -ENOMEM;
 	}
 	path[0].p_hdr = ext_inode_hdr(inode);
-	if (ext4_ext_check_header(__FUNCTION__, inode, path[0].p_hdr)) {
+	if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) {
 		err = -EIO;
 		goto out;
 	}
@@ -1846,17 +1936,8 @@ int ext4_ext_remove_space(struct inode *
 		if (!path[i].p_hdr) {
 			ext_debug("initialize header\n");
 			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
-			if (ext4_ext_check_header(__FUNCTION__, inode,
-							path[i].p_hdr)) {
-				err = -EIO;
-				goto out;
-			}
 		}
 
-		BUG_ON(le16_to_cpu(path[i].p_hdr->eh_entries)
-			   > le16_to_cpu(path[i].p_hdr->eh_max));
-		BUG_ON(path[i].p_hdr->eh_magic != EXT4_EXT_MAGIC);
-
 		if (!path[i].p_idx) {
 			/* this level hasn't been touched yet */
 			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
@@ -1873,17 +1954,26 @@ int ext4_ext_remove_space(struct inode *
 				i, EXT_FIRST_INDEX(path[i].p_hdr),
 				path[i].p_idx);
 		if (ext4_ext_more_to_rm(path + i)) {
+			struct buffer_head *bh;
 			/* go to the next level */
 			ext_debug("move to level %d (block %llu)\n",
 				  i + 1, idx_pblock(path[i].p_idx));
 			memset(path + i + 1, 0, sizeof(*path));
-			path[i+1].p_bh =
-				sb_bread(sb, idx_pblock(path[i].p_idx));
-			if (!path[i+1].p_bh) {
+			bh = sb_bread(sb, idx_pblock(path[i].p_idx));
+			if (!bh) {
 				/* should we reset i_size? */
 				err = -EIO;
 				break;
 			}
+			BUG_ON(i + 1 > depth);
+			if (ext4_ext_check_header(inode, ext_block_hdr(bh),
+							depth - i - 1)) {
+				/* bh is not in path[] yet: release it here */
+				brelse(bh);
+				err = -EIO;
+				break;
+			}
+			path[i+1].p_bh = bh;
 
 			/* save actual number of indexes since this
 			 * number is changed at the next iteration */
@@ -1977,15 +2065,152 @@ void ext4_ext_release(struct super_block
 #endif
 }
 
+/*
+ * This function is called by ext4_ext_get_blocks() if someone tries to write
+ * to an uninitialized extent. It may result in splitting the uninitialized
+ * extent into multiple extents (up to three - one initialized and two
+ * uninitialized).
+ * There are three possibilities:
+ *   a> There is no split required: Entire extent should be initialized
+ *   b> Splits in two extents: Write is happening at either end of the extent
+ *   c> Splits in three extents: Someone is writing in middle of the extent
+ * Returns the number of blocks, starting at iblock and capped at max_blocks,
+ * that the now initialized extent covers, or the error of the failing step.
+ */
+int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
+					struct ext4_ext_path *path,
+					ext4_fsblk_t iblock,
+					unsigned long max_blocks)
+{
+	struct ext4_extent *ex, newex;
+	struct ext4_extent *ex1 = NULL;
+	struct ext4_extent *ex2 = NULL;
+	struct ext4_extent *ex3 = NULL;
+	struct ext4_extent_header *eh;
+	unsigned int allocated, ee_block, ee_len, depth;
+	ext4_fsblk_t newblock;
+	int err = 0, ret = 0;
+
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+	allocated = ee_len - (iblock - ee_block);
+	newblock = iblock - ee_block + ext_pblock(ex);
+	ex2 = ex;
+
+	/* ex1: ee_block to iblock - 1 : uninitialized */
+	if (iblock > ee_block) {
+		ex1 = ex;
+		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ext4_ext_mark_uninitialized(ex1);
+		ex2 = &newex;
+	}
+	/* for sanity, update the length of the ex2 extent before we insert
+	 * ex3, if ex1 is NULL. This is to avoid temporary overlap of blocks.
+	 */
+	if (!ex1 && allocated > max_blocks)
+		ex2->ee_len = cpu_to_le16(max_blocks);
+	/* ex3: to ee_block + ee_len : uninitialised */
+	if (allocated > max_blocks) {
+		unsigned int newdepth;
+		ex3 = &newex;
+		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
+		ext4_ext_store_pblock(ex3, newblock + max_blocks);
+		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+		ext4_ext_mark_uninitialized(ex3);
+		err = ext4_ext_insert_extent(handle, inode, path, ex3);
+		if (err)
+			goto out;
+		/* The depth, and hence eh & ex might change as part of the
+		 * insert above. Look iblock up again, reusing the caller's
+		 * path array so that no second array is allocated and lost.
+		 */
+		newdepth = ext_depth(inode);
+		if (newdepth != depth) {
+			depth = newdepth;
+			ext4_ext_drop_refs(path);
+			path = ext4_ext_find_extent(inode, iblock, path);
+			if (IS_ERR(path)) {
+				err = PTR_ERR(path);
+				goto out;
+			}
+			eh = path[depth].p_hdr;
+			ex = path[depth].p_ext;
+			if (ex2 != &newex)
+				ex2 = ex;
+		}
+		allocated = max_blocks;
+	}
+	/* If there was a change of depth as part of the
+	 * insertion of ex3 above, we need to update the length
+	 * of the ex1 extent again here
+	 */
+	if (ex1 && ex1 != ex) {
+		ex1 = ex;
+		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ext4_ext_mark_uninitialized(ex1);
+		ex2 = &newex;
+	}
+	/* ex2: iblock to iblock + maxblocks-1 : initialised */
+	ex2->ee_block = cpu_to_le32(iblock);
+	ext4_ext_store_pblock(ex2, newblock);
+	ex2->ee_len = cpu_to_le16(allocated);
+	if (ex2 != ex)
+		goto insert;
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		goto out;
+	/* New (initialized) extent starts from the first block
+	 * in the current extent. i.e., ex2 == ex
+	 * We have to see if it can be merged with the extent
+	 * on the left.
+	 */
+	if (ex2 > EXT_FIRST_EXTENT(eh)) {
+		/* To merge left, pass "ex2 - 1" to try_to_merge(),
+		 * since it merges towards right _only_.
+		 */
+		ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
+		if (ret) {
+			err = ext4_ext_correct_indexes(handle, inode, path);
+			if (err)
+				goto out;
+			depth = ext_depth(inode);
+			ex2--;
+		}
+	}
+	/* Try to Merge towards right. This might be required
+	 * only when the whole extent is being written to.
+	 * i.e. ex2 == ex and ex3 == NULL.
+	 */
+	if (!ex3) {
+		ret = ext4_ext_try_to_merge(inode, path, ex2);
+		if (ret) {
+			err = ext4_ext_correct_indexes(handle, inode, path);
+			if (err)
+				goto out;
+		}
+	}
+	/* Mark modified extent as dirty */
+	err = ext4_ext_dirty(handle, inode, path + depth);
+	goto out;
+insert:
+	err = ext4_ext_insert_extent(handle, inode, path, &newex);
+out:
+	return err ? err : allocated;
+}
+
 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t iblock,
 			unsigned long max_blocks, struct buffer_head *bh_result,
 			int create, int extend_disksize)
 {
 	struct ext4_ext_path *path = NULL;
+	struct ext4_extent_header *eh;
 	struct ext4_extent newex, *ex;
 	ext4_fsblk_t goal, newblock;
-	int err = 0, depth;
+	int err = 0, depth, ret;
 	unsigned long allocated = 0;
 
 	__clear_bit(BH_New, &bh_result->b_state);
@@ -2033,21 +2258,19 @@ int ext4_ext_get_blocks(handle_t *handle
 	 * this is why assert can't be put in ext4_ext_find_extent()
 	 */
 	BUG_ON(path[depth].p_ext == NULL && depth != 0);
+	eh = path[depth].p_hdr;
 
 	ex = path[depth].p_ext;
 	if (ex) {
 		unsigned long ee_block = le32_to_cpu(ex->ee_block);
 		ext4_fsblk_t ee_start = ext_pblock(ex);
-		unsigned short ee_len  = le16_to_cpu(ex->ee_len);
+		unsigned short ee_len;
 
 		/*
-		 * Allow future support for preallocated extents to be added
-		 * as an RO_COMPAT feature:
 		 * Uninitialized extents are treated as holes, except that
-		 * we avoid (fail) allocating new blocks during a write.
+		 * we split out initialized portions during a write.
 		 */
-		if (ee_len > EXT_MAX_LEN)
-			goto out2;
+		ee_len = ext4_ext_get_actual_len(ex);
 		/* if found extent covers block, simply return it */
 		if (iblock >= ee_block && iblock < ee_block + ee_len) {
 			newblock = iblock - ee_block + ee_start;
@@ -2055,9 +2278,29 @@ int ext4_ext_get_blocks(handle_t *handle
 			allocated = ee_len - (iblock - ee_block);
 			ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
 					ee_block, ee_len, newblock);
-			ext4_ext_put_in_cache(inode, ee_block, ee_len,
-						ee_start, EXT4_EXT_CACHE_EXTENT);
-			goto out;
+
+			/* Do not put uninitialized extent in the cache */
+			if (!ext4_ext_is_uninitialized(ex)) {
+				ext4_ext_put_in_cache(inode, ee_block,
+							ee_len, ee_start,
+							EXT4_EXT_CACHE_EXTENT);
+				goto out;
+			}
+			if (create == EXT4_CREATE_UNINITIALIZED_EXT)
+				goto out;
+			if (!create)
+				goto out2;
+
+			ret = ext4_ext_convert_to_initialized(handle, inode,
+								path, iblock,
+								max_blocks);
+			if (ret <= 0) {
+				/* report the failure through err */
+				err = ret;
+				goto out2;
+			}
+			allocated = ret;
+			goto outnew;
 		}
 	}
 
@@ -2098,6 +2339,8 @@ int ext4_ext_get_blocks(handle_t *handle
 	/* try to insert new extent into found leaf and return */
 	ext4_ext_store_pblock(&newex, newblock);
 	newex.ee_len = cpu_to_le16(allocated);
+	if (create == EXT4_CREATE_UNINITIALIZED_EXT)  /* Mark uninitialized */
+		ext4_ext_mark_uninitialized(&newex);
 	err = ext4_ext_insert_extent(handle, inode, path, &newex);
 	if (err) {
 		/* free data blocks we just allocated */
@@ -2111,10 +2354,13 @@ int ext4_ext_get_blocks(handle_t *handle
 
 	/* previous routine could use block we allocated */
 	newblock = ext_pblock(&newex);
+outnew:
 	__set_bit(BH_New, &bh_result->b_state);
 
-	ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
-				EXT4_EXT_CACHE_EXTENT);
+	/* Cache only when it is _not_ an uninitialized extent */
+	if (create!=EXT4_CREATE_UNINITIALIZED_EXT)
+		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+						EXT4_EXT_CACHE_EXTENT);
 out:
 	if (allocated > max_blocks)
 		allocated = max_blocks;
@@ -2217,3 +2463,158 @@ int ext4_ext_writepage_trans_blocks(stru
 
 	return needed;
 }
+
+/*
+ * preallocate space for a file. This implements ext4's fallocate inode
+ * operation, which gets called from sys_fallocate system call.
+ * Currently only FA_ALLOCATE mode is supported on extent based files.
+ * We may have more modes supported in future - like FA_DEALLOCATE, which
+ * tells fallocate to unallocate previously (pre)allocated blocks.
+ * For block-mapped files, posix_fallocate should fall back to the method
+ * of writing zeroes to the required new blocks (the same behavior which is
+ * expected for file systems which do not support fallocate() system call).
+ * Returns 0 (or the error from stopping the last transaction) when the whole
+ * range could be preallocated, and a negative error code otherwise.
+ */
+long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
+{
+	handle_t *handle;
+	ext4_fsblk_t block, max_blocks;
+	ext4_fsblk_t nblocks = 0;
+	int ret = 0, ret2 = 0;
+	int retries = 0;
+	struct buffer_head map_bh;
+	unsigned int credits, blkbits = inode->i_blkbits;
+
+	/* (pre)allocate mode is supported for extent-based files _only_ */
+	if (mode != FA_ALLOCATE || !(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+		return -EOPNOTSUPP;
+
+	/* preallocation to directories is currently not supported */
+	if (S_ISDIR(inode->i_mode))
+		return -ENODEV;
+
+	block = offset >> blkbits;
+	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
+			- block;
+
+	/*
+	 * credits to insert 1 extent into extent tree + buffers to be able to
+	 * modify 1 super block, 1 block bitmap and 1 group descriptor.
+	 */
+	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
+retry:
+	/* start over cleanly: a stale -ENOSPC would skip the loop entirely */
+	ret = 0;
+	while (ret >= 0 && ret < max_blocks) {
+		block = block + ret;
+		max_blocks = max_blocks - ret;
+		handle = ext4_journal_start(inode, credits);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			break;
+		}
+
+		ret = ext4_ext_get_blocks(handle, inode, block,
+					  max_blocks, &map_bh,
+					  EXT4_CREATE_UNINITIALIZED_EXT, 0);
+		WARN_ON(!ret);
+		if (!ret) {
+			ext4_error(inode->i_sb, "ext4_fallocate",
+				   "ext4_ext_get_blocks returned 0! inode#%lu"
+				   ", block=%llu, max_blocks=%llu",
+				   inode->i_ino, block, max_blocks);
+			ret = -EIO;
+			ext4_mark_inode_dirty(handle, inode);
+			ret2 = ext4_journal_stop(handle);
+			break;
+		}
+		if (ret > 0) {
+			/* check wrap through sign-bit/zero here */
+			if ((block + ret) < 0 || (block + ret) < block) {
+				ret = -EIO;
+				ext4_mark_inode_dirty(handle, inode);
+				ret2 = ext4_journal_stop(handle);
+				break;
+			}
+			if (buffer_new(&map_bh) && ((block + ret) >
+			    (EXT4_BLOCK_ALIGN(i_size_read(inode), blkbits)
+			    >> blkbits)))
+				nblocks = nblocks + ret;
+		}
+
+		/* Update ctime if new blocks get allocated */
+		if (nblocks) {
+			struct timespec now;
+			now = current_fs_time(inode->i_sb);
+			if (!timespec_equal(&inode->i_ctime, &now))
+				inode->i_ctime = now;
+		}
+
+		ext4_mark_inode_dirty(handle, inode);
+		ret2 = ext4_journal_stop(handle);
+		if (ret2)
+			break;
+	}
+
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+
+	/*
+	 * Time to update the file size.
+	 * Update only when preallocation was requested beyond the file size.
+	 */
+	if ((offset + len) > i_size_read(inode)) {
+		if (ret > 0) {
+			/*
+			 * if no error, we assume preallocation succeeded
+			 * completely
+			 */
+			mutex_lock(&inode->i_mutex);
+			i_size_write(inode, offset + len);
+			EXT4_I(inode)->i_disksize = i_size_read(inode);
+			mutex_unlock(&inode->i_mutex);
+		} else if (ret < 0 && nblocks) {
+			/* Handle partial allocation scenario */
+			loff_t newsize;
+
+			mutex_lock(&inode->i_mutex);
+			newsize  = (nblocks << blkbits) + i_size_read(inode);
+			i_size_write(inode, EXT4_BLOCK_ALIGN(newsize, blkbits));
+			EXT4_I(inode)->i_disksize = i_size_read(inode);
+			mutex_unlock(&inode->i_mutex);
+		}
+	}
+
+	return ret > 0 ? ret2 : ret;
+}
+
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+	int root_cap, leaf_cap, idx_cap, leaves, level, total;
+
+	/* everything still fits into the root itself */
+	if (blocks <= ext4_ext_space_root(inode))
+		return 0;
+
+	root_cap = ext4_ext_space_root_idx(inode);
+	leaf_cap = ext4_ext_space_block(inode);
+	idx_cap = ext4_ext_space_block_idx(inode);
+
+	/* number of leaf blocks, rounded up */
+	leaves = (blocks + leaf_cap - 1) / leaf_cap;
+	/* the root can point at all of the leaves directly */
+	if (leaves <= root_cap)
+		return leaves;
+
+	/* separate index block(s) must link the leaves: add one index
+	 * level at a time until the level computed next fits the root */
+	total = leaves;
+	level = (leaves + idx_cap - 1) / idx_cap;
+	do {
+		total += level;
+		level = (level + idx_cap - 1) / idx_cap;
+	} while (level > root_cap);
+
+	return total;
+}
Index: linux-2.6.22-rc4-kamikaze1/arch/i386/kernel/syscall_table.S
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/arch/i386/kernel/syscall_table.S
+++ linux-2.6.22-rc4-kamikaze1/arch/i386/kernel/syscall_table.S
@@ -323,2 +323,3 @@ ENTRY(sys_call_table)
 	.long sys_timerfd
 	.long sys_eventfd
+	.long sys_fallocate
Index: linux-2.6.22-rc4-kamikaze1/arch/powerpc/kernel/sys_ppc32.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/arch/powerpc/kernel/sys_ppc32.c
+++ linux-2.6.22-rc4-kamikaze1/arch/powerpc/kernel/sys_ppc32.c
@@ -773,6 +773,18 @@ asmlinkage int compat_sys_truncate64(con
 	return sys_truncate(path, (high << 32) | low);
 }
 
+/*
+ * 32-bit user space passes each 64-bit argument as two 32-bit halves; put
+ * them back together. The halves are widened to u64 before shifting so that
+ * a set top bit is never shifted into the sign bit of a signed type.
+ */
+asmlinkage long compat_sys_fallocate(int fd, int mode, u32 offhi, u32 offlo,
+				     u32 lenhi, u32 lenlo)
+{
+	return sys_fallocate(fd, mode, ((u64)offhi << 32) | offlo,
+			     ((u64)lenhi << 32) | lenlo);
+}
+
 asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long high,
 				 unsigned long low)
 {
Index: linux-2.6.22-rc4-kamikaze1/arch/x86_64/ia32/ia32entry.S
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/arch/x86_64/ia32/ia32entry.S
+++ linux-2.6.22-rc4-kamikaze1/arch/x86_64/ia32/ia32entry.S
@@ -719,4 +719,5 @@ ia32_sys_call_table:
 	.quad compat_sys_signalfd
 	.quad compat_sys_timerfd
 	.quad sys_eventfd
+	.quad sys_fallocate
 ia32_syscall_end:
Index: linux-2.6.22-rc4-kamikaze1/fs/open.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/open.c
+++ linux-2.6.22-rc4-kamikaze1/fs/open.c
@@ -353,6 +353,92 @@ asmlinkage long sys_ftruncate64(unsigned
 #endif
 
 /*
+ * sys_fallocate - preallocate blocks for a file, or free preallocated ones
+ * @fd: descriptor of the file to operate on; must be open for writing
+ * @mode: FA_ALLOCATE to preallocate blocks, FA_DEALLOCATE to free
+ *	  (unallocate) preallocated blocks. Any other value is rejected
+ *	  with -EOPNOTSUPP.
+ * @offset: byte offset within the file at which the (un)allocation starts.
+ *	    Must not be negative.
+ * @len: number of bytes, counted from @offset, to (un)allocate; must be > 0.
+ *
+ * Depending on @mode this call preallocates or unallocates the blocks that
+ * back the byte range given by @offset and @len. Once an FA_ALLOCATE request
+ * has succeeded, later writes into that range should not fail for lack of
+ * space - not even if the file system fills up in the meantime. The
+ * preallocation is persistent, i.e. it survives the file being closed and
+ * reopened as well as a remount or reboot.
+ * The work itself is done by the file system's ->fallocate() inode operation.
+ *
+ * The ->fallocate() implementation of the individual file system is expected
+ * to update the file size and/or ctime/mtime as appropriate for the mode and
+ * for the outcome of the operation.
+ *
+ * Note: In case the file system does not support preallocation,
+ * posix_fallocate() should fall back to the library implementation (i.e.
+ * allocating zero-filled new blocks to the file).
+ *
+ * Return Values
+ *	0	: On SUCCESS a value of zero is returned.
+ *	error	: On Failure, an error code will be returned.
+ * An error code of -ENOSYS or -EOPNOTSUPP should make posix_fallocate()
+ * fall back on library implementation of fallocate.
+ *
+ * <TBD> Generic fallocate to be added for file systems that do not
+ *	 support fallocate.
+ */
+asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
+{
+	struct file *filp;
+	struct inode *inode;
+	long error;
+
+	if (offset < 0 || len <= 0)
+		return -EINVAL;
+
+	/* Return error if mode is not supported */
+	if (mode != FA_ALLOCATE && mode != FA_DEALLOCATE)
+		return -EOPNOTSUPP;
+
+	filp = fget(fd);
+	if (!filp)
+		return -EBADF;
+
+	/* the descriptor has to be open for writing */
+	error = -EBADF;
+	if (!(filp->f_mode & FMODE_WRITE))
+		goto out_fput;
+
+	inode = filp->f_path.dentry->d_inode;
+
+	error = -ESPIPE;
+	if (S_ISFIFO(inode->i_mode))
+		goto out_fput;
+
+	/*
+	 * Let individual file system decide if it supports preallocation
+	 * for directories or not.
+	 */
+	error = -ENODEV;
+	if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+		goto out_fput;
+
+	/* Check for wrap through zero too */
+	error = -EFBIG;
+	if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
+		goto out_fput;
+
+	/* file systems without a ->fallocate() operation yield -ENOSYS */
+	error = -ENOSYS;
+	if (inode->i_op && inode->i_op->fallocate)
+		error = inode->i_op->fallocate(inode, mode, offset, len);
+
+out_fput:
+	fput(filp);
+	return error;
+}
+
+/*
  * access() needs to use the real uid/gid, not the effective uid/gid.
  * We do this by temporarily clearing all FS-related capabilities and
  * switching the fsuid/fsgid around to the real ones.
Index: linux-2.6.22-rc4-kamikaze1/include/asm-i386/unistd.h
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/include/asm-i386/unistd.h
+++ linux-2.6.22-rc4-kamikaze1/include/asm-i386/unistd.h
@@ -329,9 +329,10 @@
 #define __NR_timerfd		322
 #define __NR_eventfd		323
+#define __NR_fallocate		324
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 324
+#define NR_syscalls 325
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
Index: linux-2.6.22-rc4-kamikaze1/include/asm-powerpc/systbl.h
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/include/asm-powerpc/systbl.h
+++ linux-2.6.22-rc4-kamikaze1/include/asm-powerpc/systbl.h
@@ -312,3 +312,4 @@ COMPAT_SYS_SPU(signalfd)
 COMPAT_SYS_SPU(timerfd)
 SYSCALL_SPU(eventfd)
 COMPAT_SYS_SPU(sync_file_range2)
+COMPAT_SYS(fallocate)
Index: linux-2.6.22-rc4-kamikaze1/include/asm-powerpc/unistd.h
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/include/asm-powerpc/unistd.h
+++ linux-2.6.22-rc4-kamikaze1/include/asm-powerpc/unistd.h
@@ -330,10 +330,11 @@
 #define __NR_timerfd		306
 #define __NR_eventfd		307
 #define __NR_sync_file_range2	308
+#define __NR_fallocate		309
 
 #ifdef __KERNEL__
 
-#define __NR_syscalls		309
+#define __NR_syscalls		310
 
 #define __NR__exit __NR_exit
 #define NR_syscalls	__NR_syscalls
Index: linux-2.6.22-rc4-kamikaze1/include/asm-x86_64/unistd.h
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/include/asm-x86_64/unistd.h
+++ linux-2.6.22-rc4-kamikaze1/include/asm-x86_64/unistd.h
@@ -630,6 +630,8 @@ __SYSCALL(__NR_signalfd, sys_signalfd)
 __SYSCALL(__NR_timerfd, sys_timerfd)
 #define __NR_eventfd		283
 __SYSCALL(__NR_eventfd, sys_eventfd)
+#define __NR_fallocate		284
+__SYSCALL(__NR_fallocate, sys_fallocate)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
Index: linux-2.6.22-rc4-kamikaze1/include/linux/fs.h
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/include/linux/fs.h
+++ linux-2.6.22-rc4-kamikaze1/include/linux/fs.h
@@ -266,6 +266,17 @@ extern int dir_notify_enable;
 #define SYNC_FILE_RANGE_WRITE		2
 #define SYNC_FILE_RANGE_WAIT_AFTER	4
 
+/*
+ * sys_fallocate modes
+ * Currently sys_fallocate supports two modes:
+ * FA_ALLOCATE  : This is the preallocate mode, using which an application/user
+ *		  may request (pre)allocation of blocks.
+ * FA_DEALLOCATE: This is the deallocate mode, which can be used to free
+ *		  the preallocated blocks.
+ */
+#define FA_ALLOCATE	0x1
+#define FA_DEALLOCATE	0x2
+
 #ifdef __KERNEL__
 
 #include <linux/linkage.h>
@@ -1138,6 +1149,8 @@ struct inode_operations {
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
 	void (*truncate_range)(struct inode *, loff_t, loff_t);
+	long (*fallocate)(struct inode *inode, int mode, loff_t offset,
+			  loff_t len);
 };
 
 struct seq_file;
Index: linux-2.6.22-rc4-kamikaze1/include/linux/syscalls.h
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/include/linux/syscalls.h
+++ linux-2.6.22-rc4-kamikaze1/include/linux/syscalls.h
@@ -608,6 +608,7 @@ asmlinkage long sys_signalfd(int ufd, si
 asmlinkage long sys_timerfd(int ufd, int clockid, int flags,
 			    const struct itimerspec __user *utmr);
 asmlinkage long sys_eventfd(unsigned int count);
+asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
Index: linux-2.6.22-rc4-kamikaze1/arch/s390/kernel/compat_wrapper.S
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/arch/s390/kernel/compat_wrapper.S
+++ linux-2.6.22-rc4-kamikaze1/arch/s390/kernel/compat_wrapper.S
@@ -1683,6 +1683,16 @@ compat_sys_utimes_wrapper:
 	llgtr	%r3,%r3			# struct compat_timeval *
 	jg	compat_sys_utimes
 
+	.globl  sys_fallocate_wrapper
+sys_fallocate_wrapper:
+	lgfr	%r2,%r2			# int fd
+	lgfr	%r3,%r3			# int mode
+	sllg    %r4,%r4,32		# offset: move high word into upper half
+	lr      %r4,%r5			# offset: low word into lower half
+	sllg    %r5,%r6,32		# len: move high word into upper half
+	l	%r5,164(%r15)		# len: low word, loaded from 164(%r15)
+	jg	sys_fallocate
+
 	.globl	compat_sys_utimensat_wrapper
 compat_sys_utimensat_wrapper:
 	llgfr	%r2,%r2			# unsigned int
Index: linux-2.6.22-rc4-kamikaze1/arch/s390/kernel/sys_s390.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/arch/s390/kernel/sys_s390.c
+++ linux-2.6.22-rc4-kamikaze1/arch/s390/kernel/sys_s390.c
@@ -265,3 +265,24 @@ s390_fadvise64_64(struct fadvise64_64_ar
 		return -EFAULT;
 	return sys_fadvise64_64(a.fd, a.offset, a.len, a.advice);
 }
+
+#ifndef CONFIG_64BIT
+/*
+ * This is a wrapper to call sys_fallocate(). For 31 bit s390 the last
+ * 64 bit argument "len" is split into the upper and lower 32 bits. The
+ * system call wrapper in the user space loads the value to %r6/%r7.
+ * The code in entry.S keeps the values in %r2 - %r6 where they are and
+ * stores %r7 to 96(%r15). But the standard C linkage requires that
+ * the whole 64 bit value for len is stored on the stack and doesn't
+ * use %r6 at all. So s390_fallocate has to convert the arguments from
+ *   %r2: fd, %r3: mode, %r4/%r5: offset, %r6/96(%r15)-99(%r15): len
+ * to
+ *   %r2: fd, %r3: mode, %r4/%r5: offset, 96(%r15)-103(%r15): len
+ */
+asmlinkage long s390_fallocate(int fd, int mode, loff_t offset,
+			       u32 len_high, u32 len_low)
+{
+	/* len_high carries bits 63..32 of len, len_low bits 31..0 */
+	return sys_fallocate(fd, mode, offset, ((u64)len_high << 32) | len_low);
+}
+#endif
Index: linux-2.6.22-rc4-kamikaze1/arch/s390/kernel/syscalls.S
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/arch/s390/kernel/syscalls.S
+++ linux-2.6.22-rc4-kamikaze1/arch/s390/kernel/syscalls.S
@@ -322,6 +322,6 @@ NI_SYSCALL							/* 310 sys_move_pages *
 SYSCALL(sys_getcpu,sys_getcpu,sys_getcpu_wrapper)
 SYSCALL(sys_epoll_pwait,sys_epoll_pwait,compat_sys_epoll_pwait_wrapper)
 SYSCALL(sys_utimes,sys_utimes,compat_sys_utimes_wrapper)
-NI_SYSCALL							/* 314 sys_fallocate */
+SYSCALL(s390_fallocate,sys_fallocate,sys_fallocate_wrapper)
 SYSCALL(sys_utimensat,sys_utimensat,compat_sys_utimensat_wrapper)	/* 315 */
 SYSCALL(sys_signalfd,sys_signalfd,compat_sys_signalfd_wrapper)
Index: linux-2.6.22-rc4-kamikaze1/include/asm-s390/unistd.h
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/include/asm-s390/unistd.h
+++ linux-2.6.22-rc4-kamikaze1/include/asm-s390/unistd.h
@@ -256,7 +256,8 @@
 #define __NR_signalfd		316
 #define __NR_timerfd		317
 #define __NR_eventfd		318
+#define __NR_fallocate		314
 #define NR_syscalls 319
 
 /* 
  * There are some system calls that are not present on 64 bit, some
Index: linux-2.6.22-rc4-kamikaze1/arch/ia64/kernel/entry.S
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/arch/ia64/kernel/entry.S
+++ linux-2.6.22-rc4-kamikaze1/arch/ia64/kernel/entry.S
@@ -1588,5 +1588,6 @@ sys_call_table:
 	data8 sys_signalfd
 	data8 sys_timerfd
 	data8 sys_eventfd
+	data8 sys_fallocate			// 1310
 
 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
Index: linux-2.6.22-rc4-kamikaze1/include/asm-ia64/unistd.h
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/include/asm-ia64/unistd.h
+++ linux-2.6.22-rc4-kamikaze1/include/asm-ia64/unistd.h
@@ -299,11 +299,12 @@
 #define __NR_signalfd			1307
 #define __NR_timerfd			1308
 #define __NR_eventfd			1309
+#define __NR_fallocate			1310
 
 #ifdef __KERNEL__
 
 
-#define NR_syscalls			286 /* length of syscall table */
+#define NR_syscalls			287 /* length of syscall table */
 
 /*
  * The following defines stop scripts/checksyscalls.sh from complaining about
Index: linux-2.6.22-rc4-kamikaze1/fs/ext4/file.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/ext4/file.c
+++ linux-2.6.22-rc4-kamikaze1/fs/ext4/file.c
@@ -35,8 +35,8 @@ static int ext4_release_file (struct ino
 {
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
-			(atomic_read(&inode->i_writecount) == 1))
-	{
+			(atomic_read(&inode->i_writecount) == 1) &&
+			EXT4_I(inode)->i_blocks_reserved == 0) {
 		mutex_lock(&EXT4_I(inode)->truncate_mutex);
 		ext4_discard_reservation(inode);
 		mutex_unlock(&EXT4_I(inode)->truncate_mutex);
@@ -135,5 +135,6 @@ const struct inode_operations ext4_file_
 	.removexattr	= generic_removexattr,
 #endif
 	.permission	= ext4_permission,
+	.fallocate	= ext4_fallocate,
 };
 
Index: linux-2.6.22-rc4-kamikaze1/include/linux/ext4_fs_extents.h
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/include/linux/ext4_fs_extents.h
+++ linux-2.6.22-rc4-kamikaze1/include/linux/ext4_fs_extents.h
@@ -188,12 +188,28 @@ ext4_ext_invalidate_cache(struct inode *
 	EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO;
 }
 
+static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) {
+	ext->ee_len |= cpu_to_le16(0x8000);	/* top bit of ee_len is the flag */
+}
+
+static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext) {
+	return (int)(le16_to_cpu((ext)->ee_len) & 0x8000);	/* non-zero if flagged */
+}
+
+static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) {
+	return (int)(le16_to_cpu((ext)->ee_len) & 0x7FFF);	/* ee_len minus the flag bit */
+}
+
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *);
+extern int ext4_ext_try_to_merge(struct inode *inode,
+				 struct ext4_ext_path *path,
+				 struct ext4_extent *);
 extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
 extern int ext4_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *);
 extern struct ext4_ext_path * ext4_ext_find_extent(struct inode *, int, struct ext4_ext_path *);
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
 
 #endif /* _LINUX_EXT4_EXTENTS */
 
Index: linux-2.6.22-rc4-kamikaze1/fs/ext4/ialloc.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/ext4/ialloc.c
+++ linux-2.6.22-rc4-kamikaze1/fs/ext4/ialloc.c
@@ -563,7 +563,8 @@ got:
 	inode->i_ino = ino;
 	/* This is the optimal IO size (for stat), not the fs block size */
 	inode->i_blocks = 0;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
+						       ext4_current_time(inode);
 
 	memset(ei->i_data, 0, sizeof(ei->i_data));
 	ei->i_dir_start_lookup = 0;
@@ -595,9 +596,8 @@ got:
 	spin_unlock(&sbi->s_next_gen_lock);
 
 	ei->i_state = EXT4_STATE_NEW;
-	ei->i_extra_isize =
-		(EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) ?
-		sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE : 0;
+
+	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
 
 	ret = inode;
 	if(DQUOT_ALLOC_INODE(inode)) {
Index: linux-2.6.22-rc4-kamikaze1/fs/ext4/namei.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/ext4/namei.c
+++ linux-2.6.22-rc4-kamikaze1/fs/ext4/namei.c
@@ -1285,7 +1285,7 @@ static int add_dirent_to_buf(handle_t *h
 	 * happen is that the times are slightly out of date
 	 * and/or different from the directory change time.
 	 */
-	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
 	ext4_update_dx_flag(dir);
 	dir->i_version++;
 	ext4_mark_inode_dirty(handle, dir);
@@ -1619,6 +1619,27 @@ static int ext4_delete_entry (handle_t *
 	return -ENOENT;
 }
 
+/* inc i_nlink; an indexed dir at the 16-bit i_links_count limit drops to 1 */
+static inline void ext4_inc_count(handle_t *handle, struct inode *inode)
+{
+	inc_nlink(inode);
+	if (!is_dx(inode) || inode->i_nlink <= 1)
+		return;
+	if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
+		inode->i_nlink = 1;
+		EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
+					      EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
+	}
+}
+
+/* dec i_nlink, but put a directory back to 1 if that took it to zero */
+static inline void ext4_dec_count(handle_t *handle, struct inode *inode)
+{
+	drop_nlink(inode);
+	if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode))
+		inc_nlink(inode);
+}
+
 static int ext4_add_nondir(handle_t *handle,
 		struct dentry *dentry, struct inode *inode)
 {
@@ -1715,7 +1736,7 @@ static int ext4_mkdir(struct inode * dir
 	struct ext4_dir_entry_2 * de;
 	int err, retries = 0;
 
-	if (dir->i_nlink >= EXT4_LINK_MAX)
+	if (EXT4_DIR_LINK_MAX(dir))
 		return -EMLINK;
 
 retry:
@@ -1738,7 +1759,7 @@ retry:
 	inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
 	dir_block = ext4_bread (handle, inode, 0, 1, &err);
 	if (!dir_block) {
-		drop_nlink(inode); /* is this nlink == 0? */
+		ext4_dec_count(handle, inode); /* is this nlink == 0? */
 		ext4_mark_inode_dirty(handle, inode);
 		iput (inode);
 		goto out_stop;
@@ -1770,7 +1791,7 @@ retry:
 		iput (inode);
 		goto out_stop;
 	}
-	inc_nlink(dir);
+	ext4_inc_count(handle, dir);
 	ext4_update_dx_flag(dir);
 	ext4_mark_inode_dirty(handle, dir);
 	d_instantiate(dentry, inode);
@@ -2035,10 +2056,10 @@ static int ext4_rmdir (struct inode * di
 	retval = ext4_delete_entry(handle, dir, de, bh);
 	if (retval)
 		goto end_rmdir;
-	if (inode->i_nlink != 2)
-		ext4_warning (inode->i_sb, "ext4_rmdir",
-			      "empty directory has nlink!=2 (%d)",
-			      inode->i_nlink);
+	if (!EXT4_DIR_LINK_EMPTY(inode))
+		ext4_warning(inode->i_sb, "ext4_rmdir",
+			     "empty directory has too many links (%d)",
+			     inode->i_nlink);
 	inode->i_version++;
 	clear_nlink(inode);
 	/* There's no need to set i_disksize: the fact that i_nlink is
@@ -2046,9 +2067,9 @@ static int ext4_rmdir (struct inode * di
 	 * recovery. */
 	inode->i_size = 0;
 	ext4_orphan_add(handle, inode);
-	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
-	drop_nlink(dir);
+	ext4_dec_count(handle, dir);
 	ext4_update_dx_flag(dir);
 	ext4_mark_inode_dirty(handle, dir);
 
@@ -2096,13 +2117,13 @@ static int ext4_unlink(struct inode * di
 	retval = ext4_delete_entry(handle, dir, de, bh);
 	if (retval)
 		goto end_unlink;
-	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+	dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
 	ext4_update_dx_flag(dir);
 	ext4_mark_inode_dirty(handle, dir);
-	drop_nlink(inode);
+	ext4_dec_count(handle, inode);
 	if (!inode->i_nlink)
 		ext4_orphan_add(handle, inode);
-	inode->i_ctime = dir->i_ctime;
+	inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 	retval = 0;
 
@@ -2149,7 +2170,7 @@ retry:
 		err = __page_symlink(inode, symname, l,
 				mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
 		if (err) {
-			drop_nlink(inode);
+			ext4_dec_count(handle, inode);
 			ext4_mark_inode_dirty(handle, inode);
 			iput (inode);
 			goto out_stop;
@@ -2175,8 +2196,9 @@ static int ext4_link (struct dentry * ol
 	struct inode *inode = old_dentry->d_inode;
 	int err, retries = 0;
 
-	if (inode->i_nlink >= EXT4_LINK_MAX)
+	if (EXT4_DIR_LINK_MAX(inode))
 		return -EMLINK;
+
 	/*
 	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
 	 * otherwise has the potential to corrupt the orphan inode list.
@@ -2193,8 +2215,8 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode->i_ctime = CURRENT_TIME_SEC;
-	inc_nlink(inode);
+	inode->i_ctime = ext4_current_time(inode);
+	ext4_inc_count(handle, inode);
 	atomic_inc(&inode->i_count);
 
 	err = ext4_add_nondir(handle, dentry, inode);
@@ -2295,7 +2317,7 @@ static int ext4_rename (struct inode * o
 	 * Like most other Unix systems, set the ctime for inodes on a
 	 * rename.
 	 */
-	old_inode->i_ctime = CURRENT_TIME_SEC;
+	old_inode->i_ctime = ext4_current_time(old_inode);
 	ext4_mark_inode_dirty(handle, old_inode);
 
 	/*
@@ -2327,10 +2349,10 @@ static int ext4_rename (struct inode * o
 	}
 
 	if (new_inode) {
-		drop_nlink(new_inode);
-		new_inode->i_ctime = CURRENT_TIME_SEC;
+		ext4_dec_count(handle, new_inode);
+		new_inode->i_ctime = ext4_current_time(new_inode);
 	}
-	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+	old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
 	ext4_update_dx_flag(old_dir);
 	if (dir_bh) {
 		BUFFER_TRACE(dir_bh, "get_write_access");
@@ -2338,11 +2360,13 @@ static int ext4_rename (struct inode * o
 		PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
 		BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata");
 		ext4_journal_dirty_metadata(handle, dir_bh);
-		drop_nlink(old_dir);
+		ext4_dec_count(handle, old_dir);
 		if (new_inode) {
-			drop_nlink(new_inode);
+			/* checked empty_dir above, can't have another parent,
+			 * ext4_dec_count() won't work for many-linked dirs */
+			new_inode->i_nlink = 0;
 		} else {
-			inc_nlink(new_dir);
+			ext4_inc_count(handle, new_dir);
 			ext4_update_dx_flag(new_dir);
 			ext4_mark_inode_dirty(handle, new_dir);
 		}
Index: linux-2.6.22-rc4-kamikaze1/fs/ext4/xattr.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/ext4/xattr.c
+++ linux-2.6.22-rc4-kamikaze1/fs/ext4/xattr.c
@@ -66,13 +66,6 @@
 #define BFIRST(bh) ENTRY(BHDR(bh)+1)
 #define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
 
-#define IHDR(inode, raw_inode) \
-	((struct ext4_xattr_ibody_header *) \
-		((void *)raw_inode + \
-		 EXT4_GOOD_OLD_INODE_SIZE + \
-		 EXT4_I(inode)->i_extra_isize))
-#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
-
 #ifdef EXT4_XATTR_DEBUG
 # define ea_idebug(inode, f...) do { \
 		printk(KERN_DEBUG "inode %s:%lu: ", \
@@ -508,6 +501,20 @@ out:
 	return;
 }
 
+/* Add each entry's EXT4_XATTR_LEN() to *total, lower *min_offs to the smallest
+ * in-place value offset, and return the gap left after the entry table. */
+static inline size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
+				    size_t *min_offs, void *base, int *total)
+{
+	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+		*total += EXT4_XATTR_LEN(last->e_name_len);
+		if (!last->e_value_block && last->e_value_size &&
+		    le16_to_cpu(last->e_value_offs) < *min_offs)
+			*min_offs = le16_to_cpu(last->e_value_offs);
+	}
+	return (*min_offs - ((void *)last - base) - sizeof(__u32));
+}
+
 struct ext4_xattr_info {
 	int name_index;
 	const char *name;
@@ -606,6 +613,7 @@ ext4_xattr_set_entry(struct ext4_xattr_i
 			memmove(s->here, (void *)s->here + size,
 				(void *)last - (void *)s->here + sizeof(__u32));
 			memset(last, 0, size);
+
 		}
 	}
 
@@ -1013,7 +1021,9 @@ ext4_xattr_set_handle(handle_t *handle, 
 	}
 	if (!error) {
 		ext4_xattr_update_super_block(handle, inode->i_sb);
-		inode->i_ctime = CURRENT_TIME_SEC;
+		inode->i_ctime = ext4_current_time(inode);
+		if(!value)
+			EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
 		error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
 		/*
 		 * The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1066,6 +1076,270 @@ retry:
 	return error;
 }
 
+static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
+				     int value_offs_shift, void *to,
+				     void *from, size_t n, int blocksize)
+{
+	struct ext4_xattr_entry *last = entry;
+	int new_offs;
+
+	/* Adjust the value offsets of the entries */
+	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+		if (!last->e_value_block && last->e_value_size) {
+			new_offs = le16_to_cpu(last->e_value_offs) +
+							value_offs_shift;
+			BUG_ON(new_offs + le32_to_cpu(last->e_value_size) > blocksize);
+			last->e_value_offs = cpu_to_le16(new_offs);
+		}
+	}
+	/* Shift the entries by n bytes */
+	memmove(to, from, n);
+}
+
+/*
+ * ext4_expand_extra_isize_ea()
+ *
+ * Expand the extra fields area of an inode to new_extra_isize bytes when
+ * in-inode EAs are present.  If the in-inode EA area has enough free space
+ * the entries are shifted ahead in place; otherwise entries are pushed into
+ * the EA block one at a time until enough room has been made.  If
+ * new_extra_isize cannot be satisfied, the superblock's s_min_extra_isize
+ * (when non-zero) is tried once instead.
+ *
+ * Holds xattr_sem for writing throughout.
+ * Returns 0 on success or negative error number on failure.
+ */
+int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+			    struct ext4_inode *raw_inode, handle_t *handle)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry, *last, *first;
+	struct buffer_head *bh = NULL;
+	struct ext4_xattr_ibody_find *is = NULL;
+	struct ext4_xattr_block_find *bs = NULL;
+	char *buffer = NULL, *b_entry_name = NULL;
+	size_t min_offs, free;
+	int total_ino, total_blk;
+	void *base, *start, *end;
+	int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
+	/* on-disk superblock field, so convert from little-endian */
+	int s_min_extra_isize =
+		le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
+
+	down_write(&EXT4_I(inode)->xattr_sem);
+retry:
+	if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) {
+		error = 0;
+		goto cleanup;
+	}
+
+	header = IHDR(inode, raw_inode);
+	entry = IFIRST(header);
+
+	/*
+	 * Check if enough free space is available in the inode to shift the
+	 * entries ahead by new_extra_isize.
+	 */
+
+	base = start = entry;
+	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+	min_offs = end - base;
+	last = entry;
+	total_ino = sizeof(struct ext4_xattr_ibody_header);
+
+	free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
+	if (free >= new_extra_isize) {
+		entry = IFIRST(header);
+		ext4_xattr_shift_entries(entry,	EXT4_I(inode)->i_extra_isize
+				- new_extra_isize, (void *)raw_inode +
+				EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
+				(void *)header, total_ino,
+				inode->i_sb->s_blocksize);
+		EXT4_I(inode)->i_extra_isize = new_extra_isize;
+		error = 0;
+		goto cleanup;
+	}
+
+	/*
+	 * Enough free space isn't available in the inode, check if
+	 * EA block can hold new_extra_isize bytes.
+	 */
+	if (EXT4_I(inode)->i_file_acl) {
+		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+		error = -EIO;
+		if (!bh)
+			goto cleanup;
+		if (ext4_xattr_check_block(bh)) {
+			ext4_error(inode->i_sb, __FUNCTION__,
+				"inode %lu: bad block %llu", inode->i_ino,
+				EXT4_I(inode)->i_file_acl);
+			error = -EIO;
+			goto cleanup;
+		}
+		base = BHDR(bh);
+		first = BFIRST(bh);
+		end = bh->b_data + bh->b_size;
+		min_offs = end - base;
+		/* the helper accumulates into *total, so start from zero */
+		total_blk = 0;
+		free = ext4_xattr_free_space(first, &min_offs, base,
+					     &total_blk);
+		if (free < new_extra_isize) {
+			if (!tried_min_extra_isize && s_min_extra_isize) {
+				tried_min_extra_isize++;
+				new_extra_isize = s_min_extra_isize;
+				/* retry reads the EA block again */
+				brelse(bh);
+				bh = NULL;
+				goto retry;
+			}
+			error = -1;
+			goto cleanup;
+		}
+	} else {
+		free = inode->i_sb->s_blocksize;
+	}
+
+	while (new_extra_isize > 0) {
+		size_t offs, size, entry_size;
+		struct ext4_xattr_entry *small_entry = NULL;
+		struct ext4_xattr_info i = {
+			.value = NULL,
+			.value_len = 0,
+		};
+		unsigned int total_size, shift_bytes, temp = ~0U;
+
+		is = kzalloc(sizeof(*is), GFP_KERNEL);
+		bs = kzalloc(sizeof(*bs), GFP_KERNEL);
+		if (!is || !bs) {
+			error = -ENOMEM;
+			goto cleanup;
+		}
+		is->s.not_found = bs->s.not_found = -ENODATA;
+
+		last = IFIRST(header);
+		/* Find the entry best suited to be pushed into EA block */
+		entry = NULL;
+		for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+			total_size = EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) +
+					EXT4_XATTR_LEN(last->e_name_len);
+			if (total_size <= free && total_size < temp) {
+				if (total_size < new_extra_isize) {
+					small_entry = last;
+				} else {
+					entry = last;
+					temp = total_size;
+				}
+			}
+		}
+
+		if (entry == NULL) {
+			if (small_entry) {
+				entry = small_entry;
+			} else {
+				if (!tried_min_extra_isize &&
+				    s_min_extra_isize) {
+					tried_min_extra_isize++;
+					new_extra_isize = s_min_extra_isize;
+					/* retry re-acquires all of these */
+					kfree(is);
+					is = NULL;
+					kfree(bs);
+					bs = NULL;
+					brelse(bh);
+					bh = NULL;
+					goto retry;
+				}
+				error = -1;
+				goto cleanup;
+			}
+		}
+		offs = le16_to_cpu(entry->e_value_offs);
+		size = le32_to_cpu(entry->e_value_size);
+		entry_size = EXT4_XATTR_LEN(entry->e_name_len);
+		i.name_index = entry->e_name_index;
+		buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_KERNEL);
+		b_entry_name = kmalloc(entry->e_name_len + 1, GFP_KERNEL);
+		if (!buffer || !b_entry_name) {
+			error = -ENOMEM;
+			goto cleanup;
+		}
+		/* Save the entry name and the entry value */
+		memcpy(buffer, (void *)IFIRST(header) + offs,
+		       EXT4_XATTR_SIZE(size));
+		memcpy(b_entry_name, entry->e_name, entry->e_name_len);
+		b_entry_name[entry->e_name_len] = '\0';
+		i.name = b_entry_name;
+
+		error = ext4_get_inode_loc(inode, &is->iloc);
+		if (error)
+			goto cleanup;
+
+		error = ext4_xattr_ibody_find(inode, &i, is);
+		if (error)
+			goto cleanup;
+
+		/* Remove the chosen entry from the inode */
+		error = ext4_xattr_ibody_set(handle, inode, &i, is);
+		if (error)
+			goto cleanup;
+
+		entry = IFIRST(header);
+		if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
+			shift_bytes = new_extra_isize;
+		else
+			shift_bytes = entry_size + size;
+		/* Adjust the offsets and shift the remaining entries ahead */
+		ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize -
+			shift_bytes, (void *)raw_inode +
+			EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes,
+			(void *)header, total_ino - entry_size,
+			inode->i_sb->s_blocksize);
+
+		extra_isize += shift_bytes;
+		new_extra_isize -= shift_bytes;
+		EXT4_I(inode)->i_extra_isize = extra_isize;
+
+		i.name = b_entry_name;
+		i.value = buffer;
+		/* size is already in host order (le32_to_cpu above) */
+		i.value_len = size;
+		error = ext4_xattr_block_find(inode, &i, bs);
+		if (error)
+			goto cleanup;
+
+		/* Add entry which was removed from the inode into the block */
+		error = ext4_xattr_block_set(handle, inode, &i, bs);
+		if (error)
+			goto cleanup;
+
+		/* Release this pass's resources; the next pass allocates anew */
+		kfree(b_entry_name);
+		b_entry_name = NULL;
+		kfree(buffer);
+		buffer = NULL;
+		brelse(is->iloc.bh);
+		kfree(is);
+		is = NULL;
+		brelse(bs->bh);
+		kfree(bs);
+		bs = NULL;
+	}
+
+cleanup:
+	kfree(b_entry_name);
+	kfree(buffer);
+	if (is)
+		brelse(is->iloc.bh);
+	kfree(is);
+	if (bs)
+		brelse(bs->bh);
+	kfree(bs);
+	brelse(bh);
+	up_write(&EXT4_I(inode)->xattr_sem);
+	return error;
+}
+
 /*
  * ext4_xattr_delete_inode()
  *
Index: linux-2.6.22-rc4-kamikaze1/include/linux/ext4_fs_i.h
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/include/linux/ext4_fs_i.h
+++ linux-2.6.22-rc4-kamikaze1/include/linux/ext4_fs_i.h
@@ -153,6 +153,13 @@ struct ext4_inode_info {
 
 	unsigned long i_ext_generation;
 	struct ext4_ext_cache i_cached_extent;
+	struct timespec i_crtime;
+	__u64 i_fs_version;
+
+	__u32 i_blocks_reserved;
+	__u32 i_md_reserved;
+	spinlock_t i_wb_reserved_lock;  /* to protect i_md_reserved */
+	atomic_t i_wb_writers;
 };
 
 #endif	/* _LINUX_EXT4_FS_I */
Index: linux-2.6.22-rc4-kamikaze1/include/linux/ext4_fs_sb.h
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/include/linux/ext4_fs_sb.h
+++ linux-2.6.22-rc4-kamikaze1/include/linux/ext4_fs_sb.h
@@ -24,6 +24,8 @@
 #endif
 #include <linux/rbtree.h>
 
+struct ext4_reservation_slot;
+
 /*
  * third extended-fs super-block data in memory
  */
@@ -65,6 +67,9 @@ struct ext4_sb_info {
 	struct rb_root s_rsv_window_root;
 	struct ext4_reserve_window_node s_rsv_window_head;
 
+	/* global reservation structures */
+	struct ext4_reservation_slot *s_reservation_slots;
+
 	/* Journaling */
 	struct inode * s_journal_inode;
 	struct journal_s * s_journal;
@@ -79,6 +84,7 @@ struct ext4_sb_info {
 	char *s_qf_names[MAXQUOTAS];		/* Names of quota files with journalled quota */
 	int s_jquota_fmt;			/* Format of quota to use */
 #endif
+	unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
 
 #ifdef EXTENTS_STATS
 	/* ext4 extents stats */
@@ -89,6 +95,17 @@ struct ext4_sb_info {
 	unsigned long s_ext_blocks;
 	unsigned long s_ext_extents;
 #endif
+
+	atomic_t s_wb_congested;
+	atomic_t s_wb_single_pages;
+	atomic_t s_wb_collisions_sp;
+	atomic_t s_wb_allocated;
+	atomic_t s_wb_reqs;
+	atomic_t s_wb_nr_to_write;
+	atomic_t s_wb_collisions;
+	atomic_t s_wb_blocks;
+	atomic_t s_wb_extents;
+	atomic_t s_wb_dropped;
 };
 
 #endif	/* _LINUX_EXT4_FS_SB */
Index: linux-2.6.22-rc4-kamikaze1/fs/ext4/xattr.h
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/ext4/xattr.h
+++ linux-2.6.22-rc4-kamikaze1/fs/ext4/xattr.h
@@ -56,6 +56,13 @@ struct ext4_xattr_entry {
 #define EXT4_XATTR_SIZE(size) \
 	(((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
 
+/* in-inode xattr header: follows the good-old inode plus i_extra_isize */
+#define IHDR(inode, raw_inode) \
+	((struct ext4_xattr_ibody_header *) \
+		((void *)(raw_inode) + EXT4_GOOD_OLD_INODE_SIZE + \
+		 EXT4_I(inode)->i_extra_isize))
+#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
+
 # ifdef CONFIG_EXT4DEV_FS_XATTR
 
 extern struct xattr_handler ext4_xattr_user_handler;
@@ -74,6 +81,9 @@ extern int ext4_xattr_set_handle(handle_
 extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
 extern void ext4_xattr_put_super(struct super_block *);
 
+extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+			    struct ext4_inode *raw_inode, handle_t *handle);
+
 extern int init_ext4_xattr(void);
 extern void exit_ext4_xattr(void);
 
@@ -129,6 +139,13 @@ exit_ext4_xattr(void)
 {
 }
 
+static inline int
+ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+			    struct ext4_inode *raw_inode, handle_t *handle)
+{
+	return -EOPNOTSUPP;	/* stub for builds without CONFIG_EXT4DEV_FS_XATTR */
+}
+
 #define ext4_xattr_handlers	NULL
 
 # endif  /* CONFIG_EXT4DEV_FS_XATTR */
Index: linux-2.6.22-rc4-kamikaze1/include/linux/page-flags.h
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/include/linux/page-flags.h
+++ linux-2.6.22-rc4-kamikaze1/include/linux/page-flags.h
@@ -89,6 +89,7 @@
 #define PG_mappedtodisk		16	/* Has blocks allocated on-disk */
 #define PG_reclaim		17	/* To be reclaimed asap */
 #define PG_buddy		19	/* Page is free, on buddy lists */
+#define PG_booked		20	/* Has blocks reserved on-disk */
 
 /* PG_owner_priv_1 users should have descriptive aliases */
 #define PG_checked		PG_owner_priv_1 /* Used by some filesystems */
@@ -221,6 +222,10 @@ static inline void SetPageUptodate(struc
 #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags)
 #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags)
 
+#define PageBooked(page)	test_bit(PG_booked, &(page)->flags)
+#define SetPageBooked(page)	set_bit(PG_booked, &(page)->flags)
+#define ClearPageBooked(page)	clear_bit(PG_booked, &(page)->flags)
+
 #define PageReclaim(page)	test_bit(PG_reclaim, &(page)->flags)
 #define SetPageReclaim(page)	set_bit(PG_reclaim, &(page)->flags)
 #define ClearPageReclaim(page)	clear_bit(PG_reclaim, &(page)->flags)
Index: linux-2.6.22-rc4-kamikaze1/fs/ext4/balloc.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/ext4/balloc.c
+++ linux-2.6.22-rc4-kamikaze1/fs/ext4/balloc.c
@@ -630,8 +630,10 @@ void ext4_free_blocks(handle_t *handle, 
 		return;
 	}
 	ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
-	if (dquot_freed_blocks)
+	if (dquot_freed_blocks) {
+		ext4_release_blocks(sb, dquot_freed_blocks);
 		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+	}
 	return;
 }
 
@@ -1440,7 +1442,7 @@ ext4_fsblk_t ext4_new_blocks(handle_t *h
 	struct ext4_sb_info *sbi;
 	struct ext4_reserve_window_node *my_rsv = NULL;
 	struct ext4_block_alloc_info *block_i;
-	unsigned short windowsz = 0;
+	unsigned short windowsz = 0, reserved = 0;
 #ifdef EXT4FS_DEBUG
 	static int goal_hits, goal_attempts;
 #endif
@@ -1462,6 +1464,13 @@ ext4_fsblk_t ext4_new_blocks(handle_t *h
 		return 0;
 	}
 
+	if (!(EXT4_I(inode)->i_state & EXT4_STATE_BLOCKS_RESERVED)) {
+		*errp = ext4_reserve_blocks(sb, num);
+		if (*errp)
+			return 0;
+		reserved = num;
+	}
+
 	sbi = EXT4_SB(sb);
 	es = EXT4_SB(sb)->s_es;
 	ext4_debug("goal=%lu.\n", goal);
@@ -1674,8 +1683,11 @@ out:
 	/*
 	 * Undo the block allocation
 	 */
-	if (!performed_allocation)
+	if (!performed_allocation) {
 		DQUOT_FREE_BLOCK(inode, *count);
+		if (reserved)
+			ext4_release_blocks(sb, reserved);
+	}
 	brelse(bitmap_bh);
 	return 0;
 }
@@ -1834,3 +1846,161 @@ unsigned long ext4_bg_num_gdb(struct sup
 	return ext4_bg_num_gdb_meta(sb,group);
 
 }
+
+/*
+ * Routines to reserve blocks.  We need this for delayed
+ * allocation, otherwise we could meet -ENOSPC at flush
+ * time.
+ */
+
+/*
+ * ->commit_write(), where not-yet-allocated blocks get
+ * reserved, is a well-known hot path, so reservation has
+ * to scale and avoid global data as much as possible.
+ *
+ * Hence the per-sb array below: one reservation slot per
+ * CPU, each with its own lock.
+ */
+
+struct ext4_reservation_slot {
+	__u64		rs_reserved;	/* blocks this slot can still hand out */
+	spinlock_t	rs_lock;	/* taken around every rs_reserved update */
+} ____cacheline_aligned;
+
+/* Try to take @blocks from the current CPU's slot; -ENOSPC if it is short. */
+int ext4_reserve_local(struct super_block *sb, int blocks)
+{
+	struct ext4_reservation_slot *slot;
+	int err;
+
+	preempt_disable();
+	slot = EXT4_SB(sb)->s_reservation_slots + smp_processor_id();
+
+	spin_lock(&slot->rs_lock);
+	if (unlikely(slot->rs_reserved < blocks)) {
+		err = -ENOSPC;
+	} else {
+		slot->rs_reserved -= blocks;
+		err = 0;
+	}
+	spin_unlock(&slot->rs_lock);
+	preempt_enable();
+	return err;
+}
+
+/* Spread @free over the slots in use; ext4_reserve_global() holds all locks. */
+void ext4_rebalance_reservation(struct ext4_reservation_slot *rs, __u64 free)
+{
+	int i, used_slots = 0;
+	__u64 chunk;
+
+	/* let's know what slots have been used */
+	for (i = 0; i < NR_CPUS; i++)
+		if (rs[i].rs_reserved || i == smp_processor_id())
+			used_slots++;
+
+	/* chunk is the number of blocks every used slot will get,
+	 * rounded up so it isn't 0 while there are blocks to hand out */
+	chunk = free + used_slots - 1;
+	do_div(chunk, used_slots);
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (free < chunk)
+			chunk = free;
+		if (rs[i].rs_reserved || i == smp_processor_id()) {
+			rs[i].rs_reserved = chunk;
+			/* chunk <= free here, so this cannot wrap */
+			free -= chunk;
+		}
+	}
+	BUG_ON(free);
+}
+
+/* Slow path of ext4_reserve_blocks(): sum every slot under its lock, take
+ * @blocks from the total and redistribute the remainder over the slots.
+ * Returns 0 on success or -ENOSPC when the total is short. */
+int ext4_reserve_global(struct super_block *sb, int blocks)
+{
+	struct ext4_reservation_slot *rs = EXT4_SB(sb)->s_reservation_slots;
+	int i, rc = -ENOSPC;	/* same errno as the per-cpu fast path */
+	__u64 free = 0;
+
+	/* lock all slots */
+	for (i = 0; i < NR_CPUS; i++) {
+		spin_lock(&rs[i].rs_lock);
+		free += rs[i].rs_reserved;
+	}
+
+	if (free >= blocks) {
+		free -= blocks;
+		ext4_rebalance_reservation(rs, free);
+		rc = 0;
+	}
+
+	for (i = 0; i < NR_CPUS; i++)
+		spin_unlock(&rs[i].rs_lock);
+
+	return rc;
+}
+
+/* Reserve @blocks free blocks.  Tries the current CPU's slot first and
+ * falls back to ext4_reserve_global() when that slot cannot cover it.
+ * Returns 0 on success or the error from ext4_reserve_global(). */
+int ext4_reserve_blocks(struct super_block *sb, int blocks)
+{
+	BUG_ON(blocks <= 0);
+
+	if (likely(ext4_reserve_local(sb, blocks) == 0))
+		return 0;
+
+	return ext4_reserve_global(sb, blocks);
+}
+
+/* Give @blocks back to the current CPU's reservation slot. */
+void ext4_release_blocks(struct super_block *sb, int blocks)
+{
+	struct ext4_reservation_slot *slot;
+
+	BUG_ON(blocks <= 0);
+
+	preempt_disable();
+	slot = EXT4_SB(sb)->s_reservation_slots + smp_processor_id();
+
+	spin_lock(&slot->rs_lock);
+	slot->rs_reserved += blocks;
+	spin_unlock(&slot->rs_lock);
+
+	preempt_enable();
+}
+
+/* Set up NR_CPUS slots; slot 0 starts with the s_freeblocks_counter sum. */
+int ext4_reserve_init(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_reservation_slot *slots, *slot;
+
+	slots = kmalloc(sizeof(struct ext4_reservation_slot) * NR_CPUS, GFP_KERNEL);
+	if (slots == NULL)
+		return -ENOMEM;
+	sbi->s_reservation_slots = slots;
+
+	for (slot = slots; slot < slots + NR_CPUS; slot++) {
+		spin_lock_init(&slot->rs_lock);
+		slot->rs_reserved = 0;
+	}
+	slots[0].rs_reserved = percpu_counter_sum(&sbi->s_freeblocks_counter);
+
+	return 0;
+}
+
+/* Free the per-sb reservation slots set up by ext4_reserve_init(). */
+void ext4_reserve_release(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	/* the array is only NULL before init or after a previous release */
+	BUG_ON(sbi->s_reservation_slots == NULL);
+	kfree(sbi->s_reservation_slots);
+	sbi->s_reservation_slots = NULL;
+}
+
Index: linux-2.6.22-rc4-kamikaze1/fs/ext4/Makefile
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/ext4/Makefile
+++ linux-2.6.22-rc4-kamikaze1/fs/ext4/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
 
 ext4dev-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
 		   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		   ext4_jbd2.o
+		   ext4_jbd2.o writeback.o
 
 ext4dev-$(CONFIG_EXT4DEV_FS_XATTR)	+= xattr.o xattr_user.o xattr_trusted.o
 ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL)	+= acl.o
Index: linux-2.6.22-rc4-kamikaze1/fs/ext4/writeback.c
===================================================================
--- /dev/null
+++ linux-2.6.22-rc4-kamikaze1/fs/ext4/writeback.c
@@ -0,0 +1,1187 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+ * TODO:
+ *   MUST:
+ *     - flush dirty pages in -ENOSPC case in order to free reserved blocks
+ *     - direct I/O support
+ *     - blocksize != PAGE_CACHE_SIZE support
+ *     - store last unwritten page in ext4_wb_writepages() and
+ *       continue from it in a next run
+ *   WISH:
+ *     - should ext4_wb_writepage() try to flush neighbours?
+ *     - ext4_wb_block_truncate_page() must flush partial truncated pages
+ *     - reservation can be done per write-request in ext4_file_write()
+ *       rather than per-page in ext4_wb_commit_write() -- it's quite
+ *       expensive to recalculate amount of required metadata for every page
+ *     - re-allocation to improve layout
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/time.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/jbd.h>
+#include <linux/ext4_fs_extents.h>
+#include <linux/smp_lock.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/mpage.h>
+#include <linux/pagevec.h>
+#include <linux/backing-dev.h>
+#include <linux/spinlock.h>
+
+/*
+ * If EXT4_WB_STATS is defined, then some stats are collected.
+ * They are printed at umount time.
+ */
+#define EXT4_WB_STATS
+
+
+/*
+ * With EXT4_WB_SKIP_SMALL defined the patch will try to avoid
+ * small I/Os ignoring ->writepages() if mapping hasn't enough
+ * contig. dirty pages
+ */
+#define EXT4_WB_SKIP_SMALL__
+
+#define WB_ASSERT(__x__) do { if (!(__x__)) BUG(); } while (0)
+/* rename WB_DEBUG__ to WB_DEBUG for wb_debug() output; both macros are statement-safe */
+#define WB_DEBUG__
+#ifdef WB_DEBUG
+#define wb_debug(fmt,a...)	printk(fmt, ##a)
+#else
+#define wb_debug(fmt,a...)	do { } while (0)
+#endif
+
+#define WB_MAX_PAGES_PER_EXTENT	32768
+
+#define WB_PAGES_PER_ARRAY	60
+
+struct ext4_wb_pages {
+	struct list_head list;	/* link in ext4_wb_control.list */
+	struct page *pages[WB_PAGES_PER_ARRAY];	/* pages queued for flushing */
+	unsigned short num, start;	/* slots filled / next slot to pull */
+};
+
+struct ext4_wb_control {
+	pgoff_t	start;	/* index of the first page of the current run */
+	int len, extents;	/* pages in the run / extents submitted for it */
+	int blocks_to_release;	/* reserved blocks now allocated, not yet released */
+	struct ext4_wb_pages *pages;	/* array currently being filled or drained */
+	struct list_head list;	/* all ext4_wb_pages arrays of the run */
+	struct address_space *mapping;	/* mapping the pages belong to */
+};
+
+
+void ext4_wb_invalidatepage(struct page *, unsigned long);
+int ext4_get_block(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create);
+
+
+static struct page * ext4_wb_pull_page(struct ext4_wb_control *wc)
+{
+	struct ext4_wb_pages *wp = wc->pages;
+	/* hand out the next queued page; NULL once all arrays are drained */
+	BUG_ON(wp == NULL);
+	BUG_ON(list_empty(&wc->list));
+	BUG_ON(list_empty(&wp->list));
+	if (wp->start == wp->num) {	/* current array is used up */
+		list_del(&wp->list);
+		kfree(wp);
+		if (list_empty(&wc->list))
+			return NULL;	/* NOTE(review): wc->pages still points at the freed array */
+		wp = list_entry(wc->list.next, struct ext4_wb_pages, list);
+		wc->pages = wp;
+	}
+	BUG_ON(list_empty(&wp->list));
+	return wp->pages[wp->start++];
+}
+
+static struct bio * ext4_wb_bio_alloc(struct inode *inode,
+					sector_t first_block, int nr_vecs)
+{
+	struct block_device *bdev = inode->i_sb->s_bdev;
+	const gfp_t gfp_flags = GFP_NOFS | __GFP_HIGH;
+	int limit = bio_get_nr_vecs(bdev);
+	struct bio *bio;
+
+	/* clamp to what the device takes; under PF_MEMALLOC retry smaller */
+	nr_vecs = nr_vecs > limit ? limit : nr_vecs;
+	bio = bio_alloc(gfp_flags, nr_vecs);
+	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+		for (nr_vecs /= 2; nr_vecs; nr_vecs /= 2) {
+			bio = bio_alloc(gfp_flags, nr_vecs);
+			if (bio)
+				break;
+		}
+	}
+	if (bio == NULL)
+		return NULL;
+	bio->bi_bdev = bdev;
+	bio->bi_sector = first_block << (inode->i_blkbits - 9);
+	return bio;
+}
+
+/* Write bio completion: flag failed pages, end writeback on all of them. */
+static int ext4_wb_end_io(struct bio *bio, unsigned int bytes, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+
+	if (bio->bi_size)	/* not fully completed yet */
+		return 1;
+	do {
+		struct page *page = bvec->bv_page;
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+		if (!uptodate)
+			SetPageError(page);
+		if (!uptodate && page->mapping)	/* record it on the mapping so a later fsync() reports it */
+			set_bit(AS_EIO, &page->mapping->flags);
+		end_page_writeback(page);
+	} while (bvec >= bio->bi_io_vec);
+	bio_put(bio);
+	return 0;
+}
+
+static struct bio *ext4_wb_bio_submit(struct bio *bio, handle_t *handle)
+{
+	bio->bi_end_io = ext4_wb_end_io;	/* completion ends page writeback */
+	submit_bio(WRITE, bio);	/* @handle is not used here */
+	return NULL;	/* always NULL: callers store it to reset their bio pointer */
+}
+
+/*
+ * Reserve @blocks data blocks plus any extra metadata for @page's inode
+ * and mark the page Booked.  Returns 0 or the ext4_reserve_blocks() error.
+ */
+int inline ext4_wb_reserve_space_page(struct page *page, int blocks)
+{
+	struct inode *inode = page->mapping->host;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int wanted, extra_md, err;
+
+	wb_debug("reserve %d blocks for page %lu from inode %lu\n",
+			blocks, page->index, inode->i_ino);
+
+	/*
+	 * Reserving the data blocks alone is not enough: mapping them later
+	 * may need new index/leaf blocks too.  So the metadata worst case
+	 * (one extent per block) is recomputed for the new total and only
+	 * the growth over what is already reserved is added to the request.
+	 */
+	spin_lock(&ei->i_wb_reserved_lock);
+	wanted = ei->i_blocks_reserved + blocks;
+	extra_md = ext4_ext_calc_metadata_amount(inode, wanted);
+	BUG_ON(extra_md < ei->i_md_reserved);
+	extra_md -= ei->i_md_reserved;
+
+	err = ext4_reserve_blocks(inode->i_sb, extra_md + blocks);
+	if (err == 0) {
+		/* reservation granted: account it in the inode */
+		ei->i_blocks_reserved += blocks;
+		ei->i_md_reserved += extra_md;
+	}
+	spin_unlock(&ei->i_wb_reserved_lock);
+	if (err)
+		return err;	/* blocks are exhausted? */
+
+	/* the page now has space reserved on disk */
+	SetPageBooked(page);
+	return 0;
+}
+
+/*
+ * release space reserved for @blocks of data
+ * @used signals that @blocks got really allocated and we just
+ * need to release corresponded over-reserved metadata
+ */
+int inline ext4_wb_release_space(struct inode *inode, int blocks, int used)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int remaining, md_needed, md_freed, to_release;
+
+	spin_lock(&ei->i_wb_reserved_lock);
+
+	/* metadata still needed once @blocks leave the reservation */
+	remaining = ei->i_blocks_reserved - blocks;
+	md_needed = ext4_ext_calc_metadata_amount(inode, remaining);
+
+	/* if the block map needs less metadata now, free the difference */
+	BUG_ON(md_needed > ei->i_md_reserved);
+	md_freed = ei->i_md_reserved - md_needed;
+
+	/* data blocks go back only when they were not used */
+	to_release = used ? md_freed : md_freed + blocks;
+	wb_debug("%u %s: release %d/%d blocks from %u/%u reserved for inode %lu\n",
+			blocks, used ? "allocated" : "dropped", used ? 0 : blocks,
+			md_freed, ei->i_blocks_reserved,
+			ei->i_md_reserved, inode->i_ino);
+	if (to_release != 0)
+		ext4_release_blocks(inode->i_sb, to_release);
+
+	/* update per-inode reservations */
+	BUG_ON(blocks > ei->i_blocks_reserved);
+	ei->i_blocks_reserved -= blocks;
+	BUG_ON(md_freed > ei->i_md_reserved);
+	ei->i_md_reserved -= md_freed;
+
+	spin_unlock(&ei->i_wb_reserved_lock);
+
+	return 0;
+}
+
+static inline int ext4_wb_drop_page_reservation(struct page *page)
+{
+	/* Clear the Booked mark of a page whose reserved blocks have
+	 * just been allocated or are being given up.  Without this
+	 * ->invalidatepage() may think the page still holds reserved
+	 * blocks.  The reserved blocks themselves are not released
+	 * here; the original author prefers to do that once per
+	 * several blocks.  The page must be Booked.  Always returns 0. */
+	wb_debug("drop reservation from page %lu from inode %lu\n",
+			page->index, page->mapping->host->i_ino);
+	BUG_ON(!PageBooked(page));
+	ClearPageBooked(page);
+	return 0;
+}
+
+/*
+ * Build and submit write bios for the queued pages of @wc that fall into
+ * @ex.  @new is set when @ex was just allocated.  Returns 0 or -ENOMEM.
+ */
+static int ext4_wb_submit_extent(struct ext4_wb_control *wc, handle_t *handle,
+					struct ext4_extent *ex, int new)
+{
+	struct inode *inode = wc->mapping->host;
+	int blkbits = inode->i_blkbits;
+	struct page *page;
+	ext4_fsblk_t off;
+	unsigned long blk, len, remain;
+	unsigned long pstart, plen, prev;
+	struct bio *bio = NULL;
+	int nr_pages;
+
+	BUG_ON(PAGE_CACHE_SHIFT < blkbits);
+	BUG_ON(list_empty(&wc->list));
+	wb_debug("cook and submit bios for %u/%u/%u for %lu/%u\n",
+		 le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len),
+		 le32_to_cpu(ex->ee_start), wc->start, wc->len);
+
+	blk = le32_to_cpu(ex->ee_block);
+	remain = le16_to_cpu(ex->ee_len);
+	wc->extents++;
+
+	while (remain) {
+		page = ext4_wb_pull_page(wc);
+		if (page == NULL)
+			break;
+
+		pstart = page->index << (PAGE_CACHE_SHIFT - blkbits);
+		plen = PAGE_SIZE >> blkbits;
+		if (pstart > blk) {
+			/* probably extent covers long space and page
+			 * to be written in the middle of it */
+			BUG_ON(pstart - blk >= remain);
+			remain -= pstart - blk;
+			blk = pstart;
+		}
+		BUG_ON(blk < pstart || blk >= pstart + plen);
+		BUG_ON(!PageUptodate(page));
+		/* no BUG_ON(!PagePrivate(page)): page can get here via mmap(2) */
+		BUG_ON(new && PageMappedToDisk(page));
+		BUG_ON(!new && !PageMappedToDisk(page));
+		SetPageMappedToDisk(page);
+		if (new && PagePrivate(page)) {
+			/* space is just allocated and it was reserved in
+			 * ->commit_write(). time to release reservation.
+			 * space may not be reserved if page gets dirty
+			 * via mmap. should we reserve it in ->mmap() ? */
+			prev = min(plen, remain);
+			ext4_wb_drop_page_reservation(page);
+			wc->blocks_to_release += prev;
+		}
+
+alloc_new_bio:
+		if (bio == NULL) {
+			/* +2 because head/tail may belong to different pages */
+			nr_pages = (le16_to_cpu(ex->ee_len) -
+				    (blk - le32_to_cpu(ex->ee_block)));
+			nr_pages = (nr_pages >> (PAGE_CACHE_SHIFT - blkbits));
+			/* join hi/lo first: adding the offset may carry past bit 31 */
+			off = ((ext4_fsblk_t)
+				le16_to_cpu(ex->ee_start_hi) << 32) |
+			      le32_to_cpu(ex->ee_start);
+			off += blk - le32_to_cpu(ex->ee_block);
+			bio = ext4_wb_bio_alloc(inode, off, nr_pages + 2);
+			if (bio == NULL) {
+				/* put the page back: ext4_wb_flush() ends
+				 * writeback only on pages still queued */
+				wc->pages->start--;
+				return -ENOMEM;
+			}
+		}
+		off = (blk - pstart) << blkbits;
+		prev = min(plen, remain);
+		len = prev << blkbits;
+		if (bio_add_page(bio, page, len, off) < len) {
+			bio = ext4_wb_bio_submit(bio, handle);
+			goto alloc_new_bio;
+		}
+		remain -= prev;
+		blk += prev;
+		if (blk < pstart + plen) {
+			/* extent covers part of the page only.
+			 * it's possible that next extent covers
+			 * the tail. so, we leave page */
+			printk("blk %lu pstart %lu plen %lu remain %lu prev %lu\n",
+				blk, pstart, plen, remain, prev);
+			wc->pages->start--;
+			BUG_ON(remain != 0);
+		}
+	}
+	if (bio)
+		ext4_wb_bio_submit(bio, handle);
+	BUG_ON(new && remain != 0);
+	return 0;
+}
+
+/*
+ * Pick a physical block to aim for when allocating logical @block.
+ */
+static ext4_fsblk_t
+ext4_wb_find_goal(struct inode *inode, struct ext4_ext_path *path,
+			ext4_fsblk_t block)
+{
+	struct super_block *sb = inode->i_sb;
+	ext4_fsblk_t group_first;
+	unsigned long shift;
+
+	if (path != NULL) {
+		struct ext4_ext_path *leaf = path + path->p_depth;
+		struct ext4_extent *prev_ex = leaf->p_ext;
+
+		/* an extent is known nearby: extrapolate from its mapping */
+		if (prev_ex != NULL)
+			return (le32_to_cpu(prev_ex->ee_start) |
+				((ext4_fsblk_t)
+				 le16_to_cpu(prev_ex->ee_start_hi) << 32)) +
+			       (block - le32_to_cpu(prev_ex->ee_block));
+
+		/* no extent there: aim next to the tree block itself */
+		if (leaf->p_bh != NULL)
+			return leaf->p_bh->b_blocknr;
+	}
+
+	/* otherwise start from the inode's block group, offset per pid */
+	group_first = (EXT4_I(inode)->i_block_group * EXT4_BLOCKS_PER_GROUP(sb)) +
+		le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+	shift = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(sb) / 16);
+	return group_first + shift + block;
+}
+
+static int ext4_wb_handle_extent(struct inode *inode,
+					struct ext4_ext_path *path,
+					struct ext4_ext_cache *ec,
+					void *cbdata)
+{
+	struct ext4_wb_control *wc = cbdata;
+	struct super_block *sb = inode->i_sb;
+	ext4_fsblk_t goal, pblock;
+	unsigned long tgen, count;
+	struct ext4_extent nex;
+	loff_t new_i_size;
+	handle_t *handle;
+	int i, err;
+	/* ext4_ext_walk_space() callback: submit the pages behind @ec, allocating blocks first if needed */
+	if (ec->ec_type == EXT4_EXT_CACHE_EXTENT) {
+		/*
+		 * The extent is already allocated. The only thing
+		 * we have to do is to flush the corresponding pages.
+		 */
+		wb_debug("extent %u/%u/%u exist\n",
+				(unsigned) ec->ec_block,
+				(unsigned) ec->ec_len,
+				(unsigned) ec->ec_start);
+		nex.ee_start = cpu_to_le32(ec->ec_start & 0xffffffff);
+		nex.ee_start_hi = cpu_to_le16((ec->ec_start >> 32) & 0xffff);
+		nex.ee_block = cpu_to_le32(ec->ec_block);
+		nex.ee_len = cpu_to_le16(ec->ec_len);
+		err = ext4_wb_submit_extent(wc, NULL, &nex, 0);
+
+		/* correct the on-disk size if the file grew within
+		 * an already allocated block */
+		new_i_size = (loff_t) le32_to_cpu(nex.ee_block) +
+			     le16_to_cpu(nex.ee_len);
+		new_i_size = new_i_size << inode->i_blkbits;
+		if (new_i_size > i_size_read(inode))
+			new_i_size = i_size_read(inode);
+		if (new_i_size > EXT4_I(inode)->i_disksize) {
+			EXT4_I(inode)->i_disksize = new_i_size;
+			mutex_unlock(&EXT4_I(inode)->truncate_mutex);	/* dropped around ext4_dirty_inode() */
+			ext4_dirty_inode(inode);
+			mutex_lock(&EXT4_I(inode)->truncate_mutex);
+		}
+		return err;
+	}
+
+	wb_debug("extent %u/%u DOES NOT exist\n", ec->ec_block, ec->ec_len);
+
+	/* space for some of the pages we want to flush hasn't been
+	 * allocated yet. so, it's time to allocate space */
+	tgen = EXT4_I(inode)->i_ext_generation;
+	count = ext4_ext_calc_credits_for_insert(inode, path);
+	mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+
+	handle = ext4_journal_start(inode, count + EXT4_DATA_TRANS_BLOCKS(sb) + 1);
+	if (IS_ERR(handle)) {
+		mutex_lock(&EXT4_I(inode)->truncate_mutex);
+		return PTR_ERR(handle);
+	}
+
+	/* FIXME: we could analyze the current path and advise the allocator
+	 * to find additional blocks if goal can't be allocated
+	 * this is for better interaction between extents and mballoc
+	 * plus this should improve overall performance */
+
+	mutex_lock(&EXT4_I(inode)->truncate_mutex);
+	if (tgen != EXT4_I(inode)->i_ext_generation) {
+		/* the tree changed while the mutex was dropped, so path may be stale */
+		ext4_journal_stop(handle);
+		return EXT_REPEAT;
+	}
+
+	goal = ext4_wb_find_goal(inode, path, ec->ec_block);
+	count = ec->ec_len;
+
+	/* if this is a tail of closed file, ask allocator don't preallocate */
+	new_i_size = i_size_read(inode) + sb->s_blocksize - 1;
+	new_i_size = new_i_size >> inode->i_blkbits;
+	if (ec->ec_block + count == new_i_size &&
+			!atomic_read(&inode->i_writecount)) {
+		/* XXX: disable preallocation for tail */
+	}
+
+	/* this is a hack to tell the allocator that the blocks
+	 * we are going to allocate are already reserved */
+	EXT4_I(inode)->i_state |= EXT4_STATE_BLOCKS_RESERVED;
+	pblock = ext4_new_blocks(handle, inode, goal, &count, &err);
+	EXT4_I(inode)->i_state &= ~EXT4_STATE_BLOCKS_RESERVED;
+
+	if (!pblock)
+		goto out;
+
+	BUG_ON(count > ec->ec_len);
+	BUG_ON(count == 0);
+	wb_debug("allocated %llu/%lu for %lu (asked %u)\n",
+			pblock, count, inode->i_ino, ec->ec_len);
+
+	/* insert new extent */
+	nex.ee_start = cpu_to_le32(pblock & 0xffffffff);
+	nex.ee_start_hi = cpu_to_le16((pblock >> 32) & 0xffff);
+	nex.ee_len = cpu_to_le16(count);
+	nex.ee_block = cpu_to_le32(ec->ec_block);
+	err = ext4_ext_insert_extent(handle, inode, path, &nex);
+	if (err)
+		goto out;	/* NOTE(review): the blocks just allocated are not freed on this path -- confirm */
+
+	/*
+	 * Putting len of the actual extent we just inserted,
+	 * we are asking ext4_ext_walk_space() to continue
+	 * scanning after that block
+	 */
+	ec->ec_len = le16_to_cpu(nex.ee_len);
+	BUG_ON(nex.ee_len == 0);
+
+#ifdef EXT4_WB_STATS
+	atomic_add(le16_to_cpu(nex.ee_len),
+		   &EXT4_SB(inode->i_sb)->s_wb_allocated);
+#endif
+
+	wb_debug("inserted %lu/%lu/%lu for %lu (asked %u)\n",
+		(unsigned long) le32_to_cpu(nex.ee_block),
+		(unsigned long) le16_to_cpu(nex.ee_len),
+		(unsigned long) le32_to_cpu(nex.ee_start),
+		inode->i_ino, ec->ec_len);
+
+	/*
+	 * Important! The nex can change after insert. So do not
+	 * use ec for the following; take the values from nex
+	 */
+
+	/* blocks have been allocated for data, so it is time to drop dirty
+	 * state in the corresponding buffer_heads to prevent corruption */
+	for (i = 0; i < le16_to_cpu(nex.ee_len); i++)
+		unmap_underlying_metadata(sb->s_bdev,
+					  ((ext4_fsblk_t)
+					   le16_to_cpu(nex.ee_start_hi) << 32) +
+					   le32_to_cpu(nex.ee_start) + i);
+
+	/* correct on-disk inode size */
+	if (le16_to_cpu(nex.ee_len) > 0) {
+		new_i_size = (loff_t) le32_to_cpu(nex.ee_block) +
+			     le16_to_cpu(nex.ee_len);
+		new_i_size = new_i_size << inode->i_blkbits;
+		if (new_i_size > i_size_read(inode))
+			new_i_size = i_size_read(inode);
+		if (new_i_size > EXT4_I(inode)->i_disksize) {
+			EXT4_I(inode)->i_disksize = new_i_size;
+			err = ext4_mark_inode_dirty(handle, inode);	/* NOTE(review): this err is overwritten below */
+		}
+	}
+
+	if (ext4_should_order_data(inode))
+		err = ext4_wb_submit_extent(wc, handle, &nex, 1);
+	else
+		err = ext4_wb_submit_extent(wc, NULL, &nex, 1);
+
+	/* we don't want to recalculate needed reservation for
+	 * each page. we may do this for each new extent */
+	ext4_wb_release_space(inode, wc->blocks_to_release, 1);
+	wc->blocks_to_release = 0;
+
+out:
+	ext4_journal_stop(handle);
+	if (err)
+		printk("EXT4-fs: writeback error = %d\n", err);
+	return err;
+}
+
+static int ext4_wb_flush(struct ext4_wb_control *wc)
+{
+	struct list_head *cur, *tmp;
+	struct inode *inode;
+	int err, num = 0;
+	/* write out the run described by @wc, then free its page arrays and reset @wc */
+	if (wc->len == 0)
+		return 0;
+
+	inode = wc->mapping->host;
+	wb_debug("start flushing %lu/%u from inode %lu\n",
+			wc->start, wc->len, inode->i_ino);
+
+	wc->pages = list_entry(wc->list.next, struct ext4_wb_pages, list);
+	wc->extents = 0;
+
+	mutex_lock(&EXT4_I(inode)->truncate_mutex);
+	/* FIXME: last page may be partial */
+	err = ext4_ext_walk_space(inode, wc->start, wc->len,
+					ext4_wb_handle_extent, wc);
+	mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+
+	list_for_each_safe(cur, tmp, &wc->list) {
+		struct ext4_wb_pages *wp;
+		wp = list_entry(cur, struct ext4_wb_pages, list);
+		if (err) {
+			while (wp->start < wp->num) {	/* pages never pulled for I/O */
+				struct page *page = wp->pages[wp->start];
+				BUG_ON(!PageWriteback(page));
+				end_page_writeback(page);
+				__set_page_dirty_nobuffers(page);	/* keep the data for a later attempt */
+				wp->start++;
+			}
+		} else {
+			BUG_ON(num != 0);	/* on success at most one array may be left */
+			BUG_ON(wp->start != wp->num - 1 &&
+					wp->start != wp->num);
+		}
+		list_del(&wp->list);
+		kfree(wp);
+		num++;
+	}
+	wc->pages = NULL;
+	wc->len = 0;
+	wc->extents = 0;
+
+	return err;
+}
+
+static int ext4_wb_add_page(struct ext4_wb_control *wc, struct page *page)
+{
+	struct ext4_wb_pages *slot = wc->pages;
+
+	/* append @page to the run, starting a new array when needed */
+	if (slot != NULL && slot->num != WB_PAGES_PER_ARRAY)
+		goto store;
+
+	slot = kmalloc(sizeof(struct ext4_wb_pages), GFP_NOFS);
+	if (slot == NULL) {
+		printk("no mem for ext4_wb_pages!\n");
+		return -ENOMEM;
+	}
+	slot->start = 0;
+	slot->num = 0;
+	list_add_tail(&slot->list, &wc->list);
+	wc->pages = slot;
+store:
+	slot->pages[slot->num++] = page;
+	return 0;
+}
+
+static inline void
+ext4_wb_init_control(struct ext4_wb_control *wc, struct address_space *mapping)
+{
+	wc->mapping = mapping;
+	wc->len = 0;	/* empty run; wc->start is set by the callers */
+	wc->blocks_to_release = 0;
+	INIT_LIST_HEAD(&wc->list);	/* no page arrays queued yet */
+	wc->pages = NULL;
+}
+
+static inline int
+ext4_wb_can_merge(struct ext4_wb_control *wc, unsigned long next)
+{
+	/* @next must directly follow the run and the run must not be full */
+	int adjacent = (wc->start + wc->len == next);
+	int has_room = (wc->len <= WB_MAX_PAGES_PER_EXTENT);
+	return adjacent && has_room;
+}
+
+/*
+ * ->writepages() for delayed allocation: collect runs of contiguous dirty
+ * pages of @mapping and pass each run to ext4_wb_flush().
+ *
+ * i_wb_writers is raised while the scan runs and is dropped again on every
+ * exit taken after the increment.
+ *
+ * A page that was taken out of the dirty state but could not be queued is
+ * redirtied before its writeback ends, and a run gathered before such a
+ * failure is still flushed, so no page is left clean-but-unwritten or
+ * stuck under writeback.
+ * NOTE(review): always returns 0; errors only end the scan.
+ */
+int ext4_wb_writepages(struct address_space *mapping,
+				struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	struct inode *inode = mapping->host;
+	int nr_pages, i, err = 0, done = 0;
+	struct ext4_wb_control wc;
+	struct pagevec pvec;
+	pgoff_t index = 0;
+	int written = 0;
+	int extents = 0;
+
+	wb_debug("->writepages on inode %lu (%u reserved)\n",
+		inode->i_ino, EXT4_I(inode)->i_blocks_reserved);
+#ifdef EXT4_WB_SKIP_SMALL
+	if (wbc->nr_to_write <= 64 && wbc->sync_mode == WB_SYNC_NONE)
+		return 0;
+#endif
+	atomic_inc(&EXT4_I(inode)->i_wb_writers);
+#ifdef EXT4_WB_STATS
+	atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_reqs);
+	atomic_add(wbc->nr_to_write, &EXT4_SB(inode->i_sb)->s_wb_nr_to_write);
+	if (atomic_read(&EXT4_I(inode)->i_wb_writers) != 1)
+		atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_collisions);
+#endif
+
+	/* skip opened-for-write small files
+	 * XXX: what do we do if most of files hit the condition? */
+	if (wbc->sync_mode == WB_SYNC_NONE &&
+			atomic_read(&inode->i_writecount) &&
+			i_size_read(inode) <= 64*1024) {
+		/* nothing will be written: stop counting as a writer */
+		atomic_dec(&EXT4_I(inode)->i_wb_writers);
+		return 0;
+	}
+
+	ext4_wb_init_control(&wc, mapping);
+
+	pagevec_init(&pvec, 0);
+	while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+					PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) {
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			lock_page(page);
+
+			if (wbc->sync_mode != WB_SYNC_NONE)
+				wait_on_page_writeback(page);
+
+			if (page->mapping != mapping) {
+				unlock_page(page);
+				continue;
+			}
+			if (PageWriteback(page)) {
+				unlock_page(page);
+				continue;
+			}
+
+			if (wc.len && ext4_wb_can_merge(&wc, page->index) &&
+					wbc->nr_to_write <= 0) {
+				/*
+				 * If we already exhausted blocks we got
+				 * to write and new extent starts, stop
+				 * writeback
+				 */
+				unlock_page(page);
+				done = 1;
+				break;
+			}
+
+			if (!clear_page_dirty_for_io(page)) {
+				unlock_page(page);
+				continue;
+			}
+
+			set_page_writeback(page);
+			unlock_page(page);
+
+			if (wc.len == 0) {
+				wc.start = page->index;
+				wc.len = 1;
+				extents++;
+			} else if (ext4_wb_can_merge(&wc, page->index)) {
+				wc.len++;
+			} else {
+				/* end of current extent: flush it ... */
+				err = ext4_wb_flush(&wc);
+				if (err) {
+					/* page is not queued yet: keep its data dirty */
+					__set_page_dirty_nobuffers(page);
+					end_page_writeback(page);
+					done = 1;
+					break;
+				}
+
+				/* ... and start new one */
+				BUG_ON(!PageWriteback(page));
+				wc.start = page->index;
+				wc.len = 1;
+				extents++;
+			}
+
+			err = ext4_wb_add_page(&wc, page);
+			if (err) {
+				/* not queued: take it out of the run, keep it
+				 * dirty and write what was gathered so far */
+				wc.len--;
+				__set_page_dirty_nobuffers(page);
+				end_page_writeback(page);
+				ext4_wb_flush(&wc);
+				done = 1;
+				break;
+			}
+			written++;
+
+			wbc->nr_to_write--;
+			if (wbc->nonblocking && bdi_write_congested(bdi)) {
+#ifdef EXT4_WB_STATS
+				atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_congested);
+#endif
+				wbc->encountered_congestion = 1;
+				done = 1;
+			}
+		}
+		pagevec_release(&pvec);
+	}
+	if (!err) {
+#ifdef EXT4_WB_SKIP_SMALL
+		if (wc.len > 0 && wc.len < 64 && wbc->sync_mode == WB_SYNC_NONE) {
+			struct list_head *cur, *tmp;
+			list_for_each_safe(cur, tmp, &wc.list) {
+				struct ext4_wb_pages *wp;
+				wp = list_entry(cur, struct ext4_wb_pages, list);
+				for (i = wp->start; i < wp->num; i++) {
+					struct page *page = wp->pages[i];
+					BUG_ON(!PageWriteback(page));
+					end_page_writeback(page);
+					__set_page_dirty_nobuffers(page);
+				}
+				wbc->nr_to_write += i;
+				list_del(&wp->list);
+				kfree(wp);
+			}
+		} else
+#endif
+			ext4_wb_flush(&wc);
+	}
+
+	atomic_dec(&EXT4_I(inode)->i_wb_writers);
+
+#ifdef EXT4_WB_STATS
+	atomic_add(written, &EXT4_SB(inode->i_sb)->s_wb_blocks);
+	atomic_add(extents, &EXT4_SB(inode->i_sb)->s_wb_extents);
+#endif
+	return 0;
+}
+
+static void ext4_wb_clear_page(struct page *page, int from, int to)
+{
+	void *kaddr;
+	/* zero everything in the page outside [from, to) */
+	if (from <= 0 && to >= PAGE_CACHE_SIZE)
+		return;
+	kaddr = kmap_atomic(page, KM_USER0);
+	if (to < PAGE_CACHE_SIZE)
+		memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
+	if (from > 0)
+		memset(kaddr, 0, from);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr, KM_USER0);
+}
+
+/*
+ * ->prepare_write() for delayed allocation.  No blocks are reserved here
+ * because the write may still fail; instead page->private is set to 1 to
+ * tell ->commit_write() that it has to reserve a block for this page.
+ * Returns 0, the ext4_get_block() error, or -EIO if the block read fails.
+ */
+int ext4_wb_prepare_write(struct file *file, struct page *page,
+			      unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head bh, *bhw = &bh;
+	int err = 0;
+
+	wb_debug("prepare page %lu (%u-%u) for inode %lu\n",
+			page->index, from, to, page->mapping->host->i_ino);
+
+	/* uptodate page: already mapped to disk or already reserved */
+	page->private = 0;
+	if (!PageUptodate(page)) {
+		/* first write to this page; the on-stack bh must carry the
+		 * size to map, which get_block reads from b_size */
+		bh.b_state = 0;
+		bh.b_size = 1 << inode->i_blkbits;
+		err = ext4_get_block(inode, page->index, bhw, 0);
+		if (err)
+			return err;
+		if (!buffer_mapped(bhw)) {
+			/* this block isn't allocated yet, reserve space */
+			wb_debug("reserve space for new block\n");
+			page->private = 1;
+			ext4_wb_clear_page(page, from, to);
+			ClearPageMappedToDisk(page);
+		} else {
+			/* block is already mapped, so no need to reserve */
+			BUG_ON(PagePrivate(page));
+			if (to - from < PAGE_CACHE_SIZE) {
+				wb_debug("read block %u\n",
+						(unsigned) bhw->b_blocknr);
+				set_bh_page(bhw, page, 0);
+				bhw->b_this_page = NULL;
+				bhw->b_size = 1 << inode->i_blkbits;
+				atomic_set(&bhw->b_count, 1);
+				ll_rw_block(READ, 1, &bhw);
+				wait_on_buffer(bhw);
+				if (!buffer_uptodate(bhw))
+					return -EIO;
+			}
+			SetPageMappedToDisk(page);
+		}
+	} else if (!PageMappedToDisk(page) && !PagePrivate(page)) {
+		/* this page was a hole at time of mmap() calling
+		 * now someone wants to modify it by sys_write() */
+		wb_debug("reserve block for hole\n");
+		page->private = 1;
+	}
+	return 0;
+}
+
+int ext4_wb_commit_write(struct file *file, struct page *page,
+			     unsigned from, unsigned to)
+{
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	struct inode *inode = page->mapping->host;
+	int err = 0;
+	/* ->commit_write(): reserve a block if needed, dirty the page, grow i_size */
+	wb_debug("commit page %lu (%u-%u) for inode %lu\n",
+			page->index, from, to, inode->i_ino);
+
+	/* mark page private so that we get
+	 * called to invalidate/release page */
+	SetPagePrivate(page);
+
+	if (!PageBooked(page) && !PageMappedToDisk(page)) {
+		/* ->prepare_write() observed that the block for this
+		 * page hasn't been allocated yet. therefore it
+		 * asked to reserve a block for later allocation */
+		BUG_ON(page->private == 0);
+		page->private = 0;
+		err = ext4_wb_reserve_space_page(page, 1);
+		if (err)
+			return err;
+	}
+
+	/* ok. block for this page is allocated already or it has
+	 * been reserved successfully. so, user may use it */
+	__set_page_dirty_nobuffers(page);
+
+	SetPageUptodate(page);
+
+	/* correct in-core size,  on-disk size will
+	 * be corrected upon allocation */
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+
+	return err;
+}
+
+/*
+ * Write one locked page as a one-page run; returns 0 or the error.
+ */
+int ext4_wb_write_single_page(struct page *page,
+					struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct ext4_wb_control wc;
+	int err;
+
+	atomic_inc(&EXT4_I(inode)->i_wb_writers);
+#ifdef EXT4_WB_STATS
+	atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_single_pages);
+	if (atomic_read(&EXT4_I(inode)->i_wb_writers) != 1)
+		atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_collisions_sp);
+#endif
+	ext4_wb_init_control(&wc, page->mapping);
+
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+	unlock_page(page);
+	wc.start = page->index;
+	wc.len = 1;
+	err = ext4_wb_add_page(&wc, page);
+	if (err) {
+		printk(KERN_ERR "EXT4-fs: cant add page at %s:%d - %d\n",
+				__FILE__, __LINE__, err);
+		/* nothing was submitted: keep the data dirty for a retry */
+		__set_page_dirty_nobuffers(page);
+		end_page_writeback(page);
+	} else {
+		err = ext4_wb_flush(&wc);
+	}
+	atomic_dec(&EXT4_I(inode)->i_wb_writers);	/* on success and failure alike */
+	return err;
+}
+
+int ext4_wb_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset;
+	void *kaddr;
+	/* ->writepage(): write, redirty or invalidate one locked page */
+	wb_debug("writepage %lu from inode %lu\n", page->index, inode->i_ino);
+
+	/*
+	 * FIXME: just to play ...
+	 * If another thread is writing inode's data and the page
+	 * has no space on disk yet, leave it for that thread
+	 */
+#if 1
+	if (atomic_read(&EXT4_I(page->mapping->host)->i_wb_writers)
+			&& !PageMappedToDisk(page)) {
+		__set_page_dirty_nobuffers(page);
+		unlock_page(page);
+		return 0;
+	}
+#endif
+
+	/* we give up here if we're reentered, because
+	 * it might be for a different filesystem  */
+	if (ext4_journal_current_handle()) {
+		__set_page_dirty_nobuffers(page);
+		unlock_page(page);
+		return 0;
+	}
+
+	/* Is the page fully inside i_size? */
+	if (page->index < end_index)
+		return ext4_wb_write_single_page(page, wbc);
+
+	/* Is the page fully outside i_size? (truncate in progress) */
+	offset = i_size & (PAGE_CACHE_SIZE-1);
+	if (page->index >= end_index + 1 || !offset) {
+		/*
+		 * The page may have dirty, unmapped buffers.  For example,
+		 * they may have been added in ext4_writepage().  Make them
+		 * freeable here, so the page does not leak.
+		 */
+		ext4_wb_invalidatepage(page, 0);
+		unlock_page(page);
+		return 0; /* don't care */
+	}
+
+	/*
+	 * The page straddles i_size.  It must be zeroed out on each and every
+	 * writepage invocation because it may be mmapped.  "A file is mapped
+	 * in multiples of the page size.  For a file that is not a multiple of
+	 * the  page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr, KM_USER0);
+	return ext4_wb_write_single_page(page, wbc);
+}
+
+int ext4_wb_releasepage(struct page *page, gfp_t wait)
+{
+	wb_debug("release %sM%sR page %lu from inode %lu (wait %d)\n",
+			PageMappedToDisk(page) ? "" : "!",
+			PageBooked(page) ? "" : "!",
+			page->index, page->mapping->host->i_ino, wait);
+	/* ->releasepage(): drop the Private mark unless the page is under writeback */
+	if (PageWriteback(page))
+		return 0;
+
+	if (PagePrivate(page))
+		ClearPagePrivate(page);
+	return 0;	/* NOTE(review): 0 is returned even after Private was cleared -- confirm this is intended */
+}
+
+void ext4_wb_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct inode *inode = page->mapping->host;
+
+	/*
+	 * Called for pages marked Private.  When the whole page goes away
+	 * (offset 0) any reservation it still holds is dropped and the
+	 * page is handed to try_to_release_page().
+	 */
+	wb_debug("invalidate %sM%sR page %lu from inode %lu (offset %lu)\n",
+			PageMappedToDisk(page) ? "" : "!",
+			PageBooked(page) ? "" : "!",
+			page->index, inode->i_ino, offset);
+
+	if (offset != 0)
+		return;
+
+	if (PageBooked(page)) {
+		atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_dropped);
+		ext4_wb_release_space(inode, 1, 0);
+		ext4_wb_drop_page_reservation(page);
+	}
+	try_to_release_page(page, 0);
+}
+
+/*
+ * Partial truncate: zero @page's block from @from on (the whole block if it
+ * is unmapped and not uptodate).  Unlocks and releases @page; 0 or -errno.
+ */
+int ext4_wb_block_truncate_page(handle_t *handle, struct page *page,
+				struct address_space *mapping, loff_t from)
+{
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	struct inode *inode = mapping->host;
+	struct buffer_head bh, *bhw = &bh;
+	unsigned blocksize, length;
+	void *kaddr;
+	int err = 0;
+
+	wb_debug("partial truncate from %lu on page %lu from inode %lu\n",
+			(unsigned long) from, page->index, inode->i_ino);
+	blocksize = inode->i_sb->s_blocksize;
+	length = blocksize - (offset & (blocksize - 1));
+
+	if (!PageUptodate(page)) {
+		BUG_ON(PageMappedToDisk(page));
+		bh.b_state = 0;
+		bh.b_size = 1 << inode->i_blkbits;	/* get_block reads the size to map */
+		err = ext4_get_block(inode, page->index, bhw, 0);
+		if (err)
+			goto err_out;
+		BUG_ON(buffer_new(bhw));
+		if (buffer_mapped(bhw)) {	/* the block exists: read it before zeroing */
+			wb_debug("read block %u for part.trunc on %lu\n",
+					(unsigned) bhw->b_blocknr, page->index);
+			set_bh_page(bhw, page, 0);
+			bhw->b_this_page = NULL;
+			bhw->b_size = 1 << inode->i_blkbits;
+			atomic_set(&bhw->b_count, 1);
+			ll_rw_block(READ, 1, &bhw);
+			wait_on_buffer(bhw);
+			if (!buffer_uptodate(bhw)) {
+				err = -EIO;	/* only a failed read is an error */
+				goto err_out;
+			}
+			SetPageMappedToDisk(page);
+		} else {
+			wb_debug("zero page %lu (part.trunc)\n", page->index);
+			offset = 0;
+			length = blocksize;
+		}
+	}
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset(kaddr + offset, 0, length);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr, KM_USER0);
+	SetPageUptodate(page);
+	__set_page_dirty_nobuffers(page);
+err_out:
+	unlock_page(page);
+	page_cache_release(page);
+	return err;
+}
+
+/* Keep the delalloc option only when blocksize equals PAGE_CACHE_SIZE. */
+void ext4_wb_init(struct super_block *sb)
+{
+	if (!test_opt(sb, DELAYED_ALLOC))
+		return;
+	if (PAGE_CACHE_SHIFT != sb->s_blocksize_bits) {
+		printk(KERN_ERR "EXT4-fs: delayed allocation isn't "
+			"supported for PAGE_CACHE_SIZE != blocksize yet\n");
+		clear_opt (EXT4_SB(sb)->s_mount_opt, DELAYED_ALLOC);
+		return;
+	}
+	printk("EXT4-fs: delayed allocation enabled\n");
+}
+
+void ext4_wb_release(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	/* print the writeback statistics gathered for @sb, if delalloc is on */
+	if (!test_opt(sb, DELAYED_ALLOC))
+		return;
+
+#ifdef EXT4_WB_STATS
+	if (atomic_read(&sbi->s_wb_reqs) == 0)	/* nothing recorded; also guards the division below */
+		return;
+
+	printk("EXT4-fs: writeback: %d blocks %d extents in %d reqs (%d ave)\n",
+		atomic_read(&sbi->s_wb_blocks),
+		atomic_read(&sbi->s_wb_extents),
+		atomic_read(&sbi->s_wb_reqs),
+		atomic_read(&sbi->s_wb_blocks) / atomic_read(&sbi->s_wb_reqs));
+	printk("EXT4-fs: writeback: %d nr_to_write, %d congestions, %d singles\n",
+		atomic_read(&sbi->s_wb_nr_to_write),
+		atomic_read(&sbi->s_wb_congested),
+		atomic_read(&sbi->s_wb_single_pages));
+	printk("EXT4-fs: writeback: %d collisions, %d single-page collisions\n",
+		atomic_read(&sbi->s_wb_collisions),
+		atomic_read(&sbi->s_wb_collisions_sp));
+	printk("EXT4-fs: writeback: %d allocated, %d dropped\n",
+		atomic_read(&sbi->s_wb_allocated),
+		atomic_read(&sbi->s_wb_dropped));
+#endif
+}
+
Index: linux-2.6.22-rc4-kamikaze1/include/linux/jbd2.h
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/include/linux/jbd2.h
+++ linux-2.6.22-rc4-kamikaze1/include/linux/jbd2.h
@@ -408,6 +408,16 @@ struct handle_s
 };
 
 
+/*
+ * Per-transaction statistics gathered while the transaction is checkpointed
+ */
+struct transaction_chp_stats_s {
+	unsigned long		cs_chp_time;	/* msecs stamp of first checkpoint pass; elapsed msecs once filed in history */
+	unsigned long		cs_forced_to_close;	/* times checkpointing had to start a commit */
+	unsigned long		cs_written;	/* buffers queued for write-out by checkpoint */
+	unsigned long		cs_dropped;	/* buffers taken off this checkpoint list by a commit */
+};
+
 /* The transaction_t type is the guts of the journaling mechanism.  It
  * tracks a compound transaction through its various states:
  *
@@ -543,6 +553,21 @@ struct transaction_s
 	spinlock_t		t_handle_lock;
 
 	/*
+	 * Longest time some handle had to wait for running transaction
+	 */
+	unsigned long		t_max_wait;
+
+	/*
+	 * When transaction started
+	 */
+	unsigned long		t_start;
+
+	/*
+	 * Checkpointing stats [j_checkpoint_sem]
+	 */
+	struct transaction_chp_stats_s t_chp_stats;
+
+	/*
 	 * Number of outstanding updates running on this transaction
 	 * [t_handle_lock]
 	 */
@@ -573,6 +598,57 @@ struct transaction_s
 
 };
 
+struct transaction_run_stats_s {	/* filled in by jbd2_journal_commit_transaction(); times in msecs */
+	unsigned long		rs_wait;	/* copy of t_max_wait */
+	unsigned long		rs_running;	/* from t_start until the transaction was locked */
+	unsigned long		rs_locked;	/* from locked until flushing began */
+	unsigned long		rs_flushing;	/* from flushing until logging began */
+	unsigned long		rs_logging;	/* from logging until the commit finished */
+
+	unsigned long		rs_handle_count;	/* copy of t_handle_count */
+	unsigned long		rs_blocks;	/* t_outstanding_credits when logging began */
+	unsigned long		rs_blocks_logged;	/* running total of bufs submitted to the log */
+};
+
+struct transaction_stats_s
+{
+	int 			ts_type;	/* JBD2_STATS_RUN or JBD2_STATS_CHECKPOINT; 0 marks an unused history slot */
+	unsigned long		ts_tid;	/* transaction id; in journal->j_stats it counts committed transactions */
+	union {	/* which member is valid is selected by ts_type */
+		struct transaction_run_stats_s run;
+		struct transaction_chp_stats_s chp;
+	} u;
+};
+
+#define JBD2_STATS_RUN		1	/* ts_type of a record filed by commit */
+#define JBD2_STATS_CHECKPOINT	2	/* ts_type of a record filed when a transaction is dropped */
+/* Shorthands for the run-time members of transaction_stats_s */
+#define ts_wait			u.run.rs_wait
+#define ts_running		u.run.rs_running
+#define ts_locked		u.run.rs_locked
+#define ts_flushing		u.run.rs_flushing
+#define ts_logging		u.run.rs_logging
+#define ts_handle_count		u.run.rs_handle_count
+#define ts_blocks		u.run.rs_blocks
+#define ts_blocks_logged	u.run.rs_blocks_logged
+/* Shorthands for the checkpoint members of transaction_stats_s */
+#define ts_chp_time		u.chp.cs_chp_time
+#define ts_forced_to_close	u.chp.cs_forced_to_close
+#define ts_written		u.chp.cs_written
+#define ts_dropped		u.chp.cs_dropped
+/* Current time in milliseconds, derived from jiffies */
+#define CURRENT_MSECS		(jiffies_to_msecs(jiffies))
+
+/* Return the number of msecs elapsed from @start to @end. */
+static inline unsigned int
+jbd2_time_diff(unsigned int start, unsigned int end)
+{
+	/*
+	 * Unsigned subtraction is modular, so this stays correct when the
+	 * msecs counter has wrapped between @start and @end.
+	 */
+	return end - start;
+}
+
 /**
  * struct journal_s - The journal_s type is the concrete type associated with
  *     journal_t.
@@ -634,6 +710,12 @@ struct transaction_s
  * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the
  *	number that will fit in j_blocksize
  * @j_last_sync_writer: most recent pid which did a synchronous write
+ * @j_history: Buffer storing the transactions statistics history
+ * @j_history_max: Maximum number of transactions in the statistics history
+ * @j_history_cur: Current number of transactions in the statistics history
+ * @j_history_lock: Protect the transactions statistics history
+ * @j_proc_entry: procfs entry for the jbd statistics directory
+ * @j_stats: Overall statistics
  * @j_private: An opaque pointer to fs-private information.
  */
 
@@ -826,6 +908,16 @@ struct journal_s
 	pid_t			j_last_sync_writer;
 
 	/*
+	 * Journal statistics
+	 */
+	struct transaction_stats_s *j_history;
+	int			j_history_max;
+	int			j_history_cur;
+	spinlock_t		j_history_lock;
+	struct proc_dir_entry	*j_proc_entry;
+	struct transaction_stats_s j_stats;
+
+	/*
 	 * An opaque pointer to fs-private information.  ext3 puts its
 	 * superblock pointer here
 	 */
Index: linux-2.6.22-rc4-kamikaze1/fs/jbd2/transaction.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/jbd2/transaction.c
+++ linux-2.6.22-rc4-kamikaze1/fs/jbd2/transaction.c
@@ -59,6 +59,8 @@ jbd2_get_transaction(journal_t *journal,
 
 	J_ASSERT(journal->j_running_transaction == NULL);
 	journal->j_running_transaction = transaction;
+	transaction->t_max_wait = 0;
+	transaction->t_start = CURRENT_MSECS;
 
 	return transaction;
 }
@@ -85,6 +87,7 @@ static int start_this_handle(journal_t *
 	int nblocks = handle->h_buffer_credits;
 	transaction_t *new_transaction = NULL;
 	int ret = 0;
+	unsigned long ts = CURRENT_MSECS;
 
 	if (nblocks > journal->j_max_transaction_buffers) {
 		printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -218,6 +221,12 @@ repeat_locked:
 	/* OK, account for the buffers that this operation expects to
 	 * use and add the handle to the running transaction. */
 
+	if (time_after(transaction->t_start, ts)) {
+		ts = jbd2_time_diff(ts, transaction->t_start);
+		if (ts > transaction->t_max_wait)
+			transaction->t_max_wait= ts;
+	}
+
 	handle->h_transaction = transaction;
 	transaction->t_outstanding_credits += nblocks;
 	transaction->t_updates++;
Index: linux-2.6.22-rc4-kamikaze1/fs/jbd2/checkpoint.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/jbd2/checkpoint.c
+++ linux-2.6.22-rc4-kamikaze1/fs/jbd2/checkpoint.c
@@ -232,7 +232,8 @@ __flush_batch(journal_t *journal, struct
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
 static int __process_buffer(journal_t *journal, struct journal_head *jh,
-			struct buffer_head **bhs, int *batch_count)
+			struct buffer_head **bhs, int *batch_count,
+			transaction_t *transaction)
 {
 	struct buffer_head *bh = jh2bh(jh);
 	int ret = 0;
@@ -250,6 +251,7 @@ static int __process_buffer(journal_t *j
 		transaction_t *t = jh->b_transaction;
 		tid_t tid = t->t_tid;
 
+		transaction->t_chp_stats.cs_forced_to_close++;
 		spin_unlock(&journal->j_list_lock);
 		jbd_unlock_bh_state(bh);
 		jbd2_log_start_commit(journal, tid);
@@ -279,6 +281,7 @@ static int __process_buffer(journal_t *j
 		bhs[*batch_count] = bh;
 		__buffer_relink_io(jh);
 		jbd_unlock_bh_state(bh);
+		transaction->t_chp_stats.cs_written++;
 		(*batch_count)++;
 		if (*batch_count == NR_BATCH) {
 			spin_unlock(&journal->j_list_lock);
@@ -322,6 +325,8 @@ int jbd2_log_do_checkpoint(journal_t *jo
 	if (!journal->j_checkpoint_transactions)
 		goto out;
 	transaction = journal->j_checkpoint_transactions;
+	if (transaction->t_chp_stats.cs_chp_time == 0)
+		transaction->t_chp_stats.cs_chp_time = CURRENT_MSECS;
 	this_tid = transaction->t_tid;
 restart:
 	/*
@@ -346,7 +351,8 @@ restart:
 				retry = 1;
 				break;
 			}
-			retry = __process_buffer(journal, jh, bhs,&batch_count);
+			retry = __process_buffer(journal, jh, bhs, &batch_count,
+						 transaction);
 			if (!retry && lock_need_resched(&journal->j_list_lock)){
 				spin_unlock(&journal->j_list_lock);
 				retry = 1;
@@ -668,6 +674,8 @@ void __jbd2_journal_insert_checkpoint(st
 
 void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
 {
+	struct transaction_stats_s stats;
+
 	assert_spin_locked(&journal->j_list_lock);
 	if (transaction->t_cpnext) {
 		transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
@@ -693,5 +701,25 @@ void __jbd2_journal_drop_transaction(jou
 	J_ASSERT(journal->j_running_transaction != transaction);
 
 	jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
+
+	/*
+	 * File the transaction for history
+	 */
+	if (transaction->t_chp_stats.cs_written != 0 ||
+			transaction->t_chp_stats.cs_chp_time != 0) {
+		stats.ts_type = JBD2_STATS_CHECKPOINT;
+		stats.ts_tid = transaction->t_tid;
+		stats.u.chp = transaction->t_chp_stats;
+		if (stats.ts_chp_time)
+			stats.ts_chp_time =
+				jbd2_time_diff(stats.ts_chp_time, CURRENT_MSECS);
+		spin_lock(&journal->j_history_lock);
+		memcpy(journal->j_history + journal->j_history_cur, &stats,
+				sizeof(stats));
+		if (++journal->j_history_cur == journal->j_history_max)
+			journal->j_history_cur = 0;
+		spin_unlock(&journal->j_history_lock);
+	}
+
 	kfree(transaction);
 }
Index: linux-2.6.22-rc4-kamikaze1/fs/jbd2/commit.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/jbd2/commit.c
+++ linux-2.6.22-rc4-kamikaze1/fs/jbd2/commit.c
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
+#include <linux/jiffies.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -290,6 +291,7 @@ static inline void write_tag_block(int t
  */
 void jbd2_journal_commit_transaction(journal_t *journal)
 {
+	struct transaction_stats_s stats;
 	transaction_t *commit_transaction;
 	struct journal_head *jh, *new_jh, *descriptor;
 	struct buffer_head **wbuf = journal->j_wbuf;
@@ -337,6 +339,11 @@ void jbd2_journal_commit_transaction(jou
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	stats.ts_wait = commit_transaction->t_max_wait;
+	stats.ts_locked = CURRENT_MSECS;
+	stats.ts_running = jbd2_time_diff(commit_transaction->t_start,
+						stats.ts_locked);
+
 	spin_lock(&commit_transaction->t_handle_lock);
 	while (commit_transaction->t_updates) {
 		DEFINE_WAIT(wait);
@@ -407,6 +414,9 @@ void jbd2_journal_commit_transaction(jou
 	 */
 	jbd2_journal_switch_revoke_table(journal);
 
+	stats.ts_flushing = CURRENT_MSECS;
+	stats.ts_locked = jbd2_time_diff(stats.ts_locked, stats.ts_flushing);
+
 	commit_transaction->t_state = T_FLUSH;
 	journal->j_committing_transaction = commit_transaction;
 	journal->j_running_transaction = NULL;
@@ -498,6 +508,11 @@ void jbd2_journal_commit_transaction(jou
 	 */
 	commit_transaction->t_state = T_COMMIT;
 
+	stats.ts_logging = CURRENT_MSECS;
+	stats.ts_flushing = jbd2_time_diff(stats.ts_flushing, stats.ts_logging);
+	stats.ts_blocks = commit_transaction->t_outstanding_credits;
+	stats.ts_blocks_logged = 0;
+
 	descriptor = NULL;
 	bufs = 0;
 	while (commit_transaction->t_buffers) {
@@ -646,6 +661,7 @@ start_journal_io:
 				submit_bh(WRITE, bh);
 			}
 			cond_resched();
+			stats.ts_blocks_logged += bufs;
 
 			/* Force a new descriptor to be generated next
                            time round the loop. */
@@ -816,6 +832,7 @@ restart_loop:
 		cp_transaction = jh->b_cp_transaction;
 		if (cp_transaction) {
 			JBUFFER_TRACE(jh, "remove from old cp transaction");
+			cp_transaction->t_chp_stats.cs_dropped++;
 			__jbd2_journal_remove_checkpoint(jh);
 		}
 
@@ -890,6 +907,36 @@ restart_loop:
 
 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
 
+	commit_transaction->t_start = CURRENT_MSECS;
+	stats.ts_logging = jbd2_time_diff(stats.ts_logging,
+						commit_transaction->t_start);
+
+	/*
+	 * File the transaction for history
+	 */
+	stats.ts_type = JBD2_STATS_RUN;
+	stats.ts_tid = commit_transaction->t_tid;
+	stats.ts_handle_count = commit_transaction->t_handle_count;
+	spin_lock(&journal->j_history_lock);
+	memcpy(journal->j_history + journal->j_history_cur, &stats,
+			sizeof(stats));
+	if (++journal->j_history_cur == journal->j_history_max)
+		journal->j_history_cur = 0;
+
+	/*
+	 * Calculate overall stats
+	 */
+	journal->j_stats.ts_tid++;
+	journal->j_stats.ts_wait += stats.ts_wait;
+	journal->j_stats.ts_running += stats.ts_running;
+	journal->j_stats.ts_locked += stats.ts_locked;
+	journal->j_stats.ts_flushing += stats.ts_flushing;
+	journal->j_stats.ts_logging += stats.ts_logging;
+	journal->j_stats.ts_handle_count += stats.ts_handle_count;
+	journal->j_stats.ts_blocks += stats.ts_blocks;
+	journal->j_stats.ts_blocks_logged += stats.ts_blocks_logged;
+	spin_unlock(&journal->j_history_lock);
+
 	commit_transaction->t_state = T_FINISHED;
 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
 	journal->j_commit_sequence = commit_transaction->t_tid;
Index: linux-2.6.22-rc4-kamikaze1/fs/jbd2/journal.c
===================================================================
--- linux-2.6.22-rc4-kamikaze1.orig/fs/jbd2/journal.c
+++ linux-2.6.22-rc4-kamikaze1/fs/jbd2/journal.c
@@ -35,6 +35,7 @@
 #include <linux/kthread.h>
 #include <linux/poison.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -640,6 +641,300 @@ struct journal_head *jbd2_journal_get_de
 	return jbd2_journal_add_journal_head(bh);
 }
 
+struct jbd2_stats_proc_session {	/* per-open state, stored in seq_file->private */
+	journal_t *journal;	/* journal being reported; only the "info" open path sets it */
+	struct transaction_stats_s *stats;	/* kmalloc'ed snapshot copied at open time */
+	int start;	/* slot where the history walk begins ("history" only) */
+	int max;	/* number of slots in the snapshot ("history" only) */
+};
+
+/* First non-empty ring slot at or after @ts, or NULL once the walk is back at s->start. */
+static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s,
+					struct transaction_stats_s *ts,
+					int first)
+{
+	if (ts == s->stats + s->max)
+		ts = s->stats;	/* stepped past the last slot: wrap around */
+	if (!first && ts == s->stats + s->start)
+		return NULL;	/* full lap completed */
+	while (ts->ts_type == 0) {	/* type 0 == slot never filled */
+		ts++;
+		if (ts == s->stats + s->max)
+			ts = s->stats;
+		if (ts == s->stats + s->start)
+			return NULL;
+	}
+	return ts;
+}
+
+static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos)
+{
+	struct jbd2_stats_proc_session *s = seq->private;
+	struct transaction_stats_s *ts;
+	int l = *pos;
+
+	if (l == 0)
+		return SEQ_START_TOKEN;	/* position 0 is the column header */
+	ts = jbd2_history_skip_empty(s, s->stats + s->start, 1);	/* position 1 */
+	if (!ts)
+		return NULL;
+	while (--l && (ts = jbd2_history_skip_empty(s, ++ts, 0)) != NULL);	/* advance to record *pos */
+	return ts;
+}
+
+static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct jbd2_stats_proc_session *s = seq->private;
+	struct transaction_stats_s *ts = v;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)	/* header was just shown: begin the ring walk */
+		return jbd2_history_skip_empty(s, s->stats + s->start, 1);
+	else	/* continue with the next used slot, NULL after a full lap */
+		return jbd2_history_skip_empty(s, ++ts, 0);
+}
+
+static int jbd2_seq_history_show(struct seq_file *seq, void *v)
+{
+	struct transaction_stats_s *ts = v;
+	if (v == SEQ_START_TOKEN) {	/* column header line */
+		seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s "
+				"%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid",
+				"wait", "run", "lock", "flush", "log", "hndls",
+				"block", "inlog", "ctime", "write", "drop",
+				"close");
+		return 0;
+	}
+	if (ts->ts_type == JBD2_STATS_RUN)	/* "R" row: fills the run-time columns */
+		seq_printf(seq, "%-4s %-5lu %-5lu %-5lu %-5lu %-5lu %-5lu "
+				"%-6lu %-5lu %-5lu\n", "R", ts->ts_tid,
+				ts->ts_wait, ts->ts_running, ts->ts_locked,
+				ts->ts_flushing, ts->ts_logging,
+				ts->ts_handle_count, ts->ts_blocks,
+				ts->ts_blocks_logged);
+	else if (ts->ts_type == JBD2_STATS_CHECKPOINT)	/* "C" row: %48s pads over the run-time columns */
+		seq_printf(seq, "%-4s %-5lu %48s %-5lu %-5lu %-5lu %-5lu\n",
+				"C", ts->ts_tid, " ", ts->ts_chp_time,
+				ts->ts_written, ts->ts_dropped,
+				ts->ts_forced_to_close);
+	else
+		J_ASSERT(0);	/* the iterator never hands out type-0 slots */
+	return 0;
+}
+
+static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
+{	/* nothing to undo: start() takes no locks or references */
+}
+
+static struct seq_operations jbd2_seq_history_ops = {	/* iterator over a history snapshot */
+	.start  = jbd2_seq_history_start,
+	.next   = jbd2_seq_history_next,
+	.stop   = jbd2_seq_history_stop,
+	.show   = jbd2_seq_history_show,
+};
+
+/* Open "history": give this reader a private snapshot of the history ring. */
+static int jbd2_seq_history_open(struct inode *inode, struct file *file)
+{
+	journal_t *journal = PDE(inode)->data;
+	struct jbd2_stats_proc_session *s;
+	int rc, size;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (s == NULL)
+		return -ENOMEM;
+	size = sizeof(struct transaction_stats_s) * journal->j_history_max;
+	s->stats = kmalloc(size, GFP_KERNEL);
+	if (s->stats == NULL) {	/* check the snapshot buffer, not the session again */
+		kfree(s);
+		return -ENOMEM;
+	}
+	spin_lock(&journal->j_history_lock);	/* copy under the lock writers take */
+	memcpy(s->stats, journal->j_history, size);
+	s->max = journal->j_history_max;
+	s->start = journal->j_history_cur % s->max;
+	spin_unlock(&journal->j_history_lock);
+
+	rc = seq_open(file, &jbd2_seq_history_ops);
+	if (rc == 0) {
+		struct seq_file *m = (struct seq_file *)file->private_data;
+		m->private = s;	/* freed again in jbd2_seq_history_release() */
+	} else {
+		kfree(s->stats);
+		kfree(s);
+	}
+	return rc;
+}
+
+static int jbd2_seq_history_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = (struct seq_file *)file->private_data;
+	struct jbd2_stats_proc_session *s = seq->private;
+	kfree(s->stats);	/* snapshot allocated in jbd2_seq_history_open() */
+	kfree(s);
+	return seq_release(inode, file);
+}
+
+static struct file_operations jbd2_seq_history_fops = {	/* installed on the per-device "history" proc entry */
+	.owner		= THIS_MODULE,
+	.open           = jbd2_seq_history_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = jbd2_seq_history_release,
+};
+
+static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
+{
+	return *pos ? NULL : SEQ_START_TOKEN;	/* the file consists of a single record at position 0 */
+}
+
+static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return NULL;	/* nothing follows the single record; note that *pos is left untouched */
+}
+
+static int jbd2_seq_info_show(struct seq_file *seq, void *v)
+{
+	struct jbd2_stats_proc_session *s = seq->private;
+	if (v != SEQ_START_TOKEN)
+		return 0;
+	seq_printf(seq, "%lu transaction, each upto %u blocks\n",
+			s->stats->ts_tid,	/* in the summed stats ts_tid counts commits */
+			s->journal->j_max_transaction_buffers);
+	if (s->stats->ts_tid == 0)
+		return 0;	/* no commits yet: the averages below would divide by zero */
+	seq_printf(seq, "average: \n  %lums waiting for transaction\n",
+			s->stats->ts_wait / s->stats->ts_tid);
+	seq_printf(seq, "  %lums running transaction\n",
+			s->stats->ts_running / s->stats->ts_tid);
+	seq_printf(seq, "  %lums transaction was being locked\n",
+			s->stats->ts_locked / s->stats->ts_tid);
+	seq_printf(seq, "  %lums flushing data (in ordered mode)\n",
+			s->stats->ts_flushing / s->stats->ts_tid);
+	seq_printf(seq, "  %lums logging transaction\n",
+			s->stats->ts_logging / s->stats->ts_tid);
+	seq_printf(seq, "  %lu handles per transaction\n",
+			s->stats->ts_handle_count / s->stats->ts_tid);
+	seq_printf(seq, "  %lu blocks per transaction\n",
+			s->stats->ts_blocks / s->stats->ts_tid);
+	seq_printf(seq, "  %lu logged blocks per transaction\n",
+			s->stats->ts_blocks_logged / s->stats->ts_tid);
+	return 0;
+}
+
+static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
+{	/* nothing to undo: start() takes no locks or references */
+}
+
+static struct seq_operations jbd2_seq_info_ops = {	/* single-record iterator for the summary */
+	.start  = jbd2_seq_info_start,
+	.next   = jbd2_seq_info_next,
+	.stop   = jbd2_seq_info_stop,
+	.show   = jbd2_seq_info_show,
+};
+
+/* Open "info": give this reader a private copy of the summed journal stats. */
+static int jbd2_seq_info_open(struct inode *inode, struct file *file)
+{
+	journal_t *journal = PDE(inode)->data;
+	struct jbd2_stats_proc_session *s;
+	int rc, size;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (s == NULL)
+		return -ENOMEM;
+	size = sizeof(struct transaction_stats_s);
+	s->stats = kmalloc(size, GFP_KERNEL);
+	if (s->stats == NULL) {	/* check the copy buffer, not the session again */
+		kfree(s);
+		return -ENOMEM;
+	}
+	spin_lock(&journal->j_history_lock);	/* j_stats is updated under this lock */
+	memcpy(s->stats, &journal->j_stats, size);
+	s->journal = journal;
+	spin_unlock(&journal->j_history_lock);
+
+	rc = seq_open(file, &jbd2_seq_info_ops);
+	if (rc == 0) {
+		struct seq_file *m = (struct seq_file *)file->private_data;
+		m->private = s;	/* freed again in jbd2_seq_info_release() */
+	} else {
+		kfree(s->stats);
+		kfree(s);
+	}
+	return rc;
+}
+
+static int jbd2_seq_info_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = (struct seq_file *)file->private_data;
+	struct jbd2_stats_proc_session *s = seq->private;
+	kfree(s->stats);	/* copy allocated in jbd2_seq_info_open() */
+	kfree(s);
+	return seq_release(inode, file);
+}
+
+static struct file_operations jbd2_seq_info_fops = {	/* installed on the per-device "info" proc entry */
+	.owner		= THIS_MODULE,
+	.open           = jbd2_seq_info_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = jbd2_seq_info_release,
+};
+
+static struct proc_dir_entry *proc_jbd2_stats = NULL;	/* parent dir of the per-device stats; NULL until created */
+
+/* Create <device>/history and <device>/info under the jbd2 stats proc dir. */
+static void jbd2_stats_proc_init(journal_t *journal)
+{
+	struct proc_dir_entry *p;
+	char name[64];
+
+	/* bdevname() already formats into name[]; no second copy is needed. */
+	bdevname(journal->j_dev, name);
+	journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
+	if (!journal->j_proc_entry)
+		return;
+	p = create_proc_entry("history", S_IRUGO, journal->j_proc_entry);
+	if (!p)
+		return;
+	p->proc_fops = &jbd2_seq_history_fops;
+	p->data = journal;
+	p = create_proc_entry("info", S_IRUGO, journal->j_proc_entry);
+	if (!p)
+		return;
+	p->proc_fops = &jbd2_seq_info_fops;
+	p->data = journal;
+}
+
+static void jbd2_stats_proc_exit(journal_t *journal)
+{
+	char name[64];
+
+	bdevname(journal->j_dev, name);	/* rebuild the directory name used at init */
+	remove_proc_entry("info", journal->j_proc_entry);
+	remove_proc_entry("history", journal->j_proc_entry);
+	remove_proc_entry(name, proc_jbd2_stats);
+}
+
+/* Set up the per-journal stats lock and the zeroed history ring buffer. */
+static void journal_init_stats(journal_t *journal)
+{
+	int size;
+
+	/* Commit takes this lock even when no history buffer was set up. */
+	spin_lock_init(&journal->j_history_lock);
+	if (proc_jbd2_stats == NULL)
+		return;
+
+	journal->j_history_max = 100;
+	size = sizeof(struct transaction_stats_s) * journal->j_history_max;
+	journal->j_history = kzalloc(size, GFP_KERNEL);
+	/* On allocation failure advertise an empty history. */
+	if (journal->j_history == NULL)
+		journal->j_history_max = 0;
+}
+
 /*
  * Management for journal control blocks: functions to create and
  * destroy journal_t structures, and to initialise and read existing
@@ -682,6 +977,9 @@ static journal_t * journal_init_common (
 		kfree(journal);
 		goto fail;
 	}
+
+	journal_init_stats(journal);
+
 	return journal;
 fail:
 	return NULL;
@@ -732,6 +1030,7 @@ journal_t * jbd2_journal_init_dev(struct
 		journal = NULL;
 		goto out;
 	}
+ 	jbd2_stats_proc_init(journal);
 	journal->j_dev = bdev;
 	journal->j_fs_dev = fs_dev;
 	journal->j_blk_offset = start;
@@ -774,6 +1073,7 @@ journal_t * jbd2_journal_init_inode (str
 
 	journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
 	journal->j_blocksize = inode->i_sb->s_blocksize;
+	jbd2_stats_proc_init(journal);
 
 	/* journal descriptor can store up to n blocks -bzzz */
 	n = journal->j_blocksize / sizeof(journal_block_tag_t);
@@ -1161,6 +1461,8 @@ void jbd2_journal_destroy(journal_t *jou
 		brelse(journal->j_sb_buffer);
 	}
 
+	if (journal->j_proc_entry)
+		jbd2_stats_proc_exit(journal);
 	if (journal->j_inode)
 		iput(journal->j_inode);
 	if (journal->j_revoke)
@@ -2011,6 +2313,28 @@ static void __exit jbd2_remove_jbd_proc_
 
 #endif
 
+#if defined(CONFIG_PROC_FS)
+
+#define JBD2_STATS_PROC_NAME "fs/jbd2"	/* path of the stats root, relative to the proc root */
+
+static void __init jbd2_create_jbd_stats_proc_entry(void)
+{
+	proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL);	/* result is not checked here; users test for NULL */
+}
+
+static void __exit jbd2_remove_jbd_stats_proc_entry(void)
+{
+	if (proc_jbd2_stats)	/* only remove what was actually created */
+		remove_proc_entry(JBD2_STATS_PROC_NAME, NULL);
+}
+
+#else
+	/* Without procfs both hooks compile away to nothing. */
+#define jbd2_create_jbd_stats_proc_entry() do {} while (0)
+#define jbd2_remove_jbd_stats_proc_entry() do {} while (0)
+
+#endif
+
 struct kmem_cache *jbd2_handle_cache;
 
 static int __init journal_init_handle_cache(void)
@@ -2068,6 +2392,7 @@ static int __init journal_init(void)
 	if (ret != 0)
 		jbd2_journal_destroy_caches();
 	create_jbd_proc_entry();
+	jbd2_create_jbd_stats_proc_entry();
 	return ret;
 }
 
@@ -2079,6 +2404,7 @@ static void __exit journal_exit(void)
 		printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
 #endif
 	jbd2_remove_jbd_proc_entry();
+	jbd2_remove_jbd_stats_proc_entry();
 	jbd2_journal_destroy_caches();
 }
 
