The same as reiser4-for-2.6.22.patch plus a fix for file conversion
related bug wich caused metadata corruption when REISER4_DEBUG is on.

diff -urN linux-2.6.22.orig/arch/i386/lib/usercopy.c linux-2.6.22/arch/i386/lib/usercopy.c
--- linux-2.6.22.orig/arch/i386/lib/usercopy.c	2007-07-21 00:32:46.973831675 +0400
+++ linux-2.6.22/arch/i386/lib/usercopy.c	2007-07-29 00:25:34.800676805 +0400
@@ -817,6 +817,7 @@
 #endif
 	return n;
 }
+EXPORT_SYMBOL(__copy_from_user_ll_nocache);
 
 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
 					unsigned long n)
@@ -831,6 +832,7 @@
 #endif
 	return n;
 }
+EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
 
 /**
  * copy_to_user: - Copy a block of data into user space.
diff -urN linux-2.6.22.orig/Documentation/Changes linux-2.6.22/Documentation/Changes
--- linux-2.6.22.orig/Documentation/Changes	2007-07-21 00:31:57.012856483 +0400
+++ linux-2.6.22/Documentation/Changes	2007-07-29 00:25:34.800676805 +0400
@@ -36,6 +36,7 @@
 o  e2fsprogs              1.29                    # tune2fs
 o  jfsutils               1.1.3                   # fsck.jfs -V
 o  reiserfsprogs          3.6.3                   # reiserfsck -V 2>&1|grep reiserfsprogs
+o  reiser4progs           1.0.0                   # fsck.reiser4 -V
 o  xfsprogs               2.6.0                   # xfs_db -V
 o  pcmciautils            004                     # pccardctl -V
 o  quota-tools            3.09                    # quota -V
@@ -144,6 +145,13 @@
 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
 reiserfsck. These utils work on both i386 and alpha platforms.
 
+Reiser4progs
+------------
+
+The reiser4progs package contains utilities for the reiser4 file system.
+Detailed instructions are provided in the README file located at:
+<ftp://ftp.namesys.com/pub/reiser4progs/README>.
+
 Xfsprogs
 --------
 
@@ -322,6 +330,10 @@
 -------------
 o  <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
 
+Reiser4progs
+------------
+o  <ftp://ftp.namesys.com/pub/reiser4progs/>
+
 Xfsprogs
 --------
 o  <ftp://oss.sgi.com/projects/xfs/download/>
diff -urN linux-2.6.22.orig/Documentation/filesystems/reiser4.txt linux-2.6.22/Documentation/filesystems/reiser4.txt
--- linux-2.6.22.orig/Documentation/filesystems/reiser4.txt	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/Documentation/filesystems/reiser4.txt	2007-07-29 00:25:34.800676805 +0400
@@ -0,0 +1,75 @@
+Reiser4 filesystem
+==================
+Reiser4 is a file system based on dancing tree algorithms, and is
+described at http://www.namesys.com
+
+
+References
+==========
+web page		http://namesys.com/v4/v4.html
+source code		ftp://ftp.namesys.com/pub/reiser4-for-2.6/
+userland tools		ftp://ftp.namesys.com/pub/reiser4progs/
+install page		http://www.namesys.com/install_v4.html
+
+Compile options
+===============
+Enable reiser4 debug mode
+       This checks everything imaginable while reiser4
+       runs
+
+Mount options
+=============
+tmgr.atom_max_size=N
+	Atoms containing more than N blocks will be forced to commit.
+	N is decimal.
+	Default is nr_free_pagecache_pages() / 2 at mount time.
+
+tmgr.atom_max_age=N
+	Atoms older than N seconds will be forced to commit. N is decimal.
+	Default is 600.
+
+tmgr.atom_max_flushers=N
+	Limit of concurrent flushers for one atom. 0 means no limit.
+	Default is 0.
+
+tree.cbk_cache.nr_slots=N
+	Number of slots in the cbk cache.
+
+flush.relocate_threshold=N
+	If flush finds more than N adjacent dirty leaf-level blocks it
+	will force them to be relocated.
+	Default is 64.
+
+flush.relocate_distance=N
+	If flush finds can find a block allocation closer than at most
+	N from the preceder it will relocate to that position.
+	Default is 64.
+
+flush.scan_maxnodes=N
+	The maximum number of nodes to scan left on a level during
+	flush.
+	Default is 10000.
+
+optimal_io_size=N
+	Preferred IO size. This value is used to set st_blksize of
+	struct stat.
+	Default is 65536.
+
+bsdgroups
+	Turn on BSD-style gid assignment.
+
+32bittimes
+	By default file in reiser4 have 64 bit timestamps. Files
+	created when filesystem is mounted with 32bittimes mount
+	option will get 32 bit timestamps.
+
+mtflush
+	Turn off concurrent flushing.
+
+nopseudo
+	Disable pseudo files support. See
+	http://namesys.com/v4/pseudo.html for more about pseudo files.
+
+dont_load_bitmap
+	Don't load all bitmap blocks at mount time, it is useful for
+	machines with tiny RAM and large disks.
diff -urN linux-2.6.22.orig/fs/fs-writeback.c linux-2.6.22/fs/fs-writeback.c
--- linux-2.6.22.orig/fs/fs-writeback.c	2007-07-21 00:32:04.502801671 +0400
+++ linux-2.6.22/fs/fs-writeback.c	2007-07-29 00:25:34.808678876 +0400
@@ -296,8 +296,6 @@
  * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
  * that it can be located for waiting on in __writeback_single_inode().
  *
- * Called under inode_lock.
- *
  * If `bdi' is non-zero then we're being asked to writeback a specific queue.
  * This function assumes that the blockdev superblock's inodes are backed by
  * a variety of queues, so all inodes are searched.  For other superblocks,
@@ -313,11 +311,13 @@
  * on the writer throttling path, and we get decent balancing between many
  * throttled threads: we don't want them all piling up on __wait_on_inode.
  */
-static void
-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+void
+generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 {
 	const unsigned long start = jiffies;	/* livelock avoidance */
 
+	spin_lock(&inode_lock);
+
 	if (!wbc->for_kupdate || list_empty(&sb->s_io))
 		list_splice_init(&sb->s_dirty, &sb->s_io);
 
@@ -397,8 +397,19 @@
 		if (wbc->nr_to_write <= 0)
 			break;
 	}
+	spin_unlock(&inode_lock);
 	return;		/* Leave any unwritten inodes on s_io */
 }
+EXPORT_SYMBOL(generic_sync_sb_inodes);
+
+static void
+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+{
+	if (sb->s_op->sync_inodes)
+		sb->s_op->sync_inodes(sb, wbc);
+	else
+		generic_sync_sb_inodes(sb, wbc);
+}
 
 /*
  * Start writeback of dirty pagecache data against all unlocked inodes.
@@ -439,11 +450,8 @@
 			 * be unmounted by the time it is released.
 			 */
 			if (down_read_trylock(&sb->s_umount)) {
-				if (sb->s_root) {
-					spin_lock(&inode_lock);
+				if (sb->s_root)
 					sync_sb_inodes(sb, wbc);
-					spin_unlock(&inode_lock);
-				}
 				up_read(&sb->s_umount);
 			}
 			spin_lock(&sb_lock);
@@ -481,9 +489,7 @@
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused) +
 			nr_dirty + nr_unstable;
 	wbc.nr_to_write += wbc.nr_to_write / 2;		/* Bit more for luck */
-	spin_lock(&inode_lock);
 	sync_sb_inodes(sb, &wbc);
-	spin_unlock(&inode_lock);
 }
 
 /*
diff -urN linux-2.6.22.orig/fs/Kconfig linux-2.6.22/fs/Kconfig
--- linux-2.6.22.orig/fs/Kconfig	2007-07-21 00:32:57.540575927 +0400
+++ linux-2.6.22/fs/Kconfig	2007-07-29 00:25:34.812679911 +0400
@@ -272,6 +272,8 @@
 	default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
 	default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
 
+source "fs/reiser4/Kconfig"
+
 config REISERFS_FS
 	tristate "Reiserfs support"
 	help
diff -urN linux-2.6.22.orig/fs/Makefile linux-2.6.22/fs/Makefile
--- linux-2.6.22.orig/fs/Makefile	2007-07-21 00:32:57.544576967 +0400
+++ linux-2.6.22/fs/Makefile	2007-07-29 00:25:34.812679911 +0400
@@ -66,6 +66,7 @@
  
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
+obj-$(CONFIG_REISER4_FS)	+= reiser4/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT4DEV_FS)	+= ext4/ # Before ext2 so root fs can be ext4dev
 obj-$(CONFIG_JBD)		+= jbd/
diff -urN linux-2.6.22.orig/fs/reiser4/as_ops.c linux-2.6.22/fs/reiser4/as_ops.c
--- linux-2.6.22.orig/fs/reiser4/as_ops.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/as_ops.c	2007-07-29 00:25:34.816680947 +0400
@@ -0,0 +1,337 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Interface to VFS. Reiser4 address_space_operations are defined here. */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/file/file.h"
+#include "plugin/security/perm.h"
+#include "plugin/disk_format/disk_format.h"
+#include "plugin/plugin.h"
+#include "plugin/plugin_set.h"
+#include "plugin/object.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "page_cache.h"
+#include "ktxnmgrd.h"
+#include "super.h"
+#include "reiser4.h"
+#include "entd.h"
+
+#include <linux/profile.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/mm.h>
+#include <linux/buffer_head.h>
+#include <linux/dcache.h>
+#include <linux/list.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/quotaops.h>
+#include <linux/security.h>
+
+/* address space operations */
+
+/**
+ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
+ * @page: page to be dirtied
+ *
+ * Operation of struct address_space_operations. This implementation is used by
+ * unix and cryptcompress file plugins.
+ *
+ * This is called when reiser4 page gets dirtied outside of reiser4, for
+ * example, when dirty bit is moved from pte to physical page.
+ *
+ * Tags page in the mapping's page tree with special tag so that it is possible
+ * to do all the reiser4 specific work wrt dirty pages (jnode creation,
+ * capturing by an atom) later because it can not be done in the contexts where
+ * set_page_dirty is called.
+ */
+int reiser4_set_page_dirty(struct page *page)
+{
+	/* this page can be unformatted only */
+	assert("vs-1734", (page->mapping &&
+			   page->mapping->host &&
+			   reiser4_get_super_fake(page->mapping->host->i_sb) !=
+			   page->mapping->host
+			   && reiser4_get_cc_fake(page->mapping->host->i_sb) !=
+			   page->mapping->host
+			   && reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
+			   page->mapping->host));
+
+	if (!TestSetPageDirty(page)) {
+		struct address_space *mapping = page->mapping;
+
+		if (mapping) {
+			write_lock_irq(&mapping->tree_lock);
+
+			/* check for race with truncate */
+			if (page->mapping) {
+				assert("vs-1652", page->mapping == mapping);
+				if (mapping_cap_account_dirty(mapping))
+					inc_zone_page_state(page,
+							NR_FILE_DIRTY);
+				radix_tree_tag_set(&mapping->page_tree,
+						   page->index,
+						   PAGECACHE_TAG_REISER4_MOVED);
+			}
+			write_unlock_irq(&mapping->tree_lock);
+			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+		}
+	}
+	return 0;
+}
+
+/* ->invalidatepage method for reiser4 */
+
+/*
+ * this is called for each truncated page from
+ * truncate_inode_pages()->truncate_{complete,partial}_page().
+ *
+ * At the moment of call, page is under lock, and outstanding io (if any) has
+ * completed.
+ */
+
+/**
+ * reiser4_invalidatepage
+ * @page: page to invalidate
+ * @offset: starting offset for partial invalidation
+ *
+ */
+void reiser4_invalidatepage(struct page *page, unsigned long offset)
+{
+	int ret = 0;
+	reiser4_context *ctx;
+	struct inode *inode;
+	jnode *node;
+
+	/*
+	 * This is called to truncate file's page.
+	 *
+	 * Originally, reiser4 implemented truncate in a standard way
+	 * (vmtruncate() calls ->invalidatepage() on all truncated pages
+	 * first, then file system ->truncate() call-back is invoked).
+	 *
+	 * This lead to the problem when ->invalidatepage() was called on a
+	 * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
+	 * process. That is, truncate was bypassing transactions. To avoid
+	 * this, try_capture_page_to_invalidate() call was added here.
+	 *
+	 * After many troubles with vmtruncate() based truncate (including
+	 * races with flush, tail conversion, etc.) it was re-written in the
+	 * top-to-bottom style: items are killed in reiser4_cut_tree_object()
+	 * and pages belonging to extent are invalidated in kill_hook_extent().
+	 * So probably now additional call to capture is not needed here.
+	 */
+
+	assert("nikita-3137", PageLocked(page));
+	assert("nikita-3138", !PageWriteback(page));
+	inode = page->mapping->host;
+
+	/*
+	 * ->invalidatepage() should only be called for the unformatted
+	 * jnodes. Destruction of all other types of jnodes is performed
+	 * separately. But, during some corner cases (like handling errors
+	 * during mount) it is simpler to let ->invalidatepage to be called on
+	 * them. Check for this, and do nothing.
+	 */
+	if (reiser4_get_super_fake(inode->i_sb) == inode)
+		return;
+	if (reiser4_get_cc_fake(inode->i_sb) == inode)
+		return;
+	if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
+		return;
+	assert("vs-1426", PagePrivate(page));
+	assert("vs-1427",
+	       page->mapping == jnode_get_mapping(jnode_by_page(page)));
+	assert("", jprivate(page) != NULL);
+	assert("", ergo(inode_file_plugin(inode) !=
+			file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
+			offset == 0));
+
+	ctx = reiser4_init_context(inode->i_sb);
+	if (IS_ERR(ctx))
+		return;
+
+	node = jprivate(page);
+	spin_lock_jnode(node);
+	if (!(node->state & ((1 << JNODE_DIRTY) | (1<< JNODE_FLUSH_QUEUED) |
+			  (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
+		/* there is not need to capture */
+		jref(node);
+		JF_SET(node, JNODE_HEARD_BANSHEE);
+		page_clear_jnode(page, node);
+		reiser4_uncapture_jnode(node);
+		unhash_unformatted_jnode(node);
+		jput(node);
+		reiser4_exit_context(ctx);
+		return;
+	}
+	spin_unlock_jnode(node);
+
+	/* capture page being truncated. */
+	ret = try_capture_page_to_invalidate(page);
+	if (ret != 0)
+		warning("nikita-3141", "Cannot capture: %i", ret);
+
+	if (offset == 0) {
+		/* remove jnode from transaction and detach it from page. */
+		jref(node);
+		JF_SET(node, JNODE_HEARD_BANSHEE);
+		/* page cannot be detached from jnode concurrently, because it
+		 * is locked */
+		reiser4_uncapture_page(page);
+
+		/* this detaches page from jnode, so that jdelete will not try
+		 * to lock page which is already locked */
+		spin_lock_jnode(node);
+		page_clear_jnode(page, node);
+		spin_unlock_jnode(node);
+		unhash_unformatted_jnode(node);
+
+		jput(node);
+	}
+
+	reiser4_exit_context(ctx);
+}
+
+/* help function called from reiser4_releasepage(). It returns true if jnode
+ * can be detached from its page and page released. */
+int jnode_is_releasable(jnode * node /* node to check */ )
+{
+	assert("nikita-2781", node != NULL);
+	assert_spin_locked(&(node->guard));
+	assert_spin_locked(&(node->load));
+
+	/* is some thread is currently using jnode page, later cannot be
+	 * detached */
+	if (atomic_read(&node->d_count) != 0) {
+		return 0;
+	}
+
+	assert("vs-1214", !jnode_is_loaded(node));
+
+	/*
+	 * can only release page if real block number is assigned to it. Simple
+	 * check for ->atom wouldn't do, because it is possible for node to be
+	 * clean, not it atom yet, and still having fake block number. For
+	 * example, node just created in jinit_new().
+	 */
+	if (reiser4_blocknr_is_fake(jnode_get_block(node)))
+		return 0;
+
+	/*
+	 * pages prepared for write can not be released anyway, so avoid
+	 * detaching jnode from the page
+	 */
+	if (JF_ISSET(node, JNODE_WRITE_PREPARED))
+		return 0;
+
+	/*
+	 * dirty jnode cannot be released. It can however be submitted to disk
+	 * as part of early flushing, but only after getting flush-prepped.
+	 */
+	if (JF_ISSET(node, JNODE_DIRTY))
+		return 0;
+
+	/* overwrite set is only written by log writer. */
+	if (JF_ISSET(node, JNODE_OVRWR))
+		return 0;
+
+	/* jnode is already under writeback */
+	if (JF_ISSET(node, JNODE_WRITEBACK))
+		return 0;
+
+	/* don't flush bitmaps or journal records */
+	if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * ->releasepage method for reiser4
+ *
+ * This is called by VM scanner when it comes across clean page.  What we have
+ * to do here is to check whether page can really be released (freed that is)
+ * and if so, detach jnode from it and remove page from the page cache.
+ *
+ * Check for releasability is done by releasable() function.
+ */
+int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
+{
+	jnode *node;
+
+	assert("nikita-2257", PagePrivate(page));
+	assert("nikita-2259", PageLocked(page));
+	assert("nikita-2892", !PageWriteback(page));
+	assert("nikita-3019", reiser4_schedulable());
+
+	/* NOTE-NIKITA: this can be called in the context of reiser4 call. It
+	   is not clear what to do in this case. A lot of deadlocks seems be
+	   possible. */
+
+	node = jnode_by_page(page);
+	assert("nikita-2258", node != NULL);
+	assert("reiser4-4", page->mapping != NULL);
+	assert("reiser4-5", page->mapping->host != NULL);
+
+	if (PageDirty(page))
+		return 0;
+
+	/* extra page reference is used by reiser4 to protect
+	 * jnode<->page link from this ->releasepage(). */
+	if (page_count(page) > 3)
+		return 0;
+
+	/* releasable() needs jnode lock, because it looks at the jnode fields
+	 * and we need jload_lock here to avoid races with jload(). */
+	spin_lock_jnode(node);
+	spin_lock(&(node->load));
+	if (jnode_is_releasable(node)) {
+		struct address_space *mapping;
+
+		mapping = page->mapping;
+		jref(node);
+		/* there is no need to synchronize against
+		 * jnode_extent_write() here, because pages seen by
+		 * jnode_extent_write() are !releasable(). */
+		page_clear_jnode(page, node);
+		spin_unlock(&(node->load));
+		spin_unlock_jnode(node);
+
+		/* we are under memory pressure so release jnode also. */
+		jput(node);
+
+		return 1;
+	} else {
+		spin_unlock(&(node->load));
+		spin_unlock_jnode(node);
+		assert("nikita-3020", reiser4_schedulable());
+		return 0;
+	}
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/block_alloc.c linux-2.6.22/fs/reiser4/block_alloc.c
--- linux-2.6.22.orig/fs/reiser4/block_alloc.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/block_alloc.c	2007-07-29 00:25:34.816680947 +0400
@@ -0,0 +1,1137 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "debug.h"
+#include "dformat.h"
+#include "plugin/plugin.h"
+#include "txnmgr.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "super.h"
+
+#include <linux/types.h>	/* for __u??  */
+#include <linux/fs.h>		/* for struct super_block  */
+#include <linux/spinlock.h>
+
+/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
+
+/* We need to be able to reserve enough disk space to ensure that an atomic
+   operation will have enough disk space to flush (see flush.c and
+   http://namesys.com/v4/v4.html) and commit it once it is started.
+
+   In our design a call for reserving disk space may fail but not an actual
+   block allocation.
+
+   All free blocks, already allocated blocks, and all kinds of reserved blocks
+   are counted in different per-fs block counters.
+
+   A reiser4 super block's set of block counters currently is:
+
+   free -- free blocks,
+   used -- already allocated blocks,
+
+   grabbed -- initially reserved for performing an fs operation, those blocks
+          are taken from free blocks, then grabbed disk space leaks from grabbed
+          blocks counter to other counters like "fake allocated", "flush
+          reserved", "used", the rest of not used grabbed space is returned to
+          free space at the end of fs operation;
+
+   fake allocated -- counts all nodes without real disk block numbers assigned,
+                     we have separate accounting for formatted and unformatted
+                     nodes (for easier debugging);
+
+   flush reserved -- disk space needed for flushing and committing an atom.
+                     Each dirty already allocated block could be written as a
+                     part of atom's overwrite set or as a part of atom's
+                     relocate set.  In both case one additional block is needed,
+                     it is used as a wandered block if we do overwrite or as a
+		     new location for a relocated block.
+
+   In addition, blocks in some states are counted on per-thread and per-atom
+   basis.  A reiser4 context has a counter of blocks grabbed by this transaction
+   and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
+   of each reiser4 context.  Each reiser4 atom has a counter of "flush reserved"
+   blocks, which are reserved for flush processing and atom commit. */
+
+/* AN EXAMPLE: suppose we insert new item to the reiser4 tree.  We estimate
+   number of blocks to grab for most expensive case of balancing when the leaf
+   node we insert new item to gets split and new leaf node is allocated.
+
+   So, we need to grab blocks for
+
+   1) one block for possible dirtying the node we insert an item to. That block
+      would be used for node relocation at flush time or for allocating of a
+      wandered one, it depends what will be a result (what set, relocate or
+      overwrite the node gets assigned to) of the node processing by the flush
+      algorithm.
+
+   2) one block for either allocating a new node, or dirtying of right or left
+      clean neighbor, only one case may happen.
+
+   VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
+   node, and creation of new node.  have I forgotten something?  email me.
+
+   These grabbed blocks are counted in both reiser4 context "grabbed blocks"
+   counter and in the fs-wide one (both ctx->grabbed_blocks and
+   sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
+   decremented by 2.
+
+   Suppose both two blocks were spent for dirtying of an already allocated clean
+   node (one block went from "grabbed" to "flush reserved") and for new block
+   allocating (one block went from "grabbed" to "fake allocated formatted").
+
+   Inserting of a child pointer to the parent node caused parent node to be
+   split, the balancing code takes care about this grabbing necessary space
+   immediately by calling reiser4_grab with BA_RESERVED flag set which means
+   "can use the 5% reserved disk space".
+
+   At this moment insertion completes and grabbed blocks (if they were not used)
+   should be returned to the free space counter.
+
+   However the atom life-cycle is not completed.  The atom had one "flush
+   reserved" block added by our insertion and the new fake allocated node is
+   counted as a "fake allocated formatted" one.  The atom has to be fully
+   processed by flush before commit.  Suppose that the flush moved the first,
+   already allocated node to the atom's overwrite list, the new fake allocated
+   node, obviously, went into the atom relocate set.  The reiser4 flush
+   allocates the new node using one unit from "fake allocated formatted"
+   counter, the log writer uses one from "flush reserved" for wandered block
+   allocation.
+
+   And, it is not the end.  When the wandered block is deallocated after the
+   atom gets fully played (see wander.c for term description), the disk space
+   occupied for it is returned to free blocks. */
+
+/* BLOCK NUMBERS */
+
+/* Any reiser4 node has a block number assigned to it.  We use these numbers for
+   indexing in hash tables, so if a block has not yet been assigned a location
+   on disk we need to give it a temporary fake block number.
+
+   Current implementation of reiser4 uses 64-bit integers for block numbers. We
+   use highest bit in 64-bit block number to distinguish fake and real block
+   numbers. So, only 63 bits may be used to addressing of real device
+   blocks. That "fake" block numbers space is divided into subspaces of fake
+   block numbers for data blocks and for shadow (working) bitmap blocks.
+
+   Fake block numbers for data blocks are generated by a cyclic counter, which
+   gets incremented after each real block allocation. We assume that it is
+   impossible to overload this counter during one transaction life. */
+
+/* Initialize a blocknr hint. */
+void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
+{
+	memset(hint, 0, sizeof(reiser4_blocknr_hint));
+}
+
+/* Release any resources of a blocknr hint. */
+void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
+{
+	/* No resources should be freed in current blocknr_hint implementation. */
+}
+
+/* see above for explanation of fake block number.  */
+/* Audited by: green(2002.06.11) */
+int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
+{
+	/* The reason for not simply returning result of '&' operation is that
+	   while return value is (possibly 32bit) int,  the reiser4_block_nr is
+	   at least 64 bits long, and high bit (which is the only possible
+	   non zero bit after the masking) would be stripped off */
+	return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
+}
+
+/* Static functions for <reiser4 super block>/<reiser4 context> block counters
+   arithmetic. Mostly, they are isolated to not to code same assertions in
+   several places. */
+static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
+{
+	BUG_ON(ctx->grabbed_blocks < count);
+	assert("zam-527", ctx->grabbed_blocks >= count);
+	ctx->grabbed_blocks -= count;
+}
+
+static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
+{
+	ctx->grabbed_blocks += count;
+}
+
+static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("zam-525", sbinfo->blocks_grabbed >= count);
+	sbinfo->blocks_grabbed -= count;
+}
+
+/* Decrease the counter of block reserved for flush in super block. */
+static void
+sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
+	sbinfo->blocks_flush_reserved -= count;
+}
+
+static void
+sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
+			   reiser4_ba_flags_t flags)
+{
+	if (flags & BA_FORMATTED) {
+		assert("zam-806", sbinfo->blocks_fake_allocated >= count);
+		sbinfo->blocks_fake_allocated -= count;
+	} else {
+		assert("zam-528",
+		       sbinfo->blocks_fake_allocated_unformatted >= count);
+		sbinfo->blocks_fake_allocated_unformatted -= count;
+	}
+}
+
+static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("zam-530",
+	       sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
+	sbinfo->blocks_used -= count;
+}
+
+static void
+sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("edward-501", sbinfo->blocks_clustered >= count);
+	sbinfo->blocks_clustered -= count;
+}
+
+/* Increase the counter of block reserved for flush in atom. */
+static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
+{
+	assert("zam-772", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+	atom->flush_reserved += count;
+}
+
+/* Decrease the counter of block reserved for flush in atom. */
+static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
+{
+	assert("zam-774", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+	assert("nikita-2790", atom->flush_reserved >= count);
+	atom->flush_reserved -= count;
+}
+
+/* super block has 6 counters: free, used, grabbed, fake allocated
+   (formatted and unformatted) and flush reserved. Their sum must be
+   number of blocks on a device. This function checks this */
+int reiser4_check_block_counters(const struct super_block *super)
+{
+	__u64 sum;
+
+	sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
+	    reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
+	    reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) +
+	    reiser4_clustered_blocks(super);
+	if (reiser4_block_count(super) != sum) {
+		printk("super block counters: "
+		       "used %llu, free %llu, "
+		       "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), "
+		       "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
+		       (unsigned long long)reiser4_data_blocks(super),
+		       (unsigned long long)reiser4_free_blocks(super),
+		       (unsigned long long)reiser4_grabbed_blocks(super),
+		       (unsigned long long)reiser4_fake_allocated(super),
+		       (unsigned long long)
+		       reiser4_fake_allocated_unformatted(super),
+		       (unsigned long long)reiser4_flush_reserved(super),
+		       (unsigned long long)reiser4_clustered_blocks(super),
+		       (unsigned long long)sum,
+		       (unsigned long long)reiser4_block_count(super));
+		return 0;
+	}
+	return 1;
+}
+
+/* Adjust "working" free blocks counter for number of blocks we are going to
+   allocate.  Record number of grabbed blocks in fs-wide and per-thread
+   counters.  This function should be called before bitmap scanning or
+   allocating fake block numbers
+
+   @super           -- pointer to reiser4 super block;
+   @count           -- number of blocks we reserve;
+
+   @return          -- 0 if success,  -ENOSPC, if all
+                       free blocks are preserved or already allocated.
+*/
+
+static int
+reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
+{
+	__u64 free_blocks;
+	int ret = 0, use_reserved = flags & BA_RESERVED;
+	reiser4_super_info_data *sbinfo;
+
+	assert("vs-1276", ctx == get_current_context());
+
+	/* Do not grab anything on ro-mounted fs. */
+	if (rofs_super(ctx->super)) {
+		ctx->grab_enabled = 0;
+		return 0;
+	}
+
+	sbinfo = get_super_private(ctx->super);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	free_blocks = sbinfo->blocks_free;
+
+	if ((use_reserved && free_blocks < count) ||
+	    (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
+		ret = RETERR(-ENOSPC);
+		goto unlock_and_ret;
+	}
+
+	add_to_ctx_grabbed(ctx, count);
+
+	sbinfo->blocks_grabbed += count;
+	sbinfo->blocks_free -= count;
+
+#if REISER4_DEBUG
+	if (ctx->grabbed_initially == 0)
+		ctx->grabbed_initially = count;
+#endif
+
+	assert("nikita-2986", reiser4_check_block_counters(ctx->super));
+
+	/* disable grab space in current context */
+	ctx->grab_enabled = 0;
+
+      unlock_and_ret:
+	spin_unlock_reiser4_super(sbinfo);
+
+	return ret;
+}
+
+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
+{
+	int ret;
+	reiser4_context *ctx;
+
+	assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
+				   lock_stack_isclean(get_current_lock_stack
+						      ())));
+	ctx = get_current_context();
+	if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
+		return 0;
+	}
+
+	ret = reiser4_grab(ctx, count, flags);
+	if (ret == -ENOSPC) {
+
+		/* Trying to commit the all transactions if BA_CAN_COMMIT flag present */
+		if (flags & BA_CAN_COMMIT) {
+			txnmgr_force_commit_all(ctx->super, 0);
+			ctx->grab_enabled = 1;
+			ret = reiser4_grab(ctx, count, flags);
+		}
+	}
+	/*
+	 * allocation from reserved pool cannot fail. This is severe error.
+	 */
+	assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
+	return ret;
+}
+
+/*
+ * SPACE RESERVED FOR UNLINK/TRUNCATE
+ *
+ * Unlink and truncate require space in transaction (to update stat data, at
+ * least). But we don't want rm(1) to fail with "No space on device" error.
+ *
+ * Solution is to reserve 5% of disk space for truncates and
+ * unlinks. Specifically, normal space grabbing requests don't grab space from
+ * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
+ * drain it. Per super block delete mutex is used to allow only one
+ * thread at a time to grab from reserved area.
+ *
+ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
+ * flag.
+ *
+ */
+
+int reiser4_grab_reserved(struct super_block *super,
+			  __u64 count, reiser4_ba_flags_t flags)
+{
+	reiser4_super_info_data *sbinfo = get_super_private(super);
+
+	assert("nikita-3175", flags & BA_CAN_COMMIT);
+
+	/* Check the delete mutex already taken by us, we assume that
+	 * reading of machine word is atomic. */
+	if (sbinfo->delete_mutex_owner == current) {
+		if (reiser4_grab_space
+		    (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
+			warning("zam-1003",
+				"nested call of grab_reserved fails count=(%llu)",
+				(unsigned long long)count);
+			reiser4_release_reserved(super);
+			return RETERR(-ENOSPC);
+		}
+		return 0;
+	}
+
+	if (reiser4_grab_space(count, flags)) {
+		mutex_lock(&sbinfo->delete_mutex);
+		assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
+		sbinfo->delete_mutex_owner = current;
+
+		if (reiser4_grab_space(count, flags | BA_RESERVED)) {
+			warning("zam-833",
+				"reserved space is not enough (%llu)",
+				(unsigned long long)count);
+			reiser4_release_reserved(super);
+			return RETERR(-ENOSPC);
+		}
+	}
+	return 0;
+}
+
+void reiser4_release_reserved(struct super_block *super)
+{
+	reiser4_super_info_data *info;
+
+	info = get_super_private(super);
+	if (info->delete_mutex_owner == current) {
+		info->delete_mutex_owner = NULL;
+		mutex_unlock(&info->delete_mutex);
+	}
+}
+
+static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sub_from_ctx_grabbed(ctx, count);
+
+	sbinfo = get_super_private(ctx->super);
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	/* return sbinfo locked */
+	return sbinfo;
+}
+
+/* is called after @count fake block numbers are allocated and pointer to
+   those blocks are inserted into tree. */
+static void grabbed2fake_allocated_formatted(void)
+{
+	reiser4_super_info_data *sbinfo;
+
+	sbinfo = grabbed2fake_allocated_head(1);
+	sbinfo->blocks_fake_allocated++;
+
+	assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/**
+ * grabbed2fake_allocated_unformatted
+ * @count:
+ *
+ */
+static void grabbed2fake_allocated_unformatted(int count)
+{
+	reiser4_super_info_data *sbinfo;
+
+	sbinfo = grabbed2fake_allocated_head(count);
+	sbinfo->blocks_fake_allocated_unformatted += count;
+
+	assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2cluster_reserved(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sub_from_ctx_grabbed(ctx, count);
+
+	sbinfo = get_super_private(ctx->super);
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	sbinfo->blocks_clustered += count;
+
+	assert("edward-504", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void cluster_reserved2grabbed(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+
+	sbinfo = get_super_private(ctx->super);
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_cluster_reserved(sbinfo, count);
+	sbinfo->blocks_grabbed += count;
+
+	assert("edward-505", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+	add_to_ctx_grabbed(ctx, count);
+}
+
+void cluster_reserved2free(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	cluster_reserved2grabbed(count);
+	grabbed2free(ctx, sbinfo, count);
+}
+
+static DEFINE_SPINLOCK(fake_lock);
+static reiser4_block_nr fake_gen = 0;
+
+/**
+ * assign_fake_blocknr
+ * @blocknr:
+ * @count:
+ *
+ * Obtain a fake block number for new node which will be used to refer to
+ * this newly allocated node until real allocation is done.
+ */
+static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
+{
+	spin_lock(&fake_lock);
+	*blocknr = fake_gen;
+	fake_gen += count;
+	spin_unlock(&fake_lock);
+
+	BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
+	/**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
+	*blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
+	assert("zam-394", zlook(current_tree, blocknr) == NULL);
+}
+
+int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
+{
+	assign_fake_blocknr(blocknr, 1);
+	grabbed2fake_allocated_formatted();
+	return 0;
+}
+
+/**
+ * fake_blocknrs_unformatted
+ * @count: number of fake numbers to get
+ *
+ * Allocates @count fake block numbers which will be assigned to jnodes
+ */
+reiser4_block_nr fake_blocknr_unformatted(int count)
+{
+	reiser4_block_nr blocknr;
+
+	assign_fake_blocknr(&blocknr, count);
+	grabbed2fake_allocated_unformatted(count);
+
+	return blocknr;
+}
+
+/* adjust sb block counters, if real (on-disk) block allocation immediately
+   follows grabbing of free disk space. */
+static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
+			 __u64 count)
+{
+	sub_from_ctx_grabbed(ctx, count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	sbinfo->blocks_used += count;
+
+	assert("nikita-2679", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* adjust sb block counters when @count unallocated blocks get mapped to disk */
+static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
+				reiser4_ba_flags_t flags)
+{
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_fake_allocated(sbinfo, count, flags);
+	sbinfo->blocks_used += count;
+
+	assert("nikita-2680",
+	       reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+static void flush_reserved2used(txn_atom * atom, __u64 count)
+{
+	reiser4_super_info_data *sbinfo;
+
+	assert("zam-787", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+
+	sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
+
+	sbinfo = get_current_super_private();
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_flush_reserved(sbinfo, count);
+	sbinfo->blocks_used += count;
+
+	assert("zam-789",
+	       reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* update the per fs  blocknr hint default value. */
+void
+update_blocknr_hint_default(const struct super_block *s,
+			    const reiser4_block_nr * block)
+{
+	reiser4_super_info_data *sbinfo = get_super_private(s);
+
+	assert("nikita-3342", !reiser4_blocknr_is_fake(block));
+
+	spin_lock_reiser4_super(sbinfo);
+	if (*block < sbinfo->block_count) {
+		sbinfo->blocknr_hint_default = *block;
+	} else {
+		warning("zam-676",
+			"block number %llu is too large to be used in a blocknr hint\n",
+			(unsigned long long)*block);
+		dump_stack();
+		DEBUGON(1);
+	}
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* get current value of the default blocknr hint. */
+void get_blocknr_hint_default(reiser4_block_nr * result)
+{
+	reiser4_super_info_data *sbinfo = get_current_super_private();
+
+	spin_lock_reiser4_super(sbinfo);
+	*result = sbinfo->blocknr_hint_default;
+	assert("zam-677", *result < sbinfo->block_count);
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* Allocate "real" disk blocks by calling a proper space allocation plugin
+ * method. Blocks are allocated in one contiguous disk region. The plugin
+ * independent part accounts blocks by subtracting allocated amount from grabbed
+ * or fake block counter and add the same amount to the counter of allocated
+ * blocks.
+ *
+ * @hint -- a reiser4 blocknr hint object which contains further block
+ *          allocation hints and parameters (search start, a stage of block
+ *          which will be mapped to disk, etc.),
+ * @blk  -- an out parameter for the beginning of the allocated region,
+ * @len  -- in/out parameter, it should contain the maximum number of allocated
+ *          blocks, after block allocation completes, it contains the length of
+ *          allocated disk region.
+ * @flags -- see reiser4_ba_flags_t description.
+ *
+ * @return -- 0 if success, error code otherwise.
+ */
+int
+reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
+		     reiser4_block_nr * len, reiser4_ba_flags_t flags)
+{
+	__u64 needed = *len;
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+	int ret;
+
+	assert("zam-986", hint != NULL);
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	/* For write-optimized data we use default search start value, which is
+	 * close to last write location. */
+	if (flags & BA_USE_DEFAULT_SEARCH_START) {
+		get_blocknr_hint_default(&hint->blk);
+	}
+
+	/* VITALY: allocator should grab this for internal/tx-lists/similar only. */
+/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
+	if (hint->block_stage == BLOCK_NOT_COUNTED) {
+		ret = reiser4_grab_space_force(*len, flags);
+		if (ret != 0)
+			return ret;
+	}
+
+	ret =
+	    sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
+			    hint, (int)needed, blk, len);
+
+	if (!ret) {
+		assert("zam-680", *blk < reiser4_block_count(ctx->super));
+		assert("zam-681",
+		       *blk + *len <= reiser4_block_count(ctx->super));
+
+		if (flags & BA_PERMANENT) {
+			/* we assume that current atom exists at this moment */
+			txn_atom *atom = get_current_atom_locked();
+			atom->nr_blocks_allocated += *len;
+			spin_unlock_atom(atom);
+		}
+
+		switch (hint->block_stage) {
+		case BLOCK_NOT_COUNTED:
+		case BLOCK_GRABBED:
+			grabbed2used(ctx, sbinfo, *len);
+			break;
+		case BLOCK_UNALLOCATED:
+			fake_allocated2used(sbinfo, *len, flags);
+			break;
+		case BLOCK_FLUSH_RESERVED:
+			{
+				txn_atom *atom = get_current_atom_locked();
+				flush_reserved2used(atom, *len);
+				spin_unlock_atom(atom);
+			}
+			break;
+		default:
+			impossible("zam-531", "wrong block stage");
+		}
+	} else {
+		assert("zam-821",
+		       ergo(hint->max_dist == 0
+			    && !hint->backward, ret != -ENOSPC));
+		if (hint->block_stage == BLOCK_NOT_COUNTED)
+			grabbed2free(ctx, sbinfo, needed);
+	}
+
+	return ret;
+}
+
+/* used -> fake_allocated -> grabbed -> free */
+
+/* adjust sb block counters when @count unallocated blocks get unmapped from
+   disk */
+static void
+used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
+		    int formatted)
+{
+	spin_lock_reiser4_super(sbinfo);
+
+	if (formatted)
+		sbinfo->blocks_fake_allocated += count;
+	else
+		sbinfo->blocks_fake_allocated_unformatted += count;
+
+	sub_from_sb_used(sbinfo, count);
+
+	assert("nikita-2681",
+	       reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+static void
+used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
+		    __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
+{
+	assert("nikita-2791", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+
+	add_to_atom_flush_reserved_nolock(atom, (__u32) count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sbinfo->blocks_flush_reserved += count;
+	/*add_to_sb_flush_reserved(sbinfo, count); */
+	sub_from_sb_used(sbinfo, count);
+
+	assert("nikita-2681",
+	       reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* disk space, virtually used by fake block numbers is counted as "grabbed" again. */
+static void
+fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
+		       __u64 count, reiser4_ba_flags_t flags)
+{
+	add_to_ctx_grabbed(ctx, count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	assert("nikita-2682", reiser4_check_block_counters(ctx->super));
+
+	sbinfo->blocks_grabbed += count;
+	sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
+
+	assert("nikita-2683", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	fake_allocated2grabbed(ctx, sbinfo, count, flags);
+	grabbed2free(ctx, sbinfo, count);
+}
+
+void grabbed2free_mark(__u64 mark)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	assert("nikita-3007", (__s64) mark >= 0);
+	assert("nikita-3006", ctx->grabbed_blocks >= mark);
+	grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
+}
+
+/**
+ * grabbed2free - adjust grabbed and free block counters
+ * @ctx: context to update grabbed block counter of
+ * @sbinfo: super block to update grabbed and free block counters of
+ * @count: number of blocks to adjust counters by
+ *
+ * Decreases context's and per filesystem's counters of grabbed
+ * blocks. Increases per filesystem's counter of free blocks.
+ */
+void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
+		  __u64 count)
+{
+	sub_from_ctx_grabbed(ctx, count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	sbinfo->blocks_free += count;
+	assert("nikita-2684", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	assert("vs-1095", atom);
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	sub_from_ctx_grabbed(ctx, count);
+
+	add_to_atom_flush_reserved_nolock(atom, count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sbinfo->blocks_flush_reserved += count;
+	sub_from_sb_grabbed(sbinfo, count);
+
+	assert("vpf-292", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2flush_reserved(__u64 count)
+{
+	txn_atom *atom = get_current_atom_locked();
+
+	grabbed2flush_reserved_nolock(atom, count);
+
+	spin_unlock_atom(atom);
+}
+
+void flush_reserved2grabbed(txn_atom * atom, __u64 count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	assert("nikita-2788", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	add_to_ctx_grabbed(ctx, count);
+
+	sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sbinfo->blocks_grabbed += count;
+	sub_from_sb_flush_reserved(sbinfo, count);
+
+	assert("vpf-292", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/**
+ * all_grabbed2free - releases all blocks grabbed in context
+ *
+ * Decreases context's and super block's grabbed block counters by number of
+ * blocks grabbed by current context and increases super block's free block
+ * counter correspondingly.
+ */
+void all_grabbed2free(void)
+{
+	reiser4_context *ctx = get_current_context();
+
+	grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
+}
+
+/* adjust sb block counters if real (on-disk) blocks do not become unallocated
+   after freeing, @count blocks become "grabbed". */
+static void
+used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
+	     __u64 count)
+{
+	add_to_ctx_grabbed(ctx, count);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	sbinfo->blocks_grabbed += count;
+	sub_from_sb_used(sbinfo, count);
+
+	assert("nikita-2685", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/* this used to be done through used2grabbed and grabbed2free*/
+static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	spin_lock_reiser4_super(sbinfo);
+
+	sbinfo->blocks_free += count;
+	sub_from_sb_used(sbinfo, count);
+
+	assert("nikita-2685",
+	       reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+#if REISER4_DEBUG
+
+/* check "allocated" state of given block range */
+static void
+reiser4_check_blocks(const reiser4_block_nr * start,
+		     const reiser4_block_nr * len, int desired)
+{
+	sa_check_blocks(start, len, desired);
+}
+
+/* check "allocated" state of given block */
+void reiser4_check_block(const reiser4_block_nr * block, int desired)
+{
+	const reiser4_block_nr one = 1;
+
+	reiser4_check_blocks(block, &one, desired);
+}
+
+#endif
+
+/* Blocks deallocation function may do an actual deallocation through space
+   plugin allocation or store deleted block numbers in atom's delete_set data
+   structure depend on @defer parameter. */
+
+/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which
+   will be deleted from WORKING bitmap. They might be just unmapped from disk, or
+   freed but disk space is still grabbed by current thread, or these blocks must
+   not be counted in any reiser4 sb block counters, see block_stage_t comment */
+
+/* BA_FORMATTED bit is only used when BA_DEFER in not present: it is used to
+   distinguish blocks allocated for unformatted and formatted nodes */
+
+int
+reiser4_dealloc_blocks(const reiser4_block_nr * start,
+		       const reiser4_block_nr * len,
+		       block_stage_t target_stage, reiser4_ba_flags_t flags)
+{
+	txn_atom *atom = NULL;
+	int ret;
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	if (REISER4_DEBUG) {
+		assert("zam-431", *len != 0);
+		assert("zam-432", *start != 0);
+		assert("zam-558", !reiser4_blocknr_is_fake(start));
+
+		spin_lock_reiser4_super(sbinfo);
+		assert("zam-562", *start < sbinfo->block_count);
+		spin_unlock_reiser4_super(sbinfo);
+	}
+
+	if (flags & BA_DEFER) {
+		blocknr_set_entry *bsep = NULL;
+
+		/* storing deleted block numbers in a blocknr set
+		   datastructure for further actual deletion */
+		do {
+			atom = get_current_atom_locked();
+			assert("zam-430", atom != NULL);
+
+			ret =
+			    blocknr_set_add_extent(atom, &atom->delete_set,
+						   &bsep, start, len);
+
+			if (ret == -ENOMEM)
+				return ret;
+
+			/* This loop might spin at most two times */
+		} while (ret == -E_REPEAT);
+
+		assert("zam-477", ret == 0);
+		assert("zam-433", atom != NULL);
+
+		spin_unlock_atom(atom);
+
+	} else {
+		assert("zam-425", get_current_super_private() != NULL);
+		sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super),
+				  *start, *len);
+
+		if (flags & BA_PERMANENT) {
+			/* These blocks were counted as allocated, we have to revert it
+			 * back if allocation is discarded. */
+			txn_atom *atom = get_current_atom_locked();
+			atom->nr_blocks_allocated -= *len;
+			spin_unlock_atom(atom);
+		}
+
+		switch (target_stage) {
+		case BLOCK_NOT_COUNTED:
+			assert("vs-960", flags & BA_FORMATTED);
+			/* VITALY: This is what was grabbed for internal/tx-lists/similar only */
+			used2free(sbinfo, *len);
+			break;
+
+		case BLOCK_GRABBED:
+			used2grabbed(ctx, sbinfo, *len);
+			break;
+
+		case BLOCK_UNALLOCATED:
+			used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
+			break;
+
+		case BLOCK_FLUSH_RESERVED:{
+				txn_atom *atom;
+
+				atom = get_current_atom_locked();
+				used2flush_reserved(sbinfo, atom, *len,
+						    flags & BA_FORMATTED);
+				spin_unlock_atom(atom);
+				break;
+			}
+		default:
+			impossible("zam-532", "wrong block stage");
+		}
+	}
+
+	return 0;
+}
+
+/* wrappers for block allocator plugin methods */
+int reiser4_pre_commit_hook(void)
+{
+	assert("zam-502", get_current_super_private() != NULL);
+	sa_pre_commit_hook();
+	return 0;
+}
+
+/* an actor which applies delete set to block allocator data */
+static int
+apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
+	   const reiser4_block_nr * b, void *data UNUSED_ARG)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	__u64 len = 1;
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
+	assert("zam-552", sbinfo != NULL);
+
+	if (b != NULL)
+		len = *b;
+
+	if (REISER4_DEBUG) {
+		spin_lock_reiser4_super(sbinfo);
+
+		assert("zam-554", *a < reiser4_block_count(ctx->super));
+		assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
+
+		spin_unlock_reiser4_super(sbinfo);
+	}
+
+	sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
+	/* adjust sb block counters */
+	used2free(sbinfo, len);
+	return 0;
+}
+
+void reiser4_post_commit_hook(void)
+{
+	txn_atom *atom;
+
+	atom = get_current_atom_locked();
+	assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
+	spin_unlock_atom(atom);
+
+	/* do the block deallocation which was deferred
+	   until commit is done */
+	blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
+
+	assert("zam-504", get_current_super_private() != NULL);
+	sa_post_commit_hook();
+}
+
+void reiser4_post_write_back_hook(void)
+{
+	assert("zam-504", get_current_super_private() != NULL);
+
+	sa_post_commit_hook();
+}
+
+/*
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/block_alloc.h linux-2.6.22/fs/reiser4/block_alloc.h
--- linux-2.6.22.orig/fs/reiser4/block_alloc.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/block_alloc.h	2007-07-29 00:25:34.820681982 +0400
@@ -0,0 +1,175 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
+#define __FS_REISER4_BLOCK_ALLOC_H__
+
+#include "dformat.h"
+#include "forward.h"
+
+#include <linux/types.h>	/* for __u??  */
+#include <linux/fs.h>
+
+/* Mask when is applied to given block number shows is that block number is a fake one */
+#define REISER4_FAKE_BLOCKNR_BIT_MASK   0x8000000000000000ULL
+/* Mask which isolates a type of object this fake block number was assigned to */
+#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
+
+/*result after applying the REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
+   against these two values to understand is the object unallocated or bitmap
+   shadow object (WORKING BITMAP block, look at the plugin/space/bitmap.c) */
+#define REISER4_UNALLOCATED_STATUS_VALUE    0xC000000000000000ULL
+#define REISER4_BITMAP_BLOCKS_STATUS_VALUE  0x8000000000000000ULL
+
+/* specification how block allocation was counted in sb block counters */
+typedef enum {
+	BLOCK_NOT_COUNTED = 0,	/* reiser4 has no info about this block yet */
+	BLOCK_GRABBED = 1,	/* free space grabbed for further allocation
+				   of this block */
+	BLOCK_FLUSH_RESERVED = 2,	/* block is reserved for flush needs. */
+	BLOCK_UNALLOCATED = 3,	/* block is used for existing in-memory object
+				   ( unallocated formatted or unformatted
+				   node) */
+	BLOCK_ALLOCATED = 4	/* block is mapped to disk, real on-disk block
+				   number assigned */
+} block_stage_t;
+
+/* a hint for block allocator */
+struct reiser4_blocknr_hint {
+	/* FIXME: I think we want to add a longterm lock on the bitmap block here.  This
+	   is to prevent jnode_flush() calls from interleaving allocations on the same
+	   bitmap, once a hint is established. */
+
+	/* search start hint */
+	reiser4_block_nr blk;
+	/* if not zero, it is a region size we search for free blocks in */
+	reiser4_block_nr max_dist;
+	/* level for allocation, may be useful have branch-level and higher
+	   write-optimized. */
+	tree_level level;
+	/* block allocator assumes that blocks, which will be mapped to disk,
+	   are in this specified block_stage */
+	block_stage_t block_stage;
+	/* If direction = 1 allocate blocks in backward direction from the end
+	 * of disk to the beginning of disk.  */
+	unsigned int backward:1;
+
+};
+
+/* These flags control block allocation/deallocation behavior */
+enum reiser4_ba_flags {
+	/* do allocatations from reserved (5%) area */
+	BA_RESERVED = (1 << 0),
+
+	/* block allocator can do commit trying to recover free space */
+	BA_CAN_COMMIT = (1 << 1),
+
+	/* if operation will be applied to formatted block */
+	BA_FORMATTED = (1 << 2),
+
+	/* defer actual block freeing until transaction commit */
+	BA_DEFER = (1 << 3),
+
+	/* allocate blocks for permanent fs objects (formatted or unformatted), not
+	   wandered of log blocks */
+	BA_PERMANENT = (1 << 4),
+
+	/* grab space even it was disabled */
+	BA_FORCE = (1 << 5),
+
+	/* use default start value for free blocks search. */
+	BA_USE_DEFAULT_SEARCH_START = (1 << 6)
+};
+
+typedef enum reiser4_ba_flags reiser4_ba_flags_t;
+
+extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
+extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
+extern void update_blocknr_hint_default(const struct super_block *,
+					const reiser4_block_nr *);
+extern void get_blocknr_hint_default(reiser4_block_nr *);
+
+extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
+
+int assign_fake_blocknr_formatted(reiser4_block_nr *);
+reiser4_block_nr fake_blocknr_unformatted(int);
+
+/* free -> grabbed -> fake_allocated -> used */
+
+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
+void all_grabbed2free(void);
+void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
+void grabbed2flush_reserved(__u64 count);
+int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
+			 reiser4_block_nr * start,
+			 reiser4_block_nr * len, reiser4_ba_flags_t flags);
+int reiser4_dealloc_blocks(const reiser4_block_nr *,
+			   const reiser4_block_nr *,
+			   block_stage_t, reiser4_ba_flags_t flags);
+
+static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
+				      reiser4_block_nr * start,
+				      reiser4_ba_flags_t flags)
+{
+	reiser4_block_nr one = 1;
+	return reiser4_alloc_blocks(hint, start, &one, flags);
+}
+
+static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
+					block_stage_t stage,
+					reiser4_ba_flags_t flags)
+{
+	const reiser4_block_nr one = 1;
+	return reiser4_dealloc_blocks(block, &one, stage, flags);
+}
+
+#define reiser4_grab_space_force(count, flags)		\
+	reiser4_grab_space(count, flags | BA_FORCE)
+
+extern void grabbed2free_mark(__u64 mark);
+extern int reiser4_grab_reserved(struct super_block *,
+				 __u64, reiser4_ba_flags_t);
+extern void reiser4_release_reserved(struct super_block *super);
+
+/* grabbed -> fake_allocated */
+
+/* fake_allocated -> used */
+
+/* used -> fake_allocated -> grabbed -> free */
+
+extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
+
+extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
+
+extern void grabbed2cluster_reserved(int count);
+extern void cluster_reserved2grabbed(int count);
+extern void cluster_reserved2free(int count);
+
+extern int reiser4_check_block_counters(const struct super_block *);
+
+#if REISER4_DEBUG
+
+extern void reiser4_check_block(const reiser4_block_nr *, int);
+
+#else
+
+#  define reiser4_check_block(beg, val)        noop
+
+#endif
+
+extern int reiser4_pre_commit_hook(void);
+extern void reiser4_post_commit_hook(void);
+extern void reiser4_post_write_back_hook(void);
+
+#endif				/* __FS_REISER4_BLOCK_ALLOC_H__ */
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/blocknrset.c linux-2.6.22/fs/reiser4/blocknrset.c
--- linux-2.6.22.orig/fs/reiser4/blocknrset.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/blocknrset.c	2007-07-29 00:25:34.820681982 +0400
@@ -0,0 +1,368 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* This file contains code for various block number sets used by the atom to
+   track the deleted set and wandered block mappings. */
+
+#include "debug.h"
+#include "dformat.h"
+#include "txnmgr.h"
+#include "context.h"
+
+#include <linux/slab.h>
+
+/* The proposed data structure for storing unordered block number sets is a
+   list of elements, each of which contains an array of block number or/and
+   array of block number pairs. That element called blocknr_set_entry is used
+   to store block numbers from the beginning and for extents from the end of
+   the data field (char data[...]). The ->nr_blocks and ->nr_pairs fields
+   count numbers of blocks and extents.
+
+   +------------------- blocknr_set_entry->data ------------------+
+   |block1|block2| ... <free space> ... |pair3|pair2|pair1|
+   +------------------------------------------------------------+
+
+   When current blocknr_set_entry is full, allocate a new one. */
+
+/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete
+ * set (single blocks and block extents), in that case blocknr pair represent an
+ * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs
+ * there represent a (real block) -> (wandered block) mapping. */
+
+/* Protection: blocknr sets belong to reiser4 atom, and
+ * their modifications are performed with the atom lock held */
+
+/* The total size of a blocknr_set_entry. */
+#define BLOCKNR_SET_ENTRY_SIZE 128
+
+/* The number of blocks that can fit the blocknr data area. */
+#define BLOCKNR_SET_ENTRIES_NUMBER		\
+       ((BLOCKNR_SET_ENTRY_SIZE -		\
+         2 * sizeof (unsigned) -		\
+         sizeof(struct list_head)) /		\
+        sizeof(reiser4_block_nr))
+
+/* An entry of the blocknr_set */
+struct blocknr_set_entry {
+	unsigned nr_singles;
+	unsigned nr_pairs;
+	struct list_head link;
+	reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
+};
+
+/* A pair of blocks as recorded in the blocknr_set_entry data. */
+struct blocknr_pair {
+	reiser4_block_nr a;
+	reiser4_block_nr b;
+};
+
+/* Return the number of blocknr slots available in a blocknr_set_entry. */
+/* Audited by: green(2002.06.11) */
+static unsigned bse_avail(blocknr_set_entry * bse)
+{
+	unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
+
+	assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
+	cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
+
+	return BLOCKNR_SET_ENTRIES_NUMBER - used;
+}
+
+/* Initialize a blocknr_set_entry. */
+static void bse_init(blocknr_set_entry *bse)
+{
+	bse->nr_singles = 0;
+	bse->nr_pairs = 0;
+	INIT_LIST_HEAD(&bse->link);
+}
+
+/* Allocate and initialize a blocknr_set_entry. */
+/* Audited by: green(2002.06.11) */
+static blocknr_set_entry *bse_alloc(void)
+{
+	blocknr_set_entry *e;
+
+	if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
+					   reiser4_ctx_gfp_mask_get())) == NULL)
+		return NULL;
+
+	bse_init(e);
+
+	return e;
+}
+
+/* Free a blocknr_set_entry. */
+/* Audited by: green(2002.06.11) */
+static void bse_free(blocknr_set_entry * bse)
+{
+	kfree(bse);
+}
+
+/* Add a block number to a blocknr_set_entry */
+/* Audited by: green(2002.06.11) */
+static void
+bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
+{
+	assert("jmacd-5099", bse_avail(bse) >= 1);
+
+	bse->entries[bse->nr_singles++] = *block;
+}
+
+/* Get a pair of block numbers */
+/* Audited by: green(2002.06.11) */
+static inline struct blocknr_pair *bse_get_pair(blocknr_set_entry * bse,
+						unsigned pno)
+{
+	assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
+
+	return (struct blocknr_pair *) (bse->entries +
+					BLOCKNR_SET_ENTRIES_NUMBER -
+					2 * (pno + 1));
+}
+
+/* Add a pair of block numbers to a blocknr_set_entry */
+/* Audited by: green(2002.06.11) */
+static void
+bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
+	     const reiser4_block_nr * b)
+{
+	struct blocknr_pair *pair;
+
+	assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
+
+	pair = bse_get_pair(bse, bse->nr_pairs++);
+
+	pair->a = *a;
+	pair->b = *b;
+}
+
+/* Add either a block or pair of blocks to the block number set.  The first
+   blocknr (@a) must be non-NULL.  If @b is NULL a single blocknr is added, if
+   @b is non-NULL a pair is added.  The block number set belongs to atom, and
+   the call is made with the atom lock held.  There may not be enough space in
+   the current blocknr_set_entry.  If new_bsep points to a non-NULL
+   blocknr_set_entry then it will be added to the blocknr_set and new_bsep
+   will be set to NULL.  If new_bsep contains NULL then the atom lock will be
+   released and a new bse will be allocated in new_bsep.  E_REPEAT will be
+   returned with the atom unlocked for the operation to be tried again.  If
+   the operation succeeds, 0 is returned.  If new_bsep is non-NULL and not
+   used during the call, it will be freed automatically. */
+static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
+			   blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
+			   const reiser4_block_nr *b)
+{
+	blocknr_set_entry *bse;
+	unsigned entries_needed;
+
+	assert("jmacd-5101", a != NULL);
+
+	entries_needed = (b == NULL) ? 1 : 2;
+	if (list_empty(bset) ||
+	    bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
+		/* See if a bse was previously allocated. */
+		if (*new_bsep == NULL) {
+			spin_unlock_atom(atom);
+			*new_bsep = bse_alloc();
+			return (*new_bsep != NULL) ? -E_REPEAT :
+				RETERR(-ENOMEM);
+		}
+
+		/* Put it on the head of the list. */
+		list_add(&((*new_bsep)->link), bset);
+
+		*new_bsep = NULL;
+	}
+
+	/* Add the single or pair. */
+	bse = list_entry(bset->next, blocknr_set_entry, link);
+	if (b == NULL) {
+		bse_put_single(bse, a);
+	} else {
+		bse_put_pair(bse, a, b);
+	}
+
+	/* If new_bsep is non-NULL then there was an allocation race, free this copy. */
+	if (*new_bsep != NULL) {
+		bse_free(*new_bsep);
+		*new_bsep = NULL;
+	}
+
+	return 0;
+}
+
+/* Add an extent to the block set.  If the length is 1, it is treated as a
+   single block (e.g., reiser4_set_add_block). */
+/* Audited by: green(2002.06.11) */
+/* Auditor note: Entire call chain cannot hold any spinlocks, because
+   kmalloc might schedule. The only exception is atom spinlock, which is
+   properly freed. */
+int
+blocknr_set_add_extent(txn_atom * atom,
+		       struct list_head * bset,
+		       blocknr_set_entry ** new_bsep,
+		       const reiser4_block_nr * start,
+		       const reiser4_block_nr * len)
+{
+	assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
+	return blocknr_set_add(atom, bset, new_bsep, start,
+			       *len == 1 ? NULL : len);
+}
+
+/* Add a block pair to the block set. It adds exactly a pair, which is checked
+ * by an assertion that both arguments are not null.*/
+/* Audited by: green(2002.06.11) */
+/* Auditor note: Entire call chain cannot hold any spinlocks, because
+   kmalloc might schedule. The only exception is atom spinlock, which is
+   properly freed. */
+int
+blocknr_set_add_pair(txn_atom * atom,
+		     struct list_head * bset,
+		     blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
+		     const reiser4_block_nr * b)
+{
+	assert("jmacd-5103", a != NULL && b != NULL);
+	return blocknr_set_add(atom, bset, new_bsep, a, b);
+}
+
+/* Initialize a blocknr_set. */
+void blocknr_set_init(struct list_head *bset)
+{
+	INIT_LIST_HEAD(bset);
+}
+
+/* Release the entries of a blocknr_set. */
+void blocknr_set_destroy(struct list_head *bset)
+{
+	blocknr_set_entry *bse;
+
+	while (!list_empty(bset)) {
+		bse = list_entry(bset->next, blocknr_set_entry, link);
+		list_del_init(&bse->link);
+		bse_free(bse);
+	}
+}
+
+/* Merge blocknr_set entries out of @from into @into. */
+/* Audited by: green(2002.06.11) */
+/* Auditor comments: This merge does not know if merged sets contain
+   blocks pairs (As for wandered sets) or extents, so it cannot really merge
+   overlapping ranges if there is some. So I believe it may lead to
+   some blocks being presented several times in one blocknr_set. To help
+   debugging such problems it might help to check for duplicate entries on
+   actual processing of this set. Testing this kind of stuff right here is
+   also complicated by the fact that these sets are not sorted and going
+   through whole set on each element addition is going to be CPU-heavy task */
+void blocknr_set_merge(struct list_head * from, struct list_head * into)
+{
+	blocknr_set_entry *bse_into = NULL;
+
+	/* If @from is empty, no work to perform. */
+	if (list_empty(from))
+		return;
+	/* If @into is not empty, try merging partial-entries. */
+	if (!list_empty(into)) {
+
+		/* Neither set is empty, pop the front to members and try to combine them. */
+		blocknr_set_entry *bse_from;
+		unsigned into_avail;
+
+		bse_into = list_entry(into->next, blocknr_set_entry, link);
+		list_del_init(&bse_into->link);
+		bse_from = list_entry(from->next, blocknr_set_entry, link);
+		list_del_init(&bse_from->link);
+
+		/* Combine singles. */
+		for (into_avail = bse_avail(bse_into);
+		     into_avail != 0 && bse_from->nr_singles != 0;
+		     into_avail -= 1) {
+			bse_put_single(bse_into,
+				       &bse_from->entries[--bse_from->
+							  nr_singles]);
+		}
+
+		/* Combine pairs. */
+		for (; into_avail > 1 && bse_from->nr_pairs != 0;
+		     into_avail -= 2) {
+			struct blocknr_pair *pair =
+				bse_get_pair(bse_from, --bse_from->nr_pairs);
+			bse_put_pair(bse_into, &pair->a, &pair->b);
+		}
+
+		/* If bse_from is empty, delete it now. */
+		if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
+			bse_free(bse_from);
+		} else {
+			/* Otherwise, bse_into is full or nearly full (e.g.,
+			   it could have one slot avail and bse_from has one
+			   pair left).  Push it back onto the list.  bse_from
+			   becomes bse_into, which will be the new partial. */
+			list_add(&bse_into->link, into);
+			bse_into = bse_from;
+		}
+	}
+
+	/* Splice lists together. */
+	list_splice_init(from, into->prev);
+
+	/* Add the partial entry back to the head of the list. */
+	if (bse_into != NULL)
+		list_add(&bse_into->link, into);
+}
+
+/* Iterate over all blocknr set elements. */
+int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
+			 blocknr_set_actor_f actor, void *data, int delete)
+{
+
+	blocknr_set_entry *entry;
+
+	assert("zam-429", atom != NULL);
+	assert("zam-430", atom_is_protected(atom));
+	assert("zam-431", bset != 0);
+	assert("zam-432", actor != NULL);
+
+	entry = list_entry(bset->next, blocknr_set_entry, link);
+	while (bset != &entry->link) {
+		blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
+		unsigned int i;
+		int ret;
+
+		for (i = 0; i < entry->nr_singles; i++) {
+			ret = actor(atom, &entry->entries[i], NULL, data);
+
+			/* We can't break a loop if delete flag is set. */
+			if (ret != 0 && !delete)
+				return ret;
+		}
+
+		for (i = 0; i < entry->nr_pairs; i++) {
+			struct blocknr_pair *ab;
+
+			ab = bse_get_pair(entry, i);
+
+			ret = actor(atom, &ab->a, &ab->b, data);
+
+			if (ret != 0 && !delete)
+				return ret;
+		}
+
+		if (delete) {
+			list_del(&entry->link);
+			bse_free(entry);
+		}
+
+		entry = tmp;
+	}
+
+	return 0;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN linux-2.6.22.orig/fs/reiser4/carry.c linux-2.6.22/fs/reiser4/carry.c
--- linux-2.6.22.orig/fs/reiser4/carry.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/carry.c	2007-07-29 00:25:34.820681982 +0400
@@ -0,0 +1,1391 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+/* Functions to "carry" tree modification(s) upward. */
+/* Tree is modified one level at a time. As we modify a level we accumulate a
+   set of changes that need to be propagated to the next level.  We manage
+   node locking such that any searches that collide with carrying are
+   restarted, from the root if necessary.
+
+   Insertion of a new item may result in items being moved among nodes and
+   this requires the delimiting key to be updated at the least common parent
+   of the nodes modified to preserve search tree invariants. Also, insertion
+   may require allocation of a new node. A pointer to the new node has to be
+   inserted into some node on the parent level, etc.
+
+   Tree carrying is meant to be analogous to arithmetic carrying.
+
+   A carry operation is always associated with some node (&carry_node).
+
+   Carry process starts with some initial set of operations to be performed
+   and an initial set of already locked nodes.  Operations are performed one
+   by one. Performing each single operation has following possible effects:
+
+    - content of carry node associated with operation is modified
+    - new carry nodes are locked and involved into carry process on this level
+    - new carry operations are posted to the next level
+
+   After all carry operations on this level are done, process is repeated for
+   the accumulated sequence on carry operations for the next level. This
+   starts by trying to lock (in left to right order) all carry nodes
+   associated with carry operations on the parent level. After this, we decide
+   whether more nodes are required on the left of already locked set. If so,
+   all locks taken on the parent level are released, new carry nodes are
+   added, and locking process repeats.
+
+   It may happen that balancing process fails owing to unrecoverable error on
+   some of upper levels of a tree (possible causes are io error, failure to
+   allocate new node, etc.). In this case we should unmount the filesystem,
+   rebooting if it is the root, and possibly advise the use of fsck.
+
+   USAGE:
+
+    int some_tree_operation( znode *node, ... )
+    {
+       // Allocate on a stack pool of carry objects: operations and nodes.
+       // Most carry processes will only take objects from here, without
+       // dynamic allocation.
+
+I feel uneasy about this pool.  It adds to code complexity, I understand why it exists, but.... -Hans
+
+       carry_pool  pool;
+       carry_level lowest_level;
+       carry_op   *op;
+
+       init_carry_pool( &pool );
+       init_carry_level( &lowest_level, &pool );
+
+       // operation may be one of:
+       //   COP_INSERT    --- insert new item into node
+       //   COP_CUT       --- remove part of or whole node
+       //   COP_PASTE     --- increase size of item
+       //   COP_DELETE    --- delete pointer from parent node
+       //   COP_UPDATE    --- update delimiting key in least
+       //                     common ancestor of two
+
+       op = reiser4_post_carry( &lowest_level, operation, node, 0 );
+       if( IS_ERR( op ) || ( op == NULL ) ) {
+           handle error
+       } else {
+           // fill in remaining fields in @op, according to carry.h:carry_op
+           result = carry( &lowest_level, NULL );
+       }
+       done_carry_pool( &pool );
+    }
+
+   When you are implementing node plugin method that participates in carry
+   (shifting, insertion, deletion, etc.), do the following:
+
+   int foo_node_method( znode *node, ..., carry_level *todo )
+   {
+       carry_op   *op;
+
+       ....
+
+       // note, that last argument to reiser4_post_carry() is non-null
+       // here, because @op is to be applied to the parent of @node, rather
+       // than to the @node itself as in the previous case.
+
+       op = node_post_carry( todo, operation, node, 1 );
+       // fill in remaining fields in @op, according to carry.h:carry_op
+
+       ....
+
+   }
+
+   BATCHING:
+
+   One of the main advantages of level-by-level balancing implemented here is
+   ability to batch updates on a parent level and to peform them more
+   efficiently as a result.
+
+   Description To Be Done (TBD).
+
+   DIFFICULTIES AND SUBTLE POINTS:
+
+   1. complex plumbing is required, because:
+
+       a. effective allocation through pools is needed
+
+       b. target of operation is not exactly known when operation is
+       posted. This is worked around through bitfields in &carry_node and
+       logic in lock_carry_node()
+
+       c. of interaction with locking code: node should be added into sibling
+       list when pointer to it is inserted into its parent, which is some time
+       after node was created. Between these moments, node is somewhat in
+       suspended state and is only registered in the carry lists
+
+    2. whole balancing logic is implemented here, in particular, insertion
+    logic is coded in make_space().
+
+    3. special cases like insertion (reiser4_add_tree_root()) or deletion
+    (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
+    (insert_paste()) have to be handled.
+
+    4. there is non-trivial interdependency between allocation of new nodes
+    and almost everything else. This is mainly due to the (1.c) above. I shall
+    write about this later.
+
+*/
+
+#include "forward.h"
+#include "debug.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/item/extent.h"
+#include "plugin/node/node.h"
+#include "jnode.h"
+#include "znode.h"
+#include "tree_mod.h"
+#include "tree_walk.h"
+#include "block_alloc.h"
+#include "pool.h"
+#include "tree.h"
+#include "carry.h"
+#include "carry_ops.h"
+#include "super.h"
+#include "reiser4.h"
+
+#include <linux/types.h>
+
+/* level locking/unlocking */
+static int lock_carry_level(carry_level * level);
+static void unlock_carry_level(carry_level * level, int failure);
+static void done_carry_level(carry_level * level);
+static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
+
+int lock_carry_node(carry_level * level, carry_node * node);
+int lock_carry_node_tail(carry_node * node);
+
+/* carry processing proper */
+static int carry_on_level(carry_level * doing, carry_level * todo);
+
+static carry_op *add_op(carry_level * level, pool_ordering order,
+			carry_op * reference);
+
+/* handlers for carry operations. */
+
+static void fatal_carry_error(carry_level * doing, int ecode);
+static int add_new_root(carry_level * level, carry_node * node, znode * fake);
+
+static void print_level(const char *prefix, carry_level * level);
+
+#if REISER4_DEBUG
+typedef enum {
+	CARRY_TODO,
+	CARRY_DOING
+} carry_queue_state;
+static int carry_level_invariant(carry_level * level, carry_queue_state state);
+#endif
+
+/* main entry point for tree balancing.
+
+   Tree carry performs operations from @doing and while doing so accumulates
+   information about operations to be performed on the next level ("carried"
+   to the parent level). Carried operations are performed, causing possibly
+   more operations to be carried upward etc. carry() takes care about
+   locking and pinning znodes while operating on them.
+
+   For usage, see comment at the top of fs/reiser4/carry.c
+
+*/
+int reiser4_carry(carry_level * doing /* set of carry operations to be
+				       * performed */ ,
+		  carry_level * done  /* set of nodes, already performed
+				       *  at the previous level.
+				       * NULL in most cases */)
+{
+	int result = 0;
+	/* queue of new requests */
+	carry_level *todo;
+	ON_DEBUG(STORE_COUNTERS);
+
+	assert("nikita-888", doing != NULL);
+	BUG_ON(done != NULL);
+
+	todo = doing + 1;
+	init_carry_level(todo, doing->pool);
+
+	/* queue of requests preformed on the previous level */
+	done = todo + 1;
+	init_carry_level(done, doing->pool);
+
+	/* iterate until there is nothing more to do */
+	while (result == 0 && doing->ops_num > 0) {
+		carry_level *tmp;
+
+		/* at this point @done is locked. */
+		/* repeat lock/do/unlock while
+
+		   (1) lock_carry_level() fails due to deadlock avoidance, or
+
+		   (2) carry_on_level() decides that more nodes have to
+		   be involved.
+
+		   (3) some unexpected error occurred while balancing on the
+		   upper levels. In this case all changes are rolled back.
+
+		 */
+		while (1) {
+			result = lock_carry_level(doing);
+			if (result == 0) {
+				/* perform operations from @doing and
+				   accumulate new requests in @todo */
+				result = carry_on_level(doing, todo);
+				if (result == 0)
+					break;
+				else if (result != -E_REPEAT ||
+					 !doing->restartable) {
+					warning("nikita-1043",
+						"Fatal error during carry: %i",
+						result);
+					print_level("done", done);
+					print_level("doing", doing);
+					print_level("todo", todo);
+					/* do some rough stuff like aborting
+					   all pending transcrashes and thus
+					   pushing tree back to the consistent
+					   state. Alternatvely, just panic.
+					 */
+					fatal_carry_error(doing, result);
+					return result;
+				}
+			} else if (result != -E_REPEAT) {
+				fatal_carry_error(doing, result);
+				return result;
+			}
+			unlock_carry_level(doing, 1);
+		}
+		/* at this point @done can be safely unlocked */
+		done_carry_level(done);
+
+		/* cyclically shift queues */
+		tmp = done;
+		done = doing;
+		doing = todo;
+		todo = tmp;
+		init_carry_level(todo, doing->pool);
+
+		/* give other threads chance to run */
+		reiser4_preempt_point();
+	}
+	done_carry_level(done);
+
+	/* all counters, but x_refs should remain the same. x_refs can change
+	   owing to transaction manager */
+	ON_DEBUG(CHECK_COUNTERS);
+	return result;
+}
+
+/* perform carry operations on given level.
+
+   Optimizations proposed by pooh:
+
+   (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
+   required;
+
+   (2) unlock node if there are no more operations to be performed upon it and
+   node didn't add any operation to @todo. This can be implemented by
+   attaching to each node two counters: counter of operaions working on this
+   node and counter and operations carried upward from this node.
+
+*/
+static int carry_on_level(carry_level * doing	/* queue of carry operations to
+						 * do on this level */ ,
+			  carry_level * todo	/* queue where new carry
+						 * operations to be performed on
+						 * the * parent level are
+						 * accumulated during @doing
+						 * processing. */ )
+{
+	int result;
+	int (*f) (carry_op *, carry_level *, carry_level *);
+	carry_op *op;
+	carry_op *tmp_op;
+
+	assert("nikita-1034", doing != NULL);
+	assert("nikita-1035", todo != NULL);
+
+	/* @doing->nodes are locked. */
+
+	/* This function can be split into two phases: analysis and modification.
+
+	   Analysis calculates precisely what items should be moved between
+	   nodes. This information is gathered in some structures attached to
+	   each carry_node in a @doing queue. Analysis also determines whether
+	   new nodes are to be allocated etc.
+
+	   After analysis is completed, actual modification is performed. Here
+	   we can take advantage of "batch modification": if there are several
+	   operations acting on the same node, modifications can be performed
+	   more efficiently when batched together.
+
+	   Above is an optimization left for the future.
+	 */
+	/* Important, but delayed optimization: it's possible to batch
+	   operations together and perform them more efficiently as a
+	   result. For example, deletion of several neighboring items from a
+	   node can be converted to a single ->cut() operation.
+
+	   Before processing queue, it should be scanned and "mergeable"
+	   operations merged.
+	 */
+	result = 0;
+	for_all_ops(doing, op, tmp_op) {
+		carry_opcode opcode;
+
+		assert("nikita-1041", op != NULL);
+		opcode = op->op;
+		assert("nikita-1042", op->op < COP_LAST_OP);
+		f = op_dispatch_table[op->op].handler;
+		result = f(op, doing, todo);
+		/* locking can fail with -E_REPEAT. Any different error is fatal
+		   and will be handled by fatal_carry_error() sledgehammer.
+		 */
+		if (result != 0)
+			break;
+	}
+	if (result == 0) {
+		carry_plugin_info info;
+		carry_node *scan;
+		carry_node *tmp_scan;
+
+		info.doing = doing;
+		info.todo = todo;
+
+		assert("nikita-3002",
+		       carry_level_invariant(doing, CARRY_DOING));
+		for_all_nodes(doing, scan, tmp_scan) {
+			znode *node;
+
+			node = reiser4_carry_real(scan);
+			assert("nikita-2547", node != NULL);
+			if (node_is_empty(node)) {
+				result =
+				    node_plugin_by_node(node)->
+				    prepare_removal(node, &info);
+				if (result != 0)
+					break;
+			}
+		}
+	}
+	return result;
+}
+
+/* post carry operation
+
+   This is main function used by external carry clients: node layout plugins
+   and tree operations to create new carry operation to be performed on some
+   level.
+
+   New operation will be included in the @level queue. To actually perform it,
+   call carry( level, ... ). This function takes write lock on @node. Carry
+   manages all its locks by itself, don't worry about this.
+
+   This function adds operation and node at the end of the queue. It is up to
+   caller to guarantee proper ordering of node queue.
+
+*/
+carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
+						   * is to be posted at */ ,
+			      carry_opcode op /* opcode of operation */ ,
+			      znode * node	/* node on which this operation
+						 * will operate */ ,
+			      int apply_to_parent_p /* whether operation will
+						     * operate directly on @node
+						     * or on it parent. */)
+{
+	carry_op *result;
+	carry_node *child;
+
+	assert("nikita-1046", level != NULL);
+	assert("nikita-1788", znode_is_write_locked(node));
+
+	result = add_op(level, POOLO_LAST, NULL);
+	if (IS_ERR(result))
+		return result;
+	child = reiser4_add_carry(level, POOLO_LAST, NULL);
+	if (IS_ERR(child)) {
+		reiser4_pool_free(&level->pool->op_pool, &result->header);
+		return (carry_op *) child;
+	}
+	result->node = child;
+	result->op = op;
+	child->parent = apply_to_parent_p;
+	if (ZF_ISSET(node, JNODE_ORPHAN))
+		child->left_before = 1;
+	child->node = node;
+	return result;
+}
+
+/* initialize carry queue */
+void init_carry_level(carry_level * level /* level to initialize */ ,
+		      carry_pool * pool	/* pool @level will allocate objects
+					 * from */ )
+{
+	assert("nikita-1045", level != NULL);
+	assert("nikita-967", pool != NULL);
+
+	memset(level, 0, sizeof *level);
+	level->pool = pool;
+
+	INIT_LIST_HEAD(&level->nodes);
+	INIT_LIST_HEAD(&level->ops);
+}
+
+/* allocate carry pool and initialize pools within queue */
+carry_pool *init_carry_pool(int size)
+{
+	carry_pool *pool;
+
+	assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
+	pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
+	if (pool == NULL)
+		return ERR_PTR(RETERR(-ENOMEM));
+
+	reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
+			  (char *)pool->op);
+	reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
+			  NODES_LOCKED_POOL_SIZE, (char *)pool->node);
+	return pool;
+}
+
+/* finish with queue pools */
+void done_carry_pool(carry_pool * pool /* pool to destroy */ )
+{
+	reiser4_done_pool(&pool->op_pool);
+	reiser4_done_pool(&pool->node_pool);
+	kfree(pool);
+}
+
+/* add new carry node to the @level.
+
+   Returns pointer to the new carry node allocated from pool.  It's up to
+   callers to maintain proper order in the @level. Assumption is that if carry
+   nodes on one level are already sorted and modifications are peroformed from
+   left to right, carry nodes added on the parent level will be ordered
+   automatically. To control ordering use @order and @reference parameters.
+
+*/
+carry_node *reiser4_add_carry_skip(carry_level * level	/* &carry_level to add
+							 * node to */ ,
+				   pool_ordering order	/* where to insert:
+							 * at the beginning of
+							 * @level,
+							 * before @reference,
+							 * after @reference,
+							 * at the end of @level
+							 */ ,
+				   carry_node * reference/* reference node for
+							  * insertion */)
+{
+	ON_DEBUG(carry_node * orig_ref = reference);
+
+	if (order == POOLO_BEFORE) {
+		reference = find_left_carry(reference, level);
+		if (reference == NULL)
+			reference = list_entry(level->nodes.next, carry_node,
+					       header.level_linkage);
+		else
+			reference = list_entry(reference->header.level_linkage.next,
+					       carry_node, header.level_linkage);
+	} else if (order == POOLO_AFTER) {
+		reference = find_right_carry(reference, level);
+		if (reference == NULL)
+			reference = list_entry(level->nodes.prev, carry_node,
+					       header.level_linkage);
+		else
+			reference = list_entry(reference->header.level_linkage.prev,
+					       carry_node, header.level_linkage);
+	}
+	assert("nikita-2209",
+	       ergo(orig_ref != NULL,
+		    reiser4_carry_real(reference) ==
+		    reiser4_carry_real(orig_ref)));
+	return reiser4_add_carry(level, order, reference);
+}
+
+carry_node *reiser4_add_carry(carry_level * level	/* &carry_level to add node
+						 * to */ ,
+		      pool_ordering order	/* where to insert: at the
+						 * beginning of @level, before
+						 * @reference, after @reference,
+						 * at the end of @level */ ,
+		      carry_node * reference	/* reference node for
+						 * insertion */ )
+{
+	carry_node *result;
+
+	result =
+	    (carry_node *) reiser4_add_obj(&level->pool->node_pool,
+					   &level->nodes,
+					   order, &reference->header);
+	if (!IS_ERR(result) && (result != NULL))
+		++level->nodes_num;
+	return result;
+}
+
+/* add new carry operation to the @level.
+
+   Returns pointer to the new carry operations allocated from pool. It's up to
+   callers to maintain proper order in the @level. To control ordering use
+   @order and @reference parameters.
+
+*/
+static carry_op *add_op(carry_level * level /* &carry_level to add node to */ ,
+			pool_ordering order	/* where to insert: at the beginning of
+						 * @level, before @reference, after
+						 * @reference, at the end of @level */ ,
+			carry_op *
+			reference /* reference node for insertion */ )
+{
+	carry_op *result;
+
+	result =
+	    (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
+					 order, &reference->header);
+	if (!IS_ERR(result) && (result != NULL))
+		++level->ops_num;
+	return result;
+}
+
+/* Return node on the right of which @node was created.
+
+   Each node is created on the right of some existing node (or it is new root,
+   which is special case not handled here).
+
+   @node is new node created on some level, but not yet inserted into its
+   parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
+
+*/
+static carry_node *find_begetting_brother(carry_node * node	/* node to start search
+								 * from */ ,
+					  carry_level * kin UNUSED_ARG	/* level to
+									 * scan */ )
+{
+	carry_node *scan;
+
+	assert("nikita-1614", node != NULL);
+	assert("nikita-1615", kin != NULL);
+	assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
+	assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
+				   ZF_ISSET(reiser4_carry_real(node),
+					    JNODE_ORPHAN)));
+	for (scan = node;;
+	     scan = list_entry(scan->header.level_linkage.prev, carry_node,
+			       header.level_linkage)) {
+		assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
+		if ((scan->node != node->node) &&
+		    !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
+			assert("nikita-1618", reiser4_carry_real(scan) != NULL);
+			break;
+		}
+	}
+	return scan;
+}
+
+static cmp_t
+carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
+{
+	assert("nikita-2199", n1 != NULL);
+	assert("nikita-2200", n2 != NULL);
+
+	if (n1 == n2)
+		return EQUAL_TO;
+	while (1) {
+		n1 = carry_node_next(n1);
+		if (carry_node_end(level, n1))
+			return GREATER_THAN;
+		if (n1 == n2)
+			return LESS_THAN;
+	}
+	impossible("nikita-2201", "End of level reached");
+}
+
+carry_node *find_carry_node(carry_level * level, const znode * node)
+{
+	carry_node *scan;
+	carry_node *tmp_scan;
+
+	assert("nikita-2202", level != NULL);
+	assert("nikita-2203", node != NULL);
+
+	for_all_nodes(level, scan, tmp_scan) {
+		if (reiser4_carry_real(scan) == node)
+			return scan;
+	}
+	return NULL;
+}
+
+znode *reiser4_carry_real(const carry_node * node)
+{
+	assert("nikita-3061", node != NULL);
+
+	return node->lock_handle.node;
+}
+
+carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
+			      const znode * node)
+{
+	carry_node *base;
+	carry_node *scan;
+	carry_node *tmp_scan;
+	carry_node *proj;
+
+	base = find_carry_node(doing, node);
+	assert("nikita-2204", base != NULL);
+
+	for_all_nodes(todo, scan, tmp_scan) {
+		proj = find_carry_node(doing, scan->node);
+		assert("nikita-2205", proj != NULL);
+		if (carry_node_cmp(doing, proj, base) != LESS_THAN)
+			break;
+	}
+	return scan;
+}
+
+static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
+				     znode * node)
+{
+	carry_node *reference;
+
+	assert("nikita-2994", doing != NULL);
+	assert("nikita-2995", todo != NULL);
+	assert("nikita-2996", node != NULL);
+
+	reference = insert_carry_node(doing, todo, node);
+	assert("nikita-2997", reference != NULL);
+
+	return reiser4_add_carry(todo, POOLO_BEFORE, reference);
+}
+
+/* like reiser4_post_carry(), but designed to be called from node plugin methods.
+   This function is different from reiser4_post_carry() in that it finds proper
+   place to insert node in the queue. */
+carry_op *node_post_carry(carry_plugin_info * info	/* carry parameters
+							 * passed down to node
+							 * plugin */ ,
+			  carry_opcode op /* opcode of operation */ ,
+			  znode * node	/* node on which this
+					 * operation will operate */ ,
+			  int apply_to_parent_p	/* whether operation will
+						 * operate directly on @node
+						 * or on it parent. */ )
+{
+	carry_op *result;
+	carry_node *child;
+
+	assert("nikita-2207", info != NULL);
+	assert("nikita-2208", info->todo != NULL);
+
+	if (info->doing == NULL)
+		return reiser4_post_carry(info->todo, op, node,
+					  apply_to_parent_p);
+
+	result = add_op(info->todo, POOLO_LAST, NULL);
+	if (IS_ERR(result))
+		return result;
+	child = add_carry_atplace(info->doing, info->todo, node);
+	if (IS_ERR(child)) {
+		reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
+		return (carry_op *) child;
+	}
+	result->node = child;
+	result->op = op;
+	child->parent = apply_to_parent_p;
+	if (ZF_ISSET(node, JNODE_ORPHAN))
+		child->left_before = 1;
+	child->node = node;
+	return result;
+}
+
+/* lock all carry nodes in @level */
+static int lock_carry_level(carry_level * level /* level to lock */ )
+{
+	int result;
+	carry_node *node;
+	carry_node *tmp_node;
+
+	assert("nikita-881", level != NULL);
+	assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
+
+	/* lock nodes from left to right */
+	result = 0;
+	for_all_nodes(level, node, tmp_node) {
+		result = lock_carry_node(level, node);
+		if (result != 0)
+			break;
+	}
+	return result;
+}
+
+/* Synchronize delimiting keys between @node and its left neighbor.
+
+   To reduce contention on dk key and simplify carry code, we synchronize
+   delimiting keys only when carry ultimately leaves tree level (carrying
+   changes upward) and unlocks nodes at this level.
+
+   This function first finds left neighbor of @node and then updates left
+   neighbor's right delimiting key to conincide with least key in @node.
+
+*/
+
+ON_DEBUG(extern atomic_t delim_key_version;
+    )
+
+static void sync_dkeys(znode * spot /* node to update */ )
+{
+	reiser4_key pivot;
+	reiser4_tree *tree;
+
+	assert("nikita-1610", spot != NULL);
+	assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
+
+	tree = znode_get_tree(spot);
+	read_lock_tree(tree);
+	write_lock_dk(tree);
+
+	assert("nikita-2192", znode_is_loaded(spot));
+
+	/* sync left delimiting key of @spot with key in its leftmost item */
+	if (node_is_empty(spot))
+		pivot = *znode_get_rd_key(spot);
+	else
+		leftmost_key_in_node(spot, &pivot);
+
+	znode_set_ld_key(spot, &pivot);
+
+	/* there can be sequence of empty nodes pending removal on the left of
+	   @spot. Scan them and update their left and right delimiting keys to
+	   match left delimiting key of @spot. Also, update right delimiting
+	   key of first non-empty left neighbor.
+	 */
+	while (1) {
+		if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
+			break;
+
+		spot = spot->left;
+		if (spot == NULL)
+			break;
+
+		znode_set_rd_key(spot, &pivot);
+		/* don't sink into the domain of another balancing */
+		if (!znode_is_write_locked(spot))
+			break;
+		if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
+			znode_set_ld_key(spot, &pivot);
+		else
+			break;
+	}
+
+	write_unlock_dk(tree);
+	read_unlock_tree(tree);
+}
+
+/* unlock all carry nodes in @level */
+static void unlock_carry_level(carry_level * level /* level to unlock */ ,
+			       int failure	/* true if unlocking owing to
+						 * failure */ )
+{
+	carry_node *node;
+	carry_node *tmp_node;
+
+	assert("nikita-889", level != NULL);
+
+	if (!failure) {
+		znode *spot;
+
+		spot = NULL;
+		/* update delimiting keys */
+		for_all_nodes(level, node, tmp_node) {
+			if (reiser4_carry_real(node) != spot) {
+				spot = reiser4_carry_real(node);
+				sync_dkeys(spot);
+			}
+		}
+	}
+
+	/* nodes can be unlocked in arbitrary order.  In preemptible
+	   environment it's better to unlock in reverse order of locking,
+	   though.
+	 */
+	for_all_nodes_back(level, node, tmp_node) {
+		/* all allocated nodes should be already linked to their
+		   parents at this moment. */
+		assert("nikita-1631",
+		       ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
+						JNODE_ORPHAN)));
+		ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
+		unlock_carry_node(level, node, failure);
+	}
+	level->new_root = NULL;
+}
+
+/* finish with @level
+
+   Unlock nodes and release all allocated resources */
+static void done_carry_level(carry_level * level /* level to finish */ )
+{
+	carry_node *node;
+	carry_node *tmp_node;
+	carry_op *op;
+	carry_op *tmp_op;
+
+	assert("nikita-1076", level != NULL);
+
+	unlock_carry_level(level, 0);
+	for_all_nodes(level, node, tmp_node) {
+		assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
+		assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
+		reiser4_pool_free(&level->pool->node_pool, &node->header);
+	}
+	for_all_ops(level, op, tmp_op)
+	    reiser4_pool_free(&level->pool->op_pool, &op->header);
+}
+
+/* helper function to complete locking of carry node
+
+   Finish locking of carry node. There are several ways in which new carry
+   node can be added into carry level and locked. Normal is through
+   lock_carry_node(), but also from find_{left|right}_neighbor(). This
+   function factors out common final part of all locking scenarios. It
+   supposes that @node -> lock_handle is lock handle for lock just taken and
+   fills ->real_node from this lock handle.
+
+*/
+int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
+{
+	assert("nikita-1052", node != NULL);
+	assert("nikita-1187", reiser4_carry_real(node) != NULL);
+	assert("nikita-1188", !node->unlock);
+
+	node->unlock = 1;
+	/* Load node content into memory and install node plugin by
+	   looking at the node header.
+
+	   Most of the time this call is cheap because the node is
+	   already in memory.
+
+	   Corresponding zrelse() is in unlock_carry_node()
+	 */
+	return zload(reiser4_carry_real(node));
+}
+
+/* lock carry node
+
+   "Resolve" node to real znode, lock it and mark as locked.
+   This requires recursive locking of znodes.
+
+   When operation is posted to the parent level, node it will be applied to is
+   not yet known. For example, when shifting data between two nodes,
+   delimiting has to be updated in parent or parents of nodes involved. But
+   their parents is not yet locked and, moreover said nodes can be reparented
+   by concurrent balancing.
+
+   To work around this, carry operation is applied to special "carry node"
+   rather than to the znode itself. Carry node consists of some "base" or
+   "reference" znode and flags indicating how to get to the target of carry
+   operation (->real_node field of carry_node) from base.
+
+*/
+int lock_carry_node(carry_level * level /* level @node is in */ ,
+		    carry_node * node /* node to lock */ )
+{
+	int result;
+	znode *reference_point;
+	lock_handle lh;
+	lock_handle tmp_lh;
+	reiser4_tree *tree;
+
+	assert("nikita-887", level != NULL);
+	assert("nikita-882", node != NULL);
+
+	result = 0;
+	reference_point = node->node;
+	init_lh(&lh);
+	init_lh(&tmp_lh);
+	if (node->left_before) {
+		/* handling of new nodes, allocated on the previous level:
+
+		   some carry ops were propably posted from the new node, but
+		   this node neither has parent pointer set, nor is
+		   connected. This will be done in ->create_hook() for
+		   internal item.
+
+		   No then less, parent of new node has to be locked. To do
+		   this, first go to the "left" in the carry order. This
+		   depends on the decision to always allocate new node on the
+		   right of existing one.
+
+		   Loop handles case when multiple nodes, all orphans, were
+		   inserted.
+
+		   Strictly speaking, taking tree lock is not necessary here,
+		   because all nodes scanned by loop in
+		   find_begetting_brother() are write-locked by this thread,
+		   and thus, their sibling linkage cannot change.
+
+		 */
+		tree = znode_get_tree(reference_point);
+		read_lock_tree(tree);
+		reference_point = find_begetting_brother(node, level)->node;
+		read_unlock_tree(tree);
+		assert("nikita-1186", reference_point != NULL);
+	}
+	if (node->parent && (result == 0)) {
+		result =
+		    reiser4_get_parent(&tmp_lh, reference_point,
+				       ZNODE_WRITE_LOCK);
+		if (result != 0) {
+			;	/* nothing */
+		} else if (znode_get_level(tmp_lh.node) == 0) {
+			assert("nikita-1347", znode_above_root(tmp_lh.node));
+			result = add_new_root(level, node, tmp_lh.node);
+			if (result == 0) {
+				reference_point = level->new_root;
+				move_lh(&lh, &node->lock_handle);
+			}
+		} else if ((level->new_root != NULL)
+			   && (level->new_root !=
+			       znode_parent_nolock(reference_point))) {
+			/* parent of node exists, but this level aready
+			   created different new root, so */
+			warning("nikita-1109",
+				/* it should be "radicis", but tradition is
+				   tradition.  do banshees read latin? */
+				"hodie natus est radici frater");
+			result = -EIO;
+		} else {
+			move_lh(&lh, &tmp_lh);
+			reference_point = lh.node;
+		}
+	}
+	if (node->left && (result == 0)) {
+		assert("nikita-1183", node->parent);
+		assert("nikita-883", reference_point != NULL);
+		result =
+		    reiser4_get_left_neighbor(&tmp_lh, reference_point,
+					      ZNODE_WRITE_LOCK,
+					      GN_CAN_USE_UPPER_LEVELS);
+		if (result == 0) {
+			done_lh(&lh);
+			move_lh(&lh, &tmp_lh);
+			reference_point = lh.node;
+		}
+	}
+	if (!node->parent && !node->left && !node->left_before) {
+		result =
+		    longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
+					ZNODE_LOCK_HIPRI);
+	}
+	if (result == 0) {
+		move_lh(&node->lock_handle, &lh);
+		result = lock_carry_node_tail(node);
+	}
+	done_lh(&tmp_lh);
+	done_lh(&lh);
+	return result;
+}
+
+/* release a lock on &carry_node.
+
+   Release if necessary lock on @node. This opearion is pair of
+   lock_carry_node() and is idempotent: you can call it more than once on the
+   same node.
+
+*/
+static void
+unlock_carry_node(carry_level * level,
+		  carry_node * node /* node to be released */ ,
+		  int failure	/* 0 if node is unlocked due
+				 * to some error */ )
+{
+	znode *real_node;
+
+	assert("nikita-884", node != NULL);
+
+	real_node = reiser4_carry_real(node);
+	/* pair to zload() in lock_carry_node_tail() */
+	zrelse(real_node);
+	if (node->unlock && (real_node != NULL)) {
+		assert("nikita-899", real_node == node->lock_handle.node);
+		longterm_unlock_znode(&node->lock_handle);
+	}
+	if (failure) {
+		if (node->deallocate && (real_node != NULL)) {
+			/* free node in bitmap
+
+			   Prepare node for removal. Last zput() will finish
+			   with it.
+			 */
+			ZF_SET(real_node, JNODE_HEARD_BANSHEE);
+		}
+		if (node->free) {
+			assert("nikita-2177",
+			       list_empty_careful(&node->lock_handle.locks_link));
+			assert("nikita-2112",
+			       list_empty_careful(&node->lock_handle.owners_link));
+			reiser4_pool_free(&level->pool->node_pool,
+					  &node->header);
+		}
+	}
+}
+
+/* fatal_carry_error() - all-catching error handling function
+
+   It is possible that carry faces unrecoverable error, like unability to
+   insert pointer at the internal level. Our simple solution is just panic in
+   this situation. More sophisticated things like attempt to remount
+   file-system as read-only can be implemented without much difficlties.
+
+   It is believed, that:
+
+   1. in stead of panicking, all current transactions can be aborted rolling
+   system back to the consistent state.
+
+Umm, if you simply panic without doing anything more at all, then all current
+transactions are aborted and the system is rolled back to a consistent state,
+by virtue of the design of the transactional mechanism. Well, wait, let's be
+precise.  If an internal node is corrupted on disk due to hardware failure,
+then there may be no consistent state that can be rolled back to, so instead
+we should say that it will rollback the transactions, which barring other
+factors means rolling back to a consistent state.
+
+# Nikita: there is a subtle difference between panic and aborting
+# transactions: machine doesn't reboot. Processes aren't killed. Processes
+# don't using reiser4 (not that we care about such processes), or using other
+# reiser4 mounts (about them we do care) will simply continue to run. With
+# some luck, even application using aborted file system can survive: it will
+# get some error, like EBADF, from each file descriptor on failed file system,
+# but applications that do care about tolerance will cope with this (squid
+# will).
+
+It would be a nice feature though to support rollback without rebooting
+followed by remount, but this can wait for later versions.
+
+   2. once isolated transactions will be implemented it will be possible to
+   roll back offending transaction.
+
+2. is additional code complexity of inconsistent value (it implies that a broken tree should be kept in operation), so we must think about
+it more before deciding if it should be done.  -Hans
+
+*/
+static void fatal_carry_error(carry_level * doing UNUSED_ARG	/* carry level
+								 * where
+								 * unrecoverable
+								 * error
+								 * occurred */ ,
+			      int ecode /* error code */ )
+{
+	assert("nikita-1230", doing != NULL);
+	assert("nikita-1231", ecode < 0);
+
+	reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
+}
+
+/* add new root to the tree
+
+   This function itself only manages changes in carry structures and delegates
+   all hard work (allocation of znode for new root, changes of parent and
+   sibling pointers to the reiser4_add_tree_root().
+
+   Locking: old tree root is locked by carry at this point. Fake znode is also
+   locked.
+
+*/
+static int add_new_root(carry_level * level	/* carry level in context of which
+						 * operation is performed */ ,
+			carry_node * node /* carry node for existing root */ ,
+			znode * fake	/* "fake" znode already locked by
+					 * us */ )
+{
+	int result;
+
+	assert("nikita-1104", level != NULL);
+	assert("nikita-1105", node != NULL);
+
+	assert("nikita-1403", znode_is_write_locked(node->node));
+	assert("nikita-1404", znode_is_write_locked(fake));
+
+	/* trying to create new root. */
+	/* @node is root and it's already locked by us. This
+	   means that nobody else can be trying to add/remove
+	   tree root right now.
+	 */
+	if (level->new_root == NULL)
+		level->new_root = reiser4_add_tree_root(node->node, fake);
+	if (!IS_ERR(level->new_root)) {
+		assert("nikita-1210", znode_is_root(level->new_root));
+		node->deallocate = 1;
+		result =
+		    longterm_lock_znode(&node->lock_handle, level->new_root,
+					ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
+		if (result == 0)
+			zput(level->new_root);
+	} else {
+		result = PTR_ERR(level->new_root);
+		level->new_root = NULL;
+	}
+	return result;
+}
+
+/* allocate new znode and add the operation that inserts the
+   pointer to it into the parent node into the todo level
+
+   Allocate new znode, add it into carry queue and post into @todo queue
+   request to add pointer to new node into its parent.
+
+   This is carry related routing that calls reiser4_new_node() to allocate new
+   node.
+*/
+carry_node *add_new_znode(znode * brother	/* existing left neighbor of new
+						 * node */ ,
+			  carry_node * ref	/* carry node after which new
+						 * carry node is to be inserted
+						 * into queue. This affects
+						 * locking. */ ,
+			  carry_level * doing	/* carry queue where new node is
+						 * to be added */ ,
+			  carry_level * todo	/* carry queue where COP_INSERT
+						 * operation to add pointer to
+						 * new node will ne added */ )
+{
+	carry_node *fresh;
+	znode *new_znode;
+	carry_op *add_pointer;
+	carry_plugin_info info;
+
+	assert("nikita-1048", brother != NULL);
+	assert("nikita-1049", todo != NULL);
+
+	/* There is a lot of possible variations here: to what parent
+	   new node will be attached and where. For simplicity, always
+	   do the following:
+
+	   (1) new node and @brother will have the same parent.
+
+	   (2) new node is added on the right of @brother
+
+	 */
+
+	fresh =	reiser4_add_carry_skip(doing,
+				       ref ? POOLO_AFTER : POOLO_LAST, ref);
+	if (IS_ERR(fresh))
+		return fresh;
+
+	fresh->deallocate = 1;
+	fresh->free = 1;
+
+	new_znode = reiser4_new_node(brother, znode_get_level(brother));
+	if (IS_ERR(new_znode))
+		/* @fresh will be deallocated automatically by error
+		   handling code in the caller. */
+		return (carry_node *) new_znode;
+
+	/* new_znode returned znode with x_count 1. Caller has to decrease
+	   it. make_space() does. */
+
+	ZF_SET(new_znode, JNODE_ORPHAN);
+	fresh->node = new_znode;
+
+	while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {
+		ref = carry_node_prev(ref);
+		assert("nikita-1606", !carry_node_end(doing, ref));
+	}
+
+	info.todo = todo;
+	info.doing = doing;
+	add_pointer = node_post_carry(&info, COP_INSERT,
+				      reiser4_carry_real(ref), 1);
+	if (IS_ERR(add_pointer)) {
+		/* no need to deallocate @new_znode here: it will be
+		   deallocated during carry error handling. */
+		return (carry_node *) add_pointer;
+	}
+
+	add_pointer->u.insert.type = COPT_CHILD;
+	add_pointer->u.insert.child = fresh;
+	add_pointer->u.insert.brother = brother;
+	/* initially new node spawns empty key range */
+	write_lock_dk(znode_get_tree(brother));
+	znode_set_ld_key(new_znode,
+			 znode_set_rd_key(new_znode,
+					  znode_get_rd_key(brother)));
+	write_unlock_dk(znode_get_tree(brother));
+	return fresh;
+}
+
+/* DEBUGGING FUNCTIONS.
+
+   Probably we also should leave them on even when
+   debugging is turned off to print dumps at errors.
+*/
+#if REISER4_DEBUG
+static int carry_level_invariant(carry_level * level, carry_queue_state state)
+{
+	carry_node *node;
+	carry_node *tmp_node;
+
+	if (level == NULL)
+		return 0;
+
+	if (level->track_type != 0 &&
+	    level->track_type != CARRY_TRACK_NODE &&
+	    level->track_type != CARRY_TRACK_CHANGE)
+		return 0;
+
+	/* check that nodes are in ascending order */
+	for_all_nodes(level, node, tmp_node) {
+		znode *left;
+		znode *right;
+
+		reiser4_key lkey;
+		reiser4_key rkey;
+
+		if (node != carry_node_front(level)) {
+			if (state == CARRY_TODO) {
+				right = node->node;
+				left = carry_node_prev(node)->node;
+			} else {
+				right = reiser4_carry_real(node);
+				left = reiser4_carry_real(carry_node_prev(node));
+			}
+			if (right == NULL || left == NULL)
+				continue;
+			if (node_is_empty(right) || node_is_empty(left))
+				continue;
+			if (!keyle(leftmost_key_in_node(left, &lkey),
+				   leftmost_key_in_node(right, &rkey))) {
+				warning("", "wrong key order");
+				return 0;
+			}
+		}
+	}
+	return 1;
+}
+#endif
+
+/* get symbolic name for boolean */
+static const char *tf(int boolean /* truth value */ )
+{
+	return boolean ? "t" : "f";
+}
+
+/* symbolic name for carry operation */
+static const char *carry_op_name(carry_opcode op /* carry opcode */ )
+{
+	switch (op) {
+	case COP_INSERT:
+		return "COP_INSERT";
+	case COP_DELETE:
+		return "COP_DELETE";
+	case COP_CUT:
+		return "COP_CUT";
+	case COP_PASTE:
+		return "COP_PASTE";
+	case COP_UPDATE:
+		return "COP_UPDATE";
+	case COP_EXTENT:
+		return "COP_EXTENT";
+	case COP_INSERT_FLOW:
+		return "COP_INSERT_FLOW";
+	default:{
+			/* not mt safe, but who cares? */
+			static char buf[20];
+
+			sprintf(buf, "unknown op: %x", op);
+			return buf;
+		}
+	}
+}
+
+/* dump information about carry node */
+static void print_carry(const char *prefix /* prefix to print */ ,
+			carry_node * node /* node to print */ )
+{
+	if (node == NULL) {
+		printk("%s: null\n", prefix);
+		return;
+	}
+	printk
+	    ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
+	     prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
+	     tf(node->free), tf(node->deallocate));
+}
+
+/* dump information about carry operation */
+static void print_op(const char *prefix /* prefix to print */ ,
+		     carry_op * op /* operation to print */ )
+{
+	if (op == NULL) {
+		printk("%s: null\n", prefix);
+		return;
+	}
+	printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
+	print_carry("\tnode", op->node);
+	switch (op->op) {
+	case COP_INSERT:
+	case COP_PASTE:
+		print_coord("\tcoord",
+			    op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
+		reiser4_print_key("\tkey",
+				  op->u.insert.d ? op->u.insert.d->key : NULL);
+		print_carry("\tchild", op->u.insert.child);
+		break;
+	case COP_DELETE:
+		print_carry("\tchild", op->u.delete.child);
+		break;
+	case COP_CUT:
+		if (op->u.cut_or_kill.is_cut) {
+			print_coord("\tfrom",
+				    op->u.cut_or_kill.u.kill->params.from, 0);
+			print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
+				    0);
+		} else {
+			print_coord("\tfrom",
+				    op->u.cut_or_kill.u.cut->params.from, 0);
+			print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
+				    0);
+		}
+		break;
+	case COP_UPDATE:
+		print_carry("\tleft", op->u.update.left);
+		break;
+	default:
+		/* do nothing */
+		break;
+	}
+}
+
+/* dump information about all nodes and operations in a @level */
+static void print_level(const char *prefix /* prefix to print */ ,
+			carry_level * level /* level to print */ )
+{
+	carry_node *node;
+	carry_node *tmp_node;
+	carry_op *op;
+	carry_op *tmp_op;
+
+	if (level == NULL) {
+		printk("%s: null\n", prefix);
+		return;
+	}
+	printk("%s: %p, restartable: %s\n",
+	       prefix, level, tf(level->restartable));
+
+	for_all_nodes(level, node, tmp_node)
+	    print_carry("\tcarry node", node);
+	for_all_ops(level, op, tmp_op)
+	    print_op("\tcarry op", op);
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/carry.h linux-2.6.22/fs/reiser4/carry.h
--- linux-2.6.22.orig/fs/reiser4/carry.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/carry.h	2007-07-29 00:25:34.824683017 +0400
@@ -0,0 +1,442 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Functions and data types to "carry" tree modification(s) upward.
+   See fs/reiser4/carry.c for details. */
+
+#if !defined( __FS_REISER4_CARRY_H__ )
+#define __FS_REISER4_CARRY_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "pool.h"
+#include "znode.h"
+
+#include <linux/types.h>
+
+/* &carry_node - "location" of carry node.
+
+   "location" of node that is involved or going to be involved into
+   carry process. Node where operation will be carried to on the
+   parent level cannot be recorded explicitly. Operation will be carried
+   usually to the parent of some node (where changes are performed at
+   the current level) or, to the left neighbor of its parent. But while
+   modifications are performed at the current level, parent may
+   change. So, we have to allow some indirection (or, positevly,
+   flexibility) in locating carry nodes.
+
+*/
+typedef struct carry_node {
+	/* pool linkage */
+	struct reiser4_pool_header header;
+
+	/* base node from which real_node is calculated. See
+	   fs/reiser4/carry.c:lock_carry_node(). */
+	znode *node;
+
+	/* how to get ->real_node */
+	/* to get ->real_node obtain parent of ->node */
+	__u32 parent:1;
+	/* to get ->real_node obtain left neighbor of parent of
+	   ->node */
+	__u32 left:1;
+	__u32 left_before:1;
+
+	/* locking */
+
+	/* this node was locked by carry process and should be
+	   unlocked when carry leaves a level */
+	__u32 unlock:1;
+
+	/* disk block for this node was allocated by carry process and
+	   should be deallocated when carry leaves a level */
+	__u32 deallocate:1;
+	/* this carry node was allocated by carry process and should be
+	   freed when carry leaves a level */
+	__u32 free:1;
+
+	/* type of lock we want to take on this node */
+	lock_handle lock_handle;
+} carry_node;
+
+/* &carry_opcode - elementary operations that can be carried upward
+
+   Operations that carry() can handle. This list is supposed to be
+   expanded.
+
+   Each carry operation (cop) is handled by appropriate function defined
+   in fs/reiser4/carry.c. For example COP_INSERT is handled by
+   fs/reiser4/carry.c:carry_insert() etc. These functions in turn
+   call plugins of nodes affected by operation to modify nodes' content
+   and to gather operations to be performed on the next level.
+
+*/
+typedef enum {
+	/* insert new item into node. */
+	COP_INSERT,
+	/* delete pointer from parent node */
+	COP_DELETE,
+	/* remove part of or whole node. */
+	COP_CUT,
+	/* increase size of item. */
+	COP_PASTE,
+	/* insert extent (that is sequence of unformatted nodes). */
+	COP_EXTENT,
+	/* update delimiting key in least common ancestor of two
+	   nodes. This is performed when items are moved between two
+	   nodes.
+	 */
+	COP_UPDATE,
+	/* insert flow */
+	COP_INSERT_FLOW,
+	COP_LAST_OP,
+} carry_opcode;
+
+#define CARRY_FLOW_NEW_NODES_LIMIT 20
+
+/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
+   item is determined. */
+typedef enum {
+	/* target item is one containing pointer to the ->child node */
+	COPT_CHILD,
+	/* target item is given explicitly by @coord */
+	COPT_ITEM_DATA,
+	/* target item is given by key */
+	COPT_KEY,
+	/* see insert_paste_common() for more comments on this. */
+	COPT_PASTE_RESTARTED,
+} cop_insert_pos_type;
+
+/* flags to cut and delete */
+typedef enum {
+	/* don't kill node even if it became completely empty as results of
+	 * cut. This is needed for eottl handling. See carry_extent() for
+	 * details. */
+	DELETE_RETAIN_EMPTY = (1 << 0)
+} cop_delete_flag;
+
+/*
+ * carry() implements "lock handle tracking" feature.
+ *
+ * Callers supply carry with node where to perform initial operation and lock
+ * handle on this node. Trying to optimize node utilization carry may actually
+ * move insertion point to different node. Callers expect that lock handle
+ * will rebe transferred to the new node also.
+ *
+ */
+typedef enum {
+	/* transfer lock handle along with insertion point */
+	CARRY_TRACK_CHANGE = 1,
+	/* acquire new lock handle to the node where insertion point is. This
+	 * is used when carry() client doesn't initially possess lock handle
+	 * on the insertion point node, for example, by extent insertion
+	 * code. See carry_extent(). */
+	CARRY_TRACK_NODE = 2
+} carry_track_type;
+
+/* data supplied to COP_{INSERT|PASTE} by callers */
+typedef struct carry_insert_data {
+	/* position where new item is to be inserted */
+	coord_t *coord;
+	/* new item description */
+	reiser4_item_data *data;
+	/* key of new item */
+	const reiser4_key *key;
+} carry_insert_data;
+
+/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
+struct cut_kill_params {
+	/* coord where cut starts (inclusive) */
+	coord_t *from;
+	/* coord where cut stops (inclusive, this item/unit will also be
+	 * cut) */
+	coord_t *to;
+	/* starting key. This is necessary when item and unit pos don't
+	 * uniquely identify what portion or tree to remove. For example, this
+	 * indicates what portion of extent unit will be affected. */
+	const reiser4_key *from_key;
+	/* exclusive stop key */
+	const reiser4_key *to_key;
+	/* if this is not NULL, smallest actually removed key is stored
+	 * here. */
+	reiser4_key *smallest_removed;
+	/* kill_node_content()  is called for file truncate */
+	int truncate;
+};
+
+struct carry_cut_data {
+	struct cut_kill_params params;
+};
+
+struct carry_kill_data {
+	struct cut_kill_params params;
+	/* parameter to be passed to the ->kill_hook() method of item
+	 * plugin */
+	/*void *iplug_params; *//* FIXME: unused currently */
+	/* if not NULL---inode whose items are being removed. This is needed
+	 * for ->kill_hook() of extent item to update VM structures when
+	 * removing pages. */
+	struct inode *inode;
+	/* sibling list maintenance is complicated by existence of eottl. When
+	 * eottl whose left and right neighbors are formatted leaves is
+	 * removed, one has to connect said leaves in the sibling list. This
+	 * cannot be done when extent removal is just started as locking rules
+	 * require sibling list update to happen atomically with removal of
+	 * extent item. Therefore: 1. pointers to left and right neighbors
+	 * have to be passed down to the ->kill_hook() of extent item, and
+	 * 2. said neighbors have to be locked. */
+	lock_handle *left;
+	lock_handle *right;
+	/* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
+	unsigned flags;
+	char *buf;
+};
+
+/* &carry_tree_op - operation to "carry" upward.
+
+   Description of an operation we want to "carry" to the upper level of
+   a tree: e.g, when we insert something and there is not enough space
+   we allocate a new node and "carry" the operation of inserting a
+   pointer to the new node to the upper level, on removal of empty node,
+   we carry up operation of removing appropriate entry from parent.
+
+   There are two types of carry ops: when adding or deleting node we
+   node at the parent level where appropriate modification has to be
+   performed is known in advance. When shifting items between nodes
+   (split, merge), delimiting key should be changed in the least common
+   parent of the nodes involved that is not known in advance.
+
+   For the operations of the first type we store in &carry_op pointer to
+   the &carry_node at the parent level. For the operation of the second
+   type we store &carry_node or parents of the left and right nodes
+   modified and keep track of them upward until they coincide.
+
+*/
+typedef struct carry_op {
+	/* pool linkage */
+	struct reiser4_pool_header header;
+	carry_opcode op;
+	/* node on which operation is to be performed:
+
+	   for insert, paste: node where new item is to be inserted
+
+	   for delete: node where pointer is to be deleted
+
+	   for cut: node to cut from
+
+	   for update: node where delimiting key is to be modified
+
+	   for modify: parent of modified node
+
+	 */
+	carry_node *node;
+	union {
+		struct {
+			/* (sub-)type of insertion/paste. Taken from
+			   cop_insert_pos_type. */
+			__u8 type;
+			/* various operation flags. Taken from
+			   cop_insert_flag. */
+			__u8 flags;
+			carry_insert_data *d;
+			carry_node *child;
+			znode *brother;
+		} insert, paste, extent;
+
+		struct {
+			int is_cut;
+			union {
+				carry_kill_data *kill;
+				carry_cut_data *cut;
+			} u;
+		} cut_or_kill;
+
+		struct {
+			carry_node *left;
+		} update;
+		struct {
+			/* changed child */
+			carry_node *child;
+			/* bitmask of changes. See &cop_modify_flag */
+			__u32 flag;
+		} modify;
+		struct {
+			/* flags to deletion operation. Are taken from
+			   cop_delete_flag */
+			__u32 flags;
+			/* child to delete from parent. If this is
+			   NULL, delete op->node.  */
+			carry_node *child;
+		} delete;
+		struct {
+			/* various operation flags. Taken from
+			   cop_insert_flag. */
+			__u32 flags;
+			flow_t *flow;
+			coord_t *insert_point;
+			reiser4_item_data *data;
+			/* flow insertion is limited by number of new blocks
+			   added in that operation which do not get any data
+			   but part of flow. This limit is set by macro
+			   CARRY_FLOW_NEW_NODES_LIMIT. This field stores number
+			   of nodes added already during one carry_flow */
+			int new_nodes;
+		} insert_flow;
+	} u;
+} carry_op;
+
+/* &carry_op_pool - preallocated pool of carry operations, and nodes */
+typedef struct carry_pool {
+	carry_op op[CARRIES_POOL_SIZE];
+	struct reiser4_pool op_pool;
+	carry_node node[NODES_LOCKED_POOL_SIZE];
+	struct reiser4_pool node_pool;
+} carry_pool;
+
+/* &carry_tree_level - carry process on given level
+
+   Description of balancing process on the given level.
+
+   No need for locking here, as carry_tree_level is essentially per
+   thread thing (for now).
+
+*/
+struct carry_level {
+	/* this level may be restarted */
+	__u32 restartable:1;
+	/* list of carry nodes on this level, ordered by key order */
+	struct list_head nodes;
+	struct list_head ops;
+	/* pool where new objects are allocated from */
+	carry_pool *pool;
+	int ops_num;
+	int nodes_num;
+	/* new root created on this level, if any */
+	znode *new_root;
+	/* This is set by caller (insert_by_key(), rreiser4_esize_item(), etc.)
+	   when they want ->tracked to automagically wander to the node where
+	   insertion point moved after insert or paste.
+	 */
+	carry_track_type track_type;
+	/* lock handle supplied by user that we are tracking. See
+	   above. */
+	lock_handle *tracked;
+};
+
+/* information carry passes to plugin methods that may add new operations to
+   the @todo queue  */
+struct carry_plugin_info {
+	carry_level *doing;
+	carry_level *todo;
+};
+
+int reiser4_carry(carry_level * doing, carry_level * done);
+
+carry_node *reiser4_add_carry(carry_level * level, pool_ordering order,
+			      carry_node * reference);
+carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order,
+				   carry_node * reference);
+
+extern carry_node *insert_carry_node(carry_level * doing,
+				     carry_level * todo, const znode * node);
+
+extern carry_pool *init_carry_pool(int);
+extern void done_carry_pool(carry_pool * pool);
+
+extern void init_carry_level(carry_level * level, carry_pool * pool);
+
+extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op,
+				    znode * node, int apply_to_parent);
+extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
+				 znode * node, int apply_to_parent_p);
+
+carry_node *add_new_znode(znode * brother, carry_node * reference,
+			  carry_level * doing, carry_level * todo);
+
+carry_node *find_carry_node(carry_level * level, const znode * node);
+
+extern znode *reiser4_carry_real(const carry_node * node);
+
+/* helper macros to iterate over carry queues */
+
+#define carry_node_next( node )					\
+	list_entry((node)->header.level_linkage.next, carry_node,	\
+		   header.level_linkage)
+
+#define carry_node_prev( node )					\
+	list_entry((node)->header.level_linkage.prev, carry_node,	\
+		   header.level_linkage)
+
+#define carry_node_front( level )						\
+	list_entry((level)->nodes.next, carry_node, header.level_linkage)
+
+#define carry_node_back( level )						\
+	list_entry((level)->nodes.prev, carry_node, header.level_linkage)
+
+#define carry_node_end( level, node )				\
+	(&(level)->nodes == &(node)->header.level_linkage)
+
+/* macro to iterate over all operations in a @level */
+#define for_all_ops( level /* carry level (of type carry_level *) */,			\
+		     op    /* pointer to carry operation, modified by loop (of 		\
+			    * type carry_op *) */,					\
+		     tmp   /* pointer to carry operation (of type carry_op *), 		\
+			    * used to make iterator stable in the face of 		\
+			    * deletions from the level */ )				\
+for (op = list_entry(level->ops.next, carry_op, header.level_linkage),			\
+     tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); 	\
+     &op->header.level_linkage != &level->ops;						\
+     op = tmp,										\
+     tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage))
+
+#if 0
+for( op = ( carry_op * ) pool_level_list_front( &level -> ops ),		\
+     tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ;		\
+     ! pool_level_list_end( &level -> ops, &op -> header ) ;			\
+     op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) )
+#endif
+
+/* macro to iterate over all nodes in a @level */						\
+#define for_all_nodes( level /* carry level (of type carry_level *) */,				\
+		       node  /* pointer to carry node, modified by loop (of 			\
+			      * type carry_node *) */,						\
+		       tmp   /* pointer to carry node (of type carry_node *), 			\
+			      * used to make iterator stable in the face of * 			\
+			      * deletions from the level */ )					\
+for (node = list_entry(level->nodes.next, carry_node, header.level_linkage),			\
+     tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); 	\
+     &node->header.level_linkage != &level->nodes;						\
+     node = tmp, 										\
+     tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
+
+#if 0
+for( node = carry_node_front( level ),						\
+     tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ;		\
+     node = tmp, tmp = carry_node_next( node ) )
+#endif
+
+/* macro to iterate over all nodes in a @level in reverse order
+
+   This is used, because nodes are unlocked in reversed order of locking */
+#define for_all_nodes_back( level /* carry level (of type carry_level *) */,	\
+		            node  /* pointer to carry node, modified by loop	\
+				   * (of type carry_node *) */,			\
+		            tmp   /* pointer to carry node (of type carry_node	\
+				   * *), used to make iterator stable in the	\
+				   * face of deletions from the level */ )	\
+for( node = carry_node_back( level ),		\
+     tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ;		\
+     node = tmp, tmp = carry_node_prev( node ) )
+
+/* __FS_REISER4_CARRY_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/carry_ops.c linux-2.6.22/fs/reiser4/carry_ops.c
--- linux-2.6.22.orig/fs/reiser4/carry_ops.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/carry_ops.c	2007-07-29 00:25:34.828684053 +0400
@@ -0,0 +1,2131 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* implementation of carry operations */
+
+#include "forward.h"
+#include "debug.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/node/node.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree_walk.h"
+#include "pool.h"
+#include "tree_mod.h"
+#include "carry.h"
+#include "carry_ops.h"
+#include "tree.h"
+#include "super.h"
+#include "reiser4.h"
+
+#include <linux/types.h>
+#include <linux/err.h>
+
+static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
+			    carry_level * doing, carry_level * todo,
+			    unsigned int including_insert_coord_p);
+
+extern int lock_carry_node(carry_level * level, carry_node * node);
+extern int lock_carry_node_tail(carry_node * node);
+
+/* find left neighbor of a carry node
+
+   Look for left neighbor of @node and add it to the @doing queue. See
+   comments in the body.
+
+*/
+static carry_node *find_left_neighbor(carry_op * op	/* node to find left
+							 * neighbor of */ ,
+				      carry_level * doing /* level to scan */ )
+{
+	int result;
+	carry_node *node;
+	carry_node *left;
+	int flags;
+	reiser4_tree *tree;
+
+	node = op->node;
+
+	tree = current_tree;
+	read_lock_tree(tree);
+	/* first, check whether left neighbor is already in a @doing queue */
+	if (reiser4_carry_real(node)->left != NULL) {
+		/* NOTE: there is locking subtlety here. Look into
+		 * find_right_neighbor() for more info */
+		if (find_carry_node(doing,
+				    reiser4_carry_real(node)->left) != NULL) {
+			read_unlock_tree(tree);
+			left = node;
+			do {
+				left = list_entry(left->header.level_linkage.prev,
+						  carry_node, header.level_linkage);
+				assert("nikita-3408", !carry_node_end(doing,
+								      left));
+			} while (reiser4_carry_real(left) ==
+				 reiser4_carry_real(node));
+			return left;
+		}
+	}
+	read_unlock_tree(tree);
+
+	left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
+	if (IS_ERR(left))
+		return left;
+
+	left->node = node->node;
+	left->free = 1;
+
+	flags = GN_TRY_LOCK;
+	if (!op->u.insert.flags & COPI_LOAD_LEFT)
+		flags |= GN_NO_ALLOC;
+
+	/* then, feeling lucky, peek left neighbor in the cache. */
+	result = reiser4_get_left_neighbor(&left->lock_handle,
+					   reiser4_carry_real(node),
+					   ZNODE_WRITE_LOCK, flags);
+	if (result == 0) {
+		/* ok, node found and locked. */
+		result = lock_carry_node_tail(left);
+		if (result != 0)
+			left = ERR_PTR(result);
+	} else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
+		/* node is leftmost node in a tree, or neighbor wasn't in
+		   cache, or there is an extent on the left. */
+		reiser4_pool_free(&doing->pool->node_pool, &left->header);
+		left = NULL;
+	} else if (doing->restartable) {
+		/* if left neighbor is locked, and level is restartable, add
+		   new node to @doing and restart. */
+		assert("nikita-913", node->parent != 0);
+		assert("nikita-914", node->node != NULL);
+		left->left = 1;
+		left->free = 0;
+		left = ERR_PTR(-E_REPEAT);
+	} else {
+		/* left neighbor is locked, level cannot be restarted. Just
+		   ignore left neighbor. */
+		reiser4_pool_free(&doing->pool->node_pool, &left->header);
+		left = NULL;
+	}
+	return left;
+}
+
+/* find right neighbor of a carry node
+
+   Look for right neighbor of @node and add it to the @doing queue. See
+   comments in the body.
+
+*/
+static carry_node *find_right_neighbor(carry_op * op	/* node to find right
+							 * neighbor of */ ,
+				       carry_level * doing /* level to scan */ )
+{
+	int result;
+	carry_node *node;
+	carry_node *right;
+	lock_handle lh;
+	int flags;
+	reiser4_tree *tree;
+
+	init_lh(&lh);
+
+	node = op->node;
+
+	tree = current_tree;
+	read_lock_tree(tree);
+	/* first, check whether right neighbor is already in a @doing queue */
+	if (reiser4_carry_real(node)->right != NULL) {
+		/*
+		 * Tree lock is taken here anyway, because, even if _outcome_
+		 * of (find_carry_node() != NULL) doesn't depends on
+		 * concurrent updates to ->right, find_carry_node() cannot
+		 * work with second argument NULL. Hence, following comment is
+		 * of historic importance only.
+		 *
+		 * Subtle:
+		 *
+		 * Q: why don't we need tree lock here, looking for the right
+		 * neighbor?
+		 *
+		 * A: even if value of node->real_node->right were changed
+		 * during find_carry_node() execution, outcome of execution
+		 * wouldn't change, because (in short) other thread cannot add
+		 * elements to the @doing, and if node->real_node->right
+		 * already was in @doing, value of node->real_node->right
+		 * couldn't change, because node cannot be inserted between
+		 * locked neighbors.
+		 */
+		if (find_carry_node(doing,
+				    reiser4_carry_real(node)->right) != NULL) {
+			read_unlock_tree(tree);
+			/*
+			 * What we are doing here (this is also applicable to
+			 * the find_left_neighbor()).
+			 *
+			 * tree_walk.c code requires that insertion of a
+			 * pointer to a child, modification of parent pointer
+			 * in the child, and insertion of the child into
+			 * sibling list are atomic (see
+			 * plugin/item/internal.c:create_hook_internal()).
+			 *
+			 * carry allocates new node long before pointer to it
+			 * is inserted into parent and, actually, long before
+			 * parent is even known. Such allocated-but-orphaned
+			 * nodes are only trackable through carry level lists.
+			 *
+			 * Situation that is handled here is following: @node
+			 * has valid ->right pointer, but there is
+			 * allocated-but-orphaned node in the carry queue that
+			 * is logically between @node and @node->right. Here
+			 * we are searching for it. Critical point is that
+			 * this is only possible if @node->right is also in
+			 * the carry queue (this is checked above), because
+			 * this is the only way new orphaned node could be
+			 * inserted between them (before inserting new node,
+			 * make_space() first tries to shift to the right, so,
+			 * right neighbor will be locked and queued).
+			 *
+			 */
+			right = node;
+			do {
+				right = list_entry(right->header.level_linkage.next,
+						   carry_node, header.level_linkage);
+				assert("nikita-3408", !carry_node_end(doing,
+								      right));
+			} while (reiser4_carry_real(right) ==
+				 reiser4_carry_real(node));
+			return right;
+		}
+	}
+	read_unlock_tree(tree);
+
+	flags = GN_CAN_USE_UPPER_LEVELS;
+	if (!op->u.insert.flags & COPI_LOAD_RIGHT)
+		flags = GN_NO_ALLOC;
+
+	/* then, try to lock right neighbor */
+	init_lh(&lh);
+	result = reiser4_get_right_neighbor(&lh,
+					    reiser4_carry_real(node),
+					    ZNODE_WRITE_LOCK, flags);
+	if (result == 0) {
+		/* ok, node found and locked. */
+		right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
+		if (!IS_ERR(right)) {
+			right->node = lh.node;
+			move_lh(&right->lock_handle, &lh);
+			right->free = 1;
+			result = lock_carry_node_tail(right);
+			if (result != 0)
+				right = ERR_PTR(result);
+		}
+	} else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
+		/* node is rightmost node in a tree, or neighbor wasn't in
+		   cache, or there is an extent on the right. */
+		right = NULL;
+	} else
+		right = ERR_PTR(result);
+	done_lh(&lh);
+	return right;
+}
+
+/* how much free space in a @node is needed for @op
+
+   How much space in @node is required for completion of @op, where @op is
+   insert or paste operation.
+*/
+static unsigned int space_needed_for_op(znode * node	/* znode data are
+							 * inserted or
+							 * pasted in */ ,
+					carry_op * op	/* carry
+							   operation */ )
+{
+	assert("nikita-919", op != NULL);
+
+	switch (op->op) {
+	default:
+		impossible("nikita-1701", "Wrong opcode");
+	case COP_INSERT:
+		return space_needed(node, NULL, op->u.insert.d->data, 1);
+	case COP_PASTE:
+		return space_needed(node, op->u.insert.d->coord,
+				    op->u.insert.d->data, 0);
+	}
+}
+
+/* how much space in @node is required to insert or paste @data at
+   @coord. */
+unsigned int space_needed(const znode * node	/* node data are inserted or
+						 * pasted in */ ,
+			  const coord_t * coord	/* coord where data are
+						 * inserted or pasted
+						 * at */ ,
+			  const reiser4_item_data * data	/* data to insert or
+								 * paste */ ,
+			  int insertion /* non-0 is inserting, 0---paste */ )
+{
+	int result;
+	item_plugin *iplug;
+
+	assert("nikita-917", node != NULL);
+	assert("nikita-918", node_plugin_by_node(node) != NULL);
+	assert("vs-230", !insertion || (coord == NULL));
+
+	result = 0;
+	iplug = data->iplug;
+	if (iplug->b.estimate != NULL) {
+		/* ask item plugin how much space is needed to insert this
+		   item */
+		result += iplug->b.estimate(insertion ? NULL : coord, data);
+	} else {
+		/* reasonable default */
+		result += data->length;
+	}
+	if (insertion) {
+		node_plugin *nplug;
+
+		nplug = node->nplug;
+		/* and add node overhead */
+		if (nplug->item_overhead != NULL) {
+			result += nplug->item_overhead(node, NULL);
+		}
+	}
+	return result;
+}
+
+/* find &coord in parent where pointer to new child is to be stored. */
+static int find_new_child_coord(carry_op * op	/* COP_INSERT carry operation to
+						 * insert pointer to new
+						 * child */ )
+{
+	int result;
+	znode *node;
+	znode *child;
+
+	assert("nikita-941", op != NULL);
+	assert("nikita-942", op->op == COP_INSERT);
+
+	node = reiser4_carry_real(op->node);
+	assert("nikita-943", node != NULL);
+	assert("nikita-944", node_plugin_by_node(node) != NULL);
+
+	child = reiser4_carry_real(op->u.insert.child);
+	result =
+	    find_new_child_ptr(node, child, op->u.insert.brother,
+			       op->u.insert.d->coord);
+
+	build_child_ptr_data(child, op->u.insert.d->data);
+	return result;
+}
+
+/* additional amount of free space in @node required to complete @op */
+static int free_space_shortage(znode * node /* node to check */ ,
+			       carry_op * op /* operation being performed */ )
+{
+	assert("nikita-1061", node != NULL);
+	assert("nikita-1062", op != NULL);
+
+	switch (op->op) {
+	default:
+		impossible("nikita-1702", "Wrong opcode");
+	case COP_INSERT:
+	case COP_PASTE:
+		return space_needed_for_op(node, op) - znode_free_space(node);
+	case COP_EXTENT:
+		/* when inserting extent shift data around until insertion
+		   point is utmost in the node. */
+		if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
+			return +1;
+		else
+			return -1;
+	}
+}
+
+/* helper function: update node pointer in operation after insertion
+   point was probably shifted into @target. */
+static znode *sync_op(carry_op * op, carry_node * target)
+{
+	znode *insertion_node;
+
+	/* reget node from coord: shift might move insertion coord to
+	   the neighbor */
+	insertion_node = op->u.insert.d->coord->node;
+	/* if insertion point was actually moved into new node,
+	   update carry node pointer in operation. */
+	if (insertion_node != reiser4_carry_real(op->node)) {
+		op->node = target;
+		assert("nikita-2540",
+		       reiser4_carry_real(target) == insertion_node);
+	}
+	assert("nikita-2541",
+	       reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
+	return insertion_node;
+}
+
+/*
+ * complete make_space() call: update tracked lock handle if necessary. See
+ * comments for fs/reiser4/carry.h:carry_track_type
+ */
+static int
+make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
+{
+	int result;
+	carry_track_type tracking;
+	znode *node;
+
+	tracking = doing->track_type;
+	node = op->u.insert.d->coord->node;
+
+	if (tracking == CARRY_TRACK_NODE ||
+	    (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
+		/* inserting or pasting into node different from
+		   original. Update lock handle supplied by caller. */
+		assert("nikita-1417", doing->tracked != NULL);
+		done_lh(doing->tracked);
+		init_lh(doing->tracked);
+		result = longterm_lock_znode(doing->tracked, node,
+					     ZNODE_WRITE_LOCK,
+					     ZNODE_LOCK_HIPRI);
+	} else
+		result = 0;
+	return result;
+}
+
+/* This is insertion policy function. It shifts data to the left and right
+   neighbors of insertion coord and allocates new nodes until there is enough
+   free space to complete @op.
+
+   See comments in the body.
+
+   Assumes that the node format favors insertions at the right end of the node
+   as node40 does.
+
+   See carry_flow() on detail about flow insertion
+*/
+static int make_space(carry_op * op /* carry operation, insert or paste */ ,
+		      carry_level * doing /* current carry queue */ ,
+		      carry_level * todo /* carry queue on the parent level */ )
+{
+	znode *node;
+	int result;
+	int not_enough_space;
+	int blk_alloc;
+	znode *orig_node;
+	__u32 flags;
+
+	coord_t *coord;
+
+	assert("nikita-890", op != NULL);
+	assert("nikita-891", todo != NULL);
+	assert("nikita-892",
+	       op->op == COP_INSERT ||
+	       op->op == COP_PASTE || op->op == COP_EXTENT);
+	assert("nikita-1607",
+	       reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
+
+	flags = op->u.insert.flags;
+
+	/* NOTE check that new node can only be allocated after checking left
+	 * and right neighbors. This is necessary for proper work of
+	 * find_{left,right}_neighbor(). */
+	assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
+				   flags & COPI_DONT_SHIFT_LEFT));
+	assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
+				   flags & COPI_DONT_SHIFT_RIGHT));
+
+	coord = op->u.insert.d->coord;
+	orig_node = node = coord->node;
+
+	assert("nikita-908", node != NULL);
+	assert("nikita-909", node_plugin_by_node(node) != NULL);
+
+	result = 0;
+	/* If there is not enough space in a node, try to shift something to
+	   the left neighbor. This is a bit tricky, as locking to the left is
+	   low priority. This is handled by restart logic in carry().
+	 */
+	not_enough_space = free_space_shortage(node, op);
+	if (not_enough_space <= 0)
+		/* it is possible that carry was called when there actually
+		   was enough space in the node. For example, when inserting
+		   leftmost item so that delimiting keys have to be updated.
+		 */
+		return make_space_tail(op, doing, orig_node);
+	if (!(flags & COPI_DONT_SHIFT_LEFT)) {
+		carry_node *left;
+		/* make note in statistics of an attempt to move
+		   something into the left neighbor */
+		left = find_left_neighbor(op, doing);
+		if (unlikely(IS_ERR(left))) {
+			if (PTR_ERR(left) == -E_REPEAT)
+				return -E_REPEAT;
+			else {
+				/* some error other than restart request
+				   occurred. This shouldn't happen. Issue a
+				   warning and continue as if left neighbor
+				   weren't existing.
+				 */
+				warning("nikita-924",
+					"Error accessing left neighbor: %li",
+					PTR_ERR(left));
+			}
+		} else if (left != NULL) {
+
+			/* shift everything possible on the left of and
+			   including insertion coord into the left neighbor */
+			result = carry_shift_data(LEFT_SIDE, coord,
+						  reiser4_carry_real(left),
+						  doing, todo,
+						  flags & COPI_GO_LEFT);
+
+			/* reget node from coord: shift_left() might move
+			   insertion coord to the left neighbor */
+			node = sync_op(op, left);
+
+			not_enough_space = free_space_shortage(node, op);
+			/* There is not enough free space in @node, but
+			   may be, there is enough free space in
+			   @left. Various balancing decisions are valid here.
+			   The same for the shifiting to the right.
+			 */
+		}
+	}
+	/* If there still is not enough space, shift to the right */
+	if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
+		carry_node *right;
+
+		right = find_right_neighbor(op, doing);
+		if (IS_ERR(right)) {
+			warning("nikita-1065",
+				"Error accessing right neighbor: %li",
+				PTR_ERR(right));
+		} else if (right != NULL) {
+			/* node containing insertion point, and its right
+			   neighbor node are write locked by now.
+
+			   shift everything possible on the right of but
+			   excluding insertion coord into the right neighbor
+			 */
+			result = carry_shift_data(RIGHT_SIDE, coord,
+						  reiser4_carry_real(right),
+						  doing, todo,
+						  flags & COPI_GO_RIGHT);
+			/* reget node from coord: shift_right() might move
+			   insertion coord to the right neighbor */
+			node = sync_op(op, right);
+			not_enough_space = free_space_shortage(node, op);
+		}
+	}
+	/* If there is still not enough space, allocate new node(s).
+
+	   We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
+	   the carry operation flags (currently this is needed during flush
+	   only).
+	 */
+	for (blk_alloc = 0;
+	     not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
+	     !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
+		carry_node *fresh;	/* new node we are allocating */
+		coord_t coord_shadow;	/* remembered insertion point before
+					 * shifting data into new node */
+		carry_node *node_shadow;	/* remembered insertion node before
+						 * shifting */
+		unsigned int gointo;	/* whether insertion point should move
+					 * into newly allocated node */
+
+		/* allocate new node on the right of @node. Znode and disk
+		   fake block number for new node are allocated.
+
+		   add_new_znode() posts carry operation COP_INSERT with
+		   COPT_CHILD option to the parent level to add
+		   pointer to newly created node to its parent.
+
+		   Subtle point: if several new nodes are required to complete
+		   insertion operation at this level, they will be inserted
+		   into their parents in the order of creation, which means
+		   that @node will be valid "cookie" at the time of insertion.
+
+		 */
+		fresh = add_new_znode(node, op->node, doing, todo);
+		if (IS_ERR(fresh))
+			return PTR_ERR(fresh);
+
+		/* Try to shift into new node. */
+		result = lock_carry_node(doing, fresh);
+		zput(reiser4_carry_real(fresh));
+		if (result != 0) {
+			warning("nikita-947",
+				"Cannot lock new node: %i", result);
+			return result;
+		}
+
+		/* both nodes are write locked by now.
+
+		   shift everything possible on the right of and
+		   including insertion coord into the right neighbor.
+		 */
+		coord_dup(&coord_shadow, op->u.insert.d->coord);
+		node_shadow = op->node;
+		/* move insertion point into newly created node if:
+
+		   . insertion point is rightmost in the source node, or
+		   . this is not the first node we are allocating in a row.
+		 */
+		gointo =
+		    (blk_alloc > 0) ||
+		    coord_is_after_rightmost(op->u.insert.d->coord);
+
+		if (gointo &&
+		    op->op == COP_PASTE &&
+		    coord_is_existing_item(op->u.insert.d->coord) &&
+		    is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) {
+			/* paste into solid (atomic) item, which can contain
+			   only one unit, so we need to shift it right, where
+			   insertion point supposed to be */
+
+			assert("edward-1444", op->u.insert.d->data->iplug ==
+			       item_plugin_by_id(STATIC_STAT_DATA_ID));
+			assert("edward-1445",
+			       op->u.insert.d->data->length >
+			       node_plugin_by_node(coord->node)->free_space
+			       (coord->node));
+
+			op->u.insert.d->coord->between = BEFORE_UNIT;
+		}
+
+		result = carry_shift_data(RIGHT_SIDE, coord,
+					  reiser4_carry_real(fresh),
+					  doing, todo, gointo);
+		/* if insertion point was actually moved into new node,
+		   update carry node pointer in operation. */
+		node = sync_op(op, fresh);
+		not_enough_space = free_space_shortage(node, op);
+		if ((not_enough_space > 0) && (node != coord_shadow.node)) {
+			/* there is not enough free in new node. Shift
+			   insertion point back to the @shadow_node so that
+			   next new node would be inserted between
+			   @shadow_node and @fresh.
+			 */
+			coord_normalize(&coord_shadow);
+			coord_dup(coord, &coord_shadow);
+			node = coord->node;
+			op->node = node_shadow;
+			if (1 || (flags & COPI_STEP_BACK)) {
+				/* still not enough space?! Maybe there is
+				   enough space in the source node (i.e., node
+				   data are moved from) now.
+				 */
+				not_enough_space =
+				    free_space_shortage(node, op);
+			}
+		}
+	}
+	if (not_enough_space > 0) {
+		if (!(flags & COPI_DONT_ALLOCATE))
+			warning("nikita-948", "Cannot insert new item");
+		result = -E_NODE_FULL;
+	}
+	assert("nikita-1622", ergo(result == 0,
+				  reiser4_carry_real(op->node) == coord->node));
+	assert("nikita-2616", coord == op->u.insert.d->coord);
+	if (result == 0)
+		result = make_space_tail(op, doing, orig_node);
+	return result;
+}
+
+/* insert_paste_common() - common part of insert and paste operations
+
+   This function performs common part of COP_INSERT and COP_PASTE.
+
+   There are two ways in which insertion/paste can be requested:
+
+    . by directly supplying reiser4_item_data. In this case, op ->
+    u.insert.type is set to COPT_ITEM_DATA.
+
+    . by supplying child pointer to which is to inserted into parent. In this
+    case op -> u.insert.type == COPT_CHILD.
+
+    . by supplying key of new item/unit. This is currently only used during
+    extent insertion
+
+   This is required, because when new node is allocated we don't know at what
+   position pointer to it is to be stored in the parent. Actually, we don't
+   even know what its parent will be, because parent can be re-balanced
+   concurrently and new node re-parented, and because parent can be full and
+   pointer to the new node will go into some other node.
+
+   insert_paste_common() resolves pointer to child node into position in the
+   parent by calling find_new_child_coord(), that fills
+   reiser4_item_data. After this, insertion/paste proceeds uniformly.
+
+   Another complication is with finding free space during pasting. It may
+   happen that while shifting items to the neighbors and newly allocated
+   nodes, insertion coord can no longer be in the item we wanted to paste
+   into. At this point, paste becomes (morphs) into insert. Moreover free
+   space analysis has to be repeated, because amount of space required for
+   insertion is different from that of paste (item header overhead, etc).
+
+   This function "unifies" different insertion modes (by resolving child
+   pointer or key into insertion coord), and then calls make_space() to free
+   enough space in the node by shifting data to the left and right and by
+   allocating new nodes if necessary. Carry operation knows amount of space
+   required for its completion. After enough free space is obtained, caller of
+   this function (carry_{insert,paste,etc.}) performs actual insertion/paste
+   by calling item plugin method.
+
+*/
+static int insert_paste_common(carry_op * op	/* carry operation being
+						 * performed */ ,
+			       carry_level * doing /* current carry level */ ,
+			       carry_level * todo /* next carry level */ ,
+			       carry_insert_data * cdata	/* pointer to
+								 * cdata */ ,
+			       coord_t * coord /* insertion/paste coord */ ,
+			       reiser4_item_data * data	/* data to be
+							 * inserted/pasted */ )
+{
+	assert("nikita-981", op != NULL);
+	assert("nikita-980", todo != NULL);
+	assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
+	       || (op->op == COP_EXTENT));
+
+	if (op->u.insert.type == COPT_PASTE_RESTARTED) {
+		/* nothing to do. Fall through to make_space(). */
+		;
+	} else if (op->u.insert.type == COPT_KEY) {
+		node_search_result intra_node;
+		znode *node;
+		/* Problem with doing batching at the lowest level, is that
+		   operations here are given by coords where modification is
+		   to be performed, and one modification can invalidate coords
+		   of all following operations.
+
+		   So, we are implementing yet another type for operation that
+		   will use (the only) "locator" stable across shifting of
+		   data between nodes, etc.: key (COPT_KEY).
+
+		   This clause resolves key to the coord in the node.
+
+		   But node can change also. Probably some pieces have to be
+		   added to the lock_carry_node(), to lock node by its key.
+
+		 */
+		/* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
+		   if you need something else. */
+		op->u.insert.d->coord = coord;
+		node = reiser4_carry_real(op->node);
+		intra_node = node_plugin_by_node(node)->lookup
+		    (node, op->u.insert.d->key, FIND_EXACT,
+		     op->u.insert.d->coord);
+		if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
+			warning("nikita-1715", "Intra node lookup failure: %i",
+				intra_node);
+			return intra_node;
+		}
+	} else if (op->u.insert.type == COPT_CHILD) {
+		/* if we are asked to insert pointer to the child into
+		   internal node, first convert pointer to the child into
+		   coord within parent node.
+		 */
+		znode *child;
+		int result;
+
+		op->u.insert.d = cdata;
+		op->u.insert.d->coord = coord;
+		op->u.insert.d->data = data;
+		op->u.insert.d->coord->node = reiser4_carry_real(op->node);
+		result = find_new_child_coord(op);
+		child = reiser4_carry_real(op->u.insert.child);
+		if (result != NS_NOT_FOUND) {
+			warning("nikita-993",
+				"Cannot find a place for child pointer: %i",
+				result);
+			return result;
+		}
+		/* This only happens when we did multiple insertions at
+		   the previous level, trying to insert single item and
+		   it so happened, that insertion of pointers to all new
+		   nodes before this one already caused parent node to
+		   split (may be several times).
+
+		   I am going to come up with better solution.
+
+		   You are not expected to understand this.
+		   -- v6root/usr/sys/ken/slp.c
+
+		   Basically, what happens here is the following: carry came
+		   to the parent level and is about to insert internal item
+		   pointing to the child node that it just inserted in the
+		   level below. Position where internal item is to be inserted
+		   was found by find_new_child_coord() above, but node of the
+		   current carry operation (that is, parent node of child
+		   inserted on the previous level), was determined earlier in
+		   the lock_carry_level/lock_carry_node. It could so happen
+		   that other carry operations already performed on the parent
+		   level already split parent node, so that insertion point
+		   moved into another node. Handle this by creating new carry
+		   node for insertion point if necessary.
+		 */
+		if (reiser4_carry_real(op->node) !=
+		    op->u.insert.d->coord->node) {
+			pool_ordering direction;
+			znode *z1;
+			znode *z2;
+			reiser4_key k1;
+			reiser4_key k2;
+
+			/*
+			 * determine in what direction insertion point
+			 * moved. Do this by comparing delimiting keys.
+			 */
+			z1 = op->u.insert.d->coord->node;
+			z2 = reiser4_carry_real(op->node);
+			if (keyle(leftmost_key_in_node(z1, &k1),
+				  leftmost_key_in_node(z2, &k2)))
+				/* insertion point moved to the left */
+				direction = POOLO_BEFORE;
+			else
+				/* insertion point moved to the right */
+				direction = POOLO_AFTER;
+
+			op->node = reiser4_add_carry_skip(doing,
+							  direction, op->node);
+			if (IS_ERR(op->node))
+				return PTR_ERR(op->node);
+			op->node->node = op->u.insert.d->coord->node;
+			op->node->free = 1;
+			result = lock_carry_node(doing, op->node);
+			if (result != 0)
+				return result;
+		}
+
+		/*
+		 * set up key of an item being inserted: we are inserting
+		 * internal item and its key is (by the very definition of
+		 * search tree) is leftmost key in the child node.
+		 */
+		write_lock_dk(znode_get_tree(child));
+		op->u.insert.d->key = leftmost_key_in_node(child,
+							   znode_get_ld_key(child));
+		write_unlock_dk(znode_get_tree(child));
+		op->u.insert.d->data->arg = op->u.insert.brother;
+	} else {
+		assert("vs-243", op->u.insert.d->coord != NULL);
+		op->u.insert.d->coord->node = reiser4_carry_real(op->node);
+	}
+
+	/* find free space. */
+	return make_space(op, doing, todo);
+}
+
+/* handle carry COP_INSERT operation.
+
+   Insert new item into node. New item can be given in one of two ways:
+
+   - by passing &tree_coord and &reiser4_item_data as part of @op. This is
+   only applicable at the leaf/twig level.
+
+   - by passing a child node pointer to which is to be inserted by this
+   operation.
+
+*/
+static int carry_insert(carry_op * op /* operation to perform */ ,
+			carry_level * doing	/* queue of operations @op
+						 * is part of */ ,
+			carry_level * todo	/* queue where new operations
+						 * are accumulated */ )
+{
+	znode *node;
+	carry_insert_data cdata;
+	coord_t coord;
+	reiser4_item_data data;
+	carry_plugin_info info;
+	int result;
+
+	assert("nikita-1036", op != NULL);
+	assert("nikita-1037", todo != NULL);
+	assert("nikita-1038", op->op == COP_INSERT);
+
+	coord_init_zero(&coord);
+
+	/* perform common functionality of insert and paste. */
+	result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
+	if (result != 0)
+		return result;
+
+	node = op->u.insert.d->coord->node;
+	assert("nikita-1039", node != NULL);
+	assert("nikita-1040", node_plugin_by_node(node) != NULL);
+
+	assert("nikita-949",
+	       space_needed_for_op(node, op) <= znode_free_space(node));
+
+	/* ask node layout to create new item. */
+	info.doing = doing;
+	info.todo = todo;
+	result = node_plugin_by_node(node)->create_item
+	    (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
+	     &info);
+	doing->restartable = 0;
+	znode_make_dirty(node);
+
+	return result;
+}
+
+/*
+ * Flow insertion code. COP_INSERT_FLOW is special tree operation that is
+ * supplied with a "flow" (that is, a stream of data) and inserts it into tree
+ * by slicing into multiple items.
+ */
+
+#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
+#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
+#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
+
+static size_t item_data_overhead(carry_op * op)
+{
+	if (flow_insert_data(op)->iplug->b.estimate == NULL)
+		return 0;
+	return (flow_insert_data(op)->iplug->b.
+		estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
+		flow_insert_data(op)->length);
+}
+
+/* FIXME-VS: this is called several times during one make_flow_for_insertion
+   and it will always return the same result. Some optimization could be made
+   by calculating this value once at the beginning and passing it around. That
+   would reduce some flexibility in future changes
+*/
+static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
+static size_t flow_insertion_overhead(carry_op * op)
+{
+	znode *node;
+	size_t insertion_overhead;
+
+	node = flow_insert_point(op)->node;
+	insertion_overhead = 0;
+	if (node->nplug->item_overhead &&
+	    !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
+		       flow_insert_data(op)))
+		insertion_overhead =
+		    node->nplug->item_overhead(node, NULL) +
+			item_data_overhead(op);
+	return insertion_overhead;
+}
+
+/* how many bytes of flow does fit to the node */
+static int what_can_fit_into_node(carry_op * op)
+{
+	size_t free, overhead;
+
+	overhead = flow_insertion_overhead(op);
+	free = znode_free_space(flow_insert_point(op)->node);
+	if (free <= overhead)
+		return 0;
+	free -= overhead;
+	/* FIXME: flow->length is loff_t only to not get overflowed in case of expandign truncate */
+	if (free < op->u.insert_flow.flow->length)
+		return free;
+	return (int)op->u.insert_flow.flow->length;
+}
+
+/* in make_space_for_flow_insertion we need to check either whether whole flow
+   fits into a node or whether minimal fraction of flow fits into a node */
+static int enough_space_for_whole_flow(carry_op * op)
+{
+	return (unsigned)what_can_fit_into_node(op) ==
+	    op->u.insert_flow.flow->length;
+}
+
+#define MIN_FLOW_FRACTION 1
+static int enough_space_for_min_flow_fraction(carry_op * op)
+{
+	assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
+
+	return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
+}
+
+/* this returns 0 if left neighbor was obtained successfully and everything
+   upto insertion point including it were shifted and left neighbor still has
+   some free space to put minimal fraction of flow into it */
+static int
+make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
+{
+	carry_node *left;
+	znode *orig;
+
+	left = find_left_neighbor(op, doing);
+	if (unlikely(IS_ERR(left))) {
+		warning("vs-899",
+			"make_space_by_shift_left: "
+			"error accessing left neighbor: %li", PTR_ERR(left));
+		return 1;
+	}
+	if (left == NULL)
+		/* left neighbor either does not exist or is unformatted
+		   node */
+		return 1;
+
+	orig = flow_insert_point(op)->node;
+	/* try to shift content of node @orig from its head upto insert point
+	   including insertion point into the left neighbor */
+	carry_shift_data(LEFT_SIDE, flow_insert_point(op),
+			 reiser4_carry_real(left), doing, todo,
+			 1 /* including insert point */);
+	if (reiser4_carry_real(left) != flow_insert_point(op)->node) {
+		/* insertion point did not move */
+		return 1;
+	}
+
+	/* insertion point is set after last item in the node */
+	assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
+
+	if (!enough_space_for_min_flow_fraction(op)) {
+		/* insertion point node does not have enough free space to put
+		   even minimal portion of flow into it, therefore, move
+		   insertion point back to orig node (before first item) */
+		coord_init_before_first_item(flow_insert_point(op), orig);
+		return 1;
+	}
+
+	/* part of flow is to be written to the end of node */
+	op->node = left;
+	return 0;
+}
+
+/* this returns 0 if right neighbor was obtained successfully and everything to
+   the right of insertion point was shifted to it and node got enough free
+   space to put minimal fraction of flow into it */
+static int
+make_space_by_shift_right(carry_op * op, carry_level * doing,
+			  carry_level * todo)
+{
+	carry_node *right;
+
+	right = find_right_neighbor(op, doing);
+	if (unlikely(IS_ERR(right))) {
+		warning("nikita-1065", "shift_right_excluding_insert_point: "
+			"error accessing right neighbor: %li", PTR_ERR(right));
+		return 1;
+	}
+	if (right) {
+		/* shift everything possible on the right of but excluding
+		   insertion coord into the right neighbor */
+		carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
+				 reiser4_carry_real(right), doing, todo,
+				 0 /* not including insert point */);
+	} else {
+		/* right neighbor either does not exist or is unformatted
+		   node */
+		;
+	}
+	if (coord_is_after_rightmost(flow_insert_point(op))) {
+		if (enough_space_for_min_flow_fraction(op)) {
+			/* part of flow is to be written to the end of node */
+			return 0;
+		}
+	}
+
+	/* new node is to be added if insert point node did not get enough
+	   space for whole flow */
+	return 1;
+}
+
+/* this returns 0 when insert coord is set at the node end and fraction of flow
+   fits into that node */
+static int
+make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
+{
+	int result;
+	znode *node;
+	carry_node *new;
+
+	node = flow_insert_point(op)->node;
+
+	if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
+		return RETERR(-E_NODE_FULL);
+	/* add new node after insert point node */
+	new = add_new_znode(node, op->node, doing, todo);
+	if (unlikely(IS_ERR(new))) {
+		return PTR_ERR(new);
+	}
+	result = lock_carry_node(doing, new);
+	zput(reiser4_carry_real(new));
+	if (unlikely(result)) {
+		return result;
+	}
+	op->u.insert_flow.new_nodes++;
+	if (!coord_is_after_rightmost(flow_insert_point(op))) {
+		carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
+				 reiser4_carry_real(new), doing, todo,
+				 0 /* not including insert point */);
+		assert("vs-901",
+		       coord_is_after_rightmost(flow_insert_point(op)));
+
+		if (enough_space_for_min_flow_fraction(op)) {
+			return 0;
+		}
+		if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
+			return RETERR(-E_NODE_FULL);
+
+		/* add one more new node */
+		new = add_new_znode(node, op->node, doing, todo);
+		if (unlikely(IS_ERR(new))) {
+			return PTR_ERR(new);
+		}
+		result = lock_carry_node(doing, new);
+		zput(reiser4_carry_real(new));
+		if (unlikely(result)) {
+			return result;
+		}
+		op->u.insert_flow.new_nodes++;
+	}
+
+	/* move insertion point to new node */
+	coord_init_before_first_item(flow_insert_point(op),
+				     reiser4_carry_real(new));
+	op->node = new;
+	return 0;
+}
+
+static int
+make_space_for_flow_insertion(carry_op * op, carry_level * doing,
+			      carry_level * todo)
+{
+	__u32 flags = op->u.insert_flow.flags;
+
+	if (enough_space_for_whole_flow(op)) {
+		/* whole flow fits into insert point node */
+		return 0;
+	}
+
+	if (!(flags & COPI_DONT_SHIFT_LEFT)
+	    && (make_space_by_shift_left(op, doing, todo) == 0)) {
+		/* insert point is shifted to left neighbor of original insert
+		   point node and is set after last unit in that node. It has
+		   enough space to fit at least minimal fraction of flow. */
+		return 0;
+	}
+
+	if (enough_space_for_whole_flow(op)) {
+		/* whole flow fits into insert point node */
+		return 0;
+	}
+
+	if (!(flags & COPI_DONT_SHIFT_RIGHT)
+	    && (make_space_by_shift_right(op, doing, todo) == 0)) {
+		/* insert point is still set to the same node, but there is
+		   nothing to the right of insert point. */
+		return 0;
+	}
+
+	if (enough_space_for_whole_flow(op)) {
+		/* whole flow fits into insert point node */
+		return 0;
+	}
+
+	return make_space_by_new_nodes(op, doing, todo);
+}
+
+/* implements COP_INSERT_FLOW operation */
+static int
+carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
+{
+	int result;
+	flow_t *f;
+	coord_t *insert_point;
+	node_plugin *nplug;
+	carry_plugin_info info;
+	znode *orig_node;
+	lock_handle *orig_lh;
+
+	f = op->u.insert_flow.flow;
+	result = 0;
+
+	/* carry system needs this to work */
+	info.doing = doing;
+	info.todo = todo;
+
+	orig_node = flow_insert_point(op)->node;
+	orig_lh = doing->tracked;
+
+	while (f->length) {
+		result = make_space_for_flow_insertion(op, doing, todo);
+		if (result)
+			break;
+
+		insert_point = flow_insert_point(op);
+		nplug = node_plugin_by_node(insert_point->node);
+
+		/* compose item data for insertion/pasting */
+		flow_insert_data(op)->data = f->data;
+		flow_insert_data(op)->length = what_can_fit_into_node(op);
+
+		if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
+			/* insert point is set to item of file we are writing to and we have to append to it */
+			assert("vs-903", insert_point->between == AFTER_UNIT);
+			nplug->change_item_size(insert_point,
+						flow_insert_data(op)->length);
+			flow_insert_data(op)->iplug->b.paste(insert_point,
+							     flow_insert_data
+							     (op), &info);
+		} else {
+			/* new item must be inserted */
+			pos_in_node_t new_pos;
+			flow_insert_data(op)->length += item_data_overhead(op);
+
+			/* FIXME-VS: this is because node40_create_item changes
+			   insert_point for obscure reasons */
+			switch (insert_point->between) {
+			case AFTER_ITEM:
+				new_pos = insert_point->item_pos + 1;
+				break;
+			case EMPTY_NODE:
+				new_pos = 0;
+				break;
+			case BEFORE_ITEM:
+				assert("vs-905", insert_point->item_pos == 0);
+				new_pos = 0;
+				break;
+			default:
+				impossible("vs-906",
+					   "carry_insert_flow: invalid coord");
+				new_pos = 0;
+				break;
+			}
+
+			nplug->create_item(insert_point, &f->key,
+					   flow_insert_data(op), &info);
+			coord_set_item_pos(insert_point, new_pos);
+		}
+		coord_init_after_item_end(insert_point);
+		doing->restartable = 0;
+		znode_make_dirty(insert_point->node);
+
+		move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
+	}
+
+	if (orig_node != flow_insert_point(op)->node) {
+		/* move lock to new insert point */
+		done_lh(orig_lh);
+		init_lh(orig_lh);
+		result =
+		    longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
+					ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
+	}
+
+	return result;
+}
+
+/* implements COP_DELETE operation
+
+   Remove pointer to @op -> u.delete.child from it's parent.
+
+   This function also handles killing of a tree root is last pointer from it
+   was removed. This is complicated by our handling of "twig" level: root on
+   twig level is never killed.
+
+*/
+static int carry_delete(carry_op * op /* operation to be performed */ ,
+			carry_level * doing UNUSED_ARG	/* current carry
+							 * level */ ,
+			carry_level * todo /* next carry level */ )
+{
+	int result;
+	coord_t coord;
+	coord_t coord2;
+	znode *parent;
+	znode *child;
+	carry_plugin_info info;
+	reiser4_tree *tree;
+
+	/*
+	 * This operation is called to delete internal item pointing to the
+	 * child node that was removed by carry from the tree on the previous
+	 * tree level.
+	 */
+
+	assert("nikita-893", op != NULL);
+	assert("nikita-894", todo != NULL);
+	assert("nikita-895", op->op == COP_DELETE);
+
+	coord_init_zero(&coord);
+	coord_init_zero(&coord2);
+
+	parent = reiser4_carry_real(op->node);
+	child = op->u.delete.child ?
+		reiser4_carry_real(op->u.delete.child) : op->node->node;
+	tree = znode_get_tree(child);
+	read_lock_tree(tree);
+
+	/*
+	 * @parent was determined when carry entered parent level
+	 * (lock_carry_level/lock_carry_node). Since then, actual parent of
+	 * @child node could change due to other carry operations performed on
+	 * the parent level. Check for this.
+	 */
+
+	if (znode_parent(child) != parent) {
+		/* NOTE-NIKITA add stat counter for this. */
+		parent = znode_parent(child);
+		assert("nikita-2581", find_carry_node(doing, parent));
+	}
+	read_unlock_tree(tree);
+
+	assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
+
+	/* Twig level horrors: tree should be of height at least 2. So, last
+	   pointer from the root at twig level is preserved even if child is
+	   empty. This is ugly, but so it was architectured.
+	 */
+
+	if (znode_is_root(parent) &&
+	    znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
+	    node_num_items(parent) == 1) {
+		/* Delimiting key manipulations. */
+		write_lock_dk(tree);
+		znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key()));
+		znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key()));
+		ZF_SET(child, JNODE_DKSET);
+		write_unlock_dk(tree);
+
+		/* @child escaped imminent death! */
+		ZF_CLR(child, JNODE_HEARD_BANSHEE);
+		return 0;
+	}
+
+	/* convert child pointer to the coord_t */
+	result = find_child_ptr(parent, child, &coord);
+	if (result != NS_FOUND) {
+		warning("nikita-994", "Cannot find child pointer: %i", result);
+		print_coord_content("coord", &coord);
+		return result;
+	}
+
+	coord_dup(&coord2, &coord);
+	info.doing = doing;
+	info.todo = todo;
+	{
+		/*
+		 * Actually kill internal item: prepare structure with
+		 * arguments for ->cut_and_kill() method...
+		 */
+
+		struct carry_kill_data kdata;
+		kdata.params.from = &coord;
+		kdata.params.to = &coord2;
+		kdata.params.from_key = NULL;
+		kdata.params.to_key = NULL;
+		kdata.params.smallest_removed = NULL;
+		kdata.params.truncate = 1;
+		kdata.flags = op->u.delete.flags;
+		kdata.inode = NULL;
+		kdata.left = NULL;
+		kdata.right = NULL;
+		kdata.buf = NULL;
+		/* ... and call it. */
+		result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
+								   &info);
+	}
+	doing->restartable = 0;
+
+	/* check whether root should be killed violently */
+	if (znode_is_root(parent) &&
+	    /* don't kill roots at and lower than twig level */
+	    znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
+	    node_num_items(parent) == 1) {
+		result = reiser4_kill_tree_root(coord.node);
+	}
+
+	return result < 0 ? : 0;
+}
+
+/* implements COP_CUT opration
+
+   Cuts part or whole content of node.
+
+*/
+static int carry_cut(carry_op * op /* operation to be performed */ ,
+		     carry_level * doing /* current carry level */ ,
+		     carry_level * todo /* next carry level */ )
+{
+	int result;
+	carry_plugin_info info;
+	node_plugin *nplug;
+
+	assert("nikita-896", op != NULL);
+	assert("nikita-897", todo != NULL);
+	assert("nikita-898", op->op == COP_CUT);
+
+	info.doing = doing;
+	info.todo = todo;
+
+	nplug = node_plugin_by_node(reiser4_carry_real(op->node));
+	if (op->u.cut_or_kill.is_cut)
+		result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
+	else
+		result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
+
+	doing->restartable = 0;
+	return result < 0 ? : 0;
+}
+
+/* helper function for carry_paste(): returns true if @op can be continued as
+   paste  */
+static int
+can_paste(coord_t * icoord, const reiser4_key * key,
+	  const reiser4_item_data * data)
+{
+	coord_t circa;
+	item_plugin *new_iplug;
+	item_plugin *old_iplug;
+	int result = 0;		/* to keep gcc shut */
+
+	assert("", icoord->between != AT_UNIT);
+
+	/* obviously, one cannot paste when node is empty---there is nothing
+	   to paste into. */
+	if (node_is_empty(icoord->node))
+		return 0;
+	/* if insertion point is at the middle of the item, then paste */
+	if (!coord_is_between_items(icoord))
+		return 1;
+	coord_dup(&circa, icoord);
+	circa.between = AT_UNIT;
+
+	old_iplug = item_plugin_by_coord(&circa);
+	new_iplug = data->iplug;
+
+	/* check whether we can paste to the item @icoord is "at" when we
+	   ignore ->between field */
+	if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
+		result = 1;
+	} else if (icoord->between == BEFORE_UNIT
+		   || icoord->between == BEFORE_ITEM) {
+		/* otherwise, try to glue to the item at the left, if any */
+		coord_dup(&circa, icoord);
+		if (coord_set_to_left(&circa)) {
+			result = 0;
+			coord_init_before_item(icoord);
+		} else {
+			old_iplug = item_plugin_by_coord(&circa);
+			result = (old_iplug == new_iplug)
+			    && item_can_contain_key(icoord, key, data);
+			if (result) {
+				coord_dup(icoord, &circa);
+				icoord->between = AFTER_UNIT;
+			}
+		}
+	} else if (icoord->between == AFTER_UNIT
+		   || icoord->between == AFTER_ITEM) {
+		coord_dup(&circa, icoord);
+		/* otherwise, try to glue to the item at the right, if any */
+		if (coord_set_to_right(&circa)) {
+			result = 0;
+			coord_init_after_item(icoord);
+		} else {
+			int (*cck) (const coord_t *, const reiser4_key *,
+				    const reiser4_item_data *);
+
+			old_iplug = item_plugin_by_coord(&circa);
+
+			cck = old_iplug->b.can_contain_key;
+			if (cck == NULL)
+				/* item doesn't define ->can_contain_key
+				   method? So it is not expandable. */
+				result = 0;
+			else {
+				result = (old_iplug == new_iplug)
+				    && cck(&circa /*icoord */ , key, data);
+				if (result) {
+					coord_dup(icoord, &circa);
+					icoord->between = BEFORE_UNIT;
+				}
+			}
+		}
+	} else
+		impossible("nikita-2513", "Nothing works");
+	if (result) {
+		if (icoord->between == BEFORE_ITEM) {
+			assert("vs-912", icoord->unit_pos == 0);
+			icoord->between = BEFORE_UNIT;
+		} else if (icoord->between == AFTER_ITEM) {
+			coord_init_after_item_end(icoord);
+		}
+	}
+	return result;
+}
+
+/* implements COP_PASTE operation
+
+   Paste data into existing item. This is complicated by the fact that after
+   we shifted something to the left or right neighbors trying to free some
+   space, item we were supposed to paste into can be in different node than
+   insertion coord. If so, we are no longer doing paste, but insert. See
+   comments in insert_paste_common().
+
+*/
+static int carry_paste(carry_op * op /* operation to be performed */ ,
+		       carry_level * doing UNUSED_ARG	/* current carry
+							 * level */ ,
+		       carry_level * todo /* next carry level */ )
+{
+	znode *node;
+	carry_insert_data cdata;
+	coord_t dcoord;
+	reiser4_item_data data;
+	int result;
+	int real_size;
+	item_plugin *iplug;
+	carry_plugin_info info;
+	coord_t *coord;
+
+	assert("nikita-982", op != NULL);
+	assert("nikita-983", todo != NULL);
+	assert("nikita-984", op->op == COP_PASTE);
+
+	coord_init_zero(&dcoord);
+
+	result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
+	if (result != 0)
+		return result;
+
+	coord = op->u.insert.d->coord;
+
+	/* handle case when op -> u.insert.coord doesn't point to the item
+	   of required type. restart as insert. */
+	if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
+		op->op = COP_INSERT;
+		op->u.insert.type = COPT_PASTE_RESTARTED;
+		result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
+
+		return result;
+	}
+
+	node = coord->node;
+	iplug = item_plugin_by_coord(coord);
+	assert("nikita-992", iplug != NULL);
+
+	assert("nikita-985", node != NULL);
+	assert("nikita-986", node_plugin_by_node(node) != NULL);
+
+	assert("nikita-987",
+	       space_needed_for_op(node, op) <= znode_free_space(node));
+
+	assert("nikita-1286", coord_is_existing_item(coord));
+
+	/*
+	 * if item is expanded as a result of this operation, we should first
+	 * change item size, than call ->b.paste item method. If item is
+	 * shrunk, it should be done other way around: first call ->b.paste
+	 * method, then reduce item size.
+	 */
+
+	real_size = space_needed_for_op(node, op);
+	if (real_size > 0)
+		node->nplug->change_item_size(coord, real_size);
+
+	doing->restartable = 0;
+	info.doing = doing;
+	info.todo = todo;
+
+	result = iplug->b.paste(coord, op->u.insert.d->data, &info);
+
+	if (real_size < 0)
+		node->nplug->change_item_size(coord, real_size);
+
+	/* if we pasted at the beginning of the item, update item's key. */
+	if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
+		node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
+
+	znode_make_dirty(node);
+	return result;
+}
+
+/* handle carry COP_EXTENT operation. */
+static int carry_extent(carry_op * op /* operation to perform */ ,
+			carry_level * doing	/* queue of operations @op
+						 * is part of */ ,
+			carry_level * todo	/* queue where new operations
+						 * are accumulated */ )
+{
+	znode *node;
+	carry_insert_data cdata;
+	coord_t coord;
+	reiser4_item_data data;
+	carry_op *delete_dummy;
+	carry_op *insert_extent;
+	int result;
+	carry_plugin_info info;
+
+	assert("nikita-1751", op != NULL);
+	assert("nikita-1752", todo != NULL);
+	assert("nikita-1753", op->op == COP_EXTENT);
+
+	/* extent insertion overview:
+
+	   extents live on the TWIG LEVEL, which is level one above the leaf
+	   one. This complicates extent insertion logic somewhat: it may
+	   happen (and going to happen all the time) that in logical key
+	   ordering extent has to be placed between items I1 and I2, located
+	   at the leaf level, but I1 and I2 are in the same formatted leaf
+	   node N1. To insert extent one has to
+
+	   (1) reach node N1 and shift data between N1, its neighbors and
+	   possibly newly allocated nodes until I1 and I2 fall into different
+	   nodes. Since I1 and I2 are still neighboring items in logical key
+	   order, they will be necessary utmost items in their respective
+	   nodes.
+
+	   (2) After this new extent item is inserted into node on the twig
+	   level.
+
+	   Fortunately this process can reuse almost all code from standard
+	   insertion procedure (viz. make_space() and insert_paste_common()),
+	   due to the following observation: make_space() only shifts data up
+	   to and excluding or including insertion point. It never
+	   "over-moves" through insertion point. Thus, one can use
+	   make_space() to perform step (1). All required for this is just to
+	   instruct free_space_shortage() to keep make_space() shifting data
+	   until insertion point is at the node border.
+
+	 */
+
+	/* perform common functionality of insert and paste. */
+	result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
+	if (result != 0)
+		return result;
+
+	node = op->u.extent.d->coord->node;
+	assert("nikita-1754", node != NULL);
+	assert("nikita-1755", node_plugin_by_node(node) != NULL);
+	assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
+
+	/* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
+	   extent fits between items. */
+
+	info.doing = doing;
+	info.todo = todo;
+
+	/* there is another complication due to placement of extents on the
+	   twig level: extents are "rigid" in the sense that key-range
+	   occupied by extent cannot grow indefinitely to the right as it is
+	   for the formatted leaf nodes. Because of this when search finds two
+	   adjacent extents on the twig level, it has to "drill" to the leaf
+	   level, creating new node. Here we are removing this node.
+	 */
+	if (node_is_empty(node)) {
+		delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
+		if (IS_ERR(delete_dummy))
+			return PTR_ERR(delete_dummy);
+		delete_dummy->u.delete.child = NULL;
+		delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
+		ZF_SET(node, JNODE_HEARD_BANSHEE);
+	}
+
+	/* proceed with inserting extent item into parent. We are definitely
+	   inserting rather than pasting if we get that far. */
+	insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
+	if (IS_ERR(insert_extent))
+		/* @delete_dummy will be automatically destroyed on the level
+		   exiting  */
+		return PTR_ERR(insert_extent);
+	/* NOTE-NIKITA insertion by key is simplest option here. Another
+	   possibility is to insert on the left or right of already existing
+	   item.
+	 */
+	insert_extent->u.insert.type = COPT_KEY;
+	insert_extent->u.insert.d = op->u.extent.d;
+	assert("nikita-1719", op->u.extent.d->key != NULL);
+	insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
+	insert_extent->u.insert.flags =
+	    znode_get_tree(node)->carry.new_extent_flags;
+
+	/*
+	 * if carry was asked to track lock handle we should actually track
+	 * lock handle on the twig node rather than on the leaf where
+	 * operation was started from. Transfer tracked lock handle.
+	 */
+	if (doing->track_type) {
+		assert("nikita-3242", doing->tracked != NULL);
+		assert("nikita-3244", todo->tracked == NULL);
+		todo->tracked = doing->tracked;
+		todo->track_type = CARRY_TRACK_NODE;
+		doing->tracked = NULL;
+		doing->track_type = 0;
+	}
+
+	return 0;
+}
+
+/* update key in @parent between pointers to @left and @right.
+
+   Find coords of @left and @right and update delimiting key between them.
+   This is helper function called by carry_update(). Finds position of
+   internal item involved. Updates item key. Updates delimiting keys of child
+   nodes involved.
+*/
+static int update_delimiting_key(znode * parent	/* node key is updated
+						 * in */ ,
+				 znode * left /* child of @parent */ ,
+				 znode * right /* child of @parent */ ,
+				 carry_level * doing	/* current carry
+							 * level */ ,
+				 carry_level * todo	/* parent carry
+							 * level */ ,
+				 const char **error_msg	/* place to
+							 * store error
+							 * message */ )
+{
+	coord_t left_pos;
+	coord_t right_pos;
+	int result;
+	reiser4_key ldkey;
+	carry_plugin_info info;
+
+	assert("nikita-1177", right != NULL);
+	/* find position of right left child in a parent */
+	result = find_child_ptr(parent, right, &right_pos);
+	if (result != NS_FOUND) {
+		*error_msg = "Cannot find position of right child";
+		return result;
+	}
+
+	if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
+		/* find position of the left child in a parent */
+		result = find_child_ptr(parent, left, &left_pos);
+		if (result != NS_FOUND) {
+			*error_msg = "Cannot find position of left child";
+			return result;
+		}
+		assert("nikita-1355", left_pos.node != NULL);
+	} else
+		left_pos.node = NULL;
+
+	/* check that they are separated by exactly one key and are basically
+	   sane */
+	if (REISER4_DEBUG) {
+		if ((left_pos.node != NULL)
+		    && !coord_is_existing_unit(&left_pos)) {
+			*error_msg = "Left child is bastard";
+			return RETERR(-EIO);
+		}
+		if (!coord_is_existing_unit(&right_pos)) {
+			*error_msg = "Right child is bastard";
+			return RETERR(-EIO);
+		}
+		if (left_pos.node != NULL &&
+		    !coord_are_neighbors(&left_pos, &right_pos)) {
+			*error_msg = "Children are not direct siblings";
+			return RETERR(-EIO);
+		}
+	}
+	*error_msg = NULL;
+
+	info.doing = doing;
+	info.todo = todo;
+
+	/*
+	 * If child node is not empty, new key of internal item is a key of
+	 * leftmost item in the child node. If the child is empty, take its
+	 * right delimiting key as a new key of the internal item. Precise key
+	 * in the latter case is not important per se, because the child (and
+	 * the internal item) are going to be killed shortly anyway, but we
+	 * have to preserve correct order of keys in the parent node.
+	 */
+
+	if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
+		leftmost_key_in_node(right, &ldkey);
+	else {
+		read_lock_dk(znode_get_tree(parent));
+		ldkey = *znode_get_rd_key(right);
+		read_unlock_dk(znode_get_tree(parent));
+	}
+	node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
+	doing->restartable = 0;
+	znode_make_dirty(parent);
+	return 0;
+}
+
+/* implements COP_UPDATE opration
+
+   Update delimiting keys.
+
+*/
+static int carry_update(carry_op * op /* operation to be performed */ ,
+			carry_level * doing /* current carry level */ ,
+			carry_level * todo /* next carry level */ )
+{
+	int result;
+	carry_node *missing UNUSED_ARG;
+	znode *left;
+	znode *right;
+	carry_node *lchild;
+	carry_node *rchild;
+	const char *error_msg;
+	reiser4_tree *tree;
+
+	/*
+	 * This operation is called to update key of internal item. This is
+	 * necessary when carry shifted of cut data on the child
+	 * level. Arguments of this operation are:
+	 *
+	 *     @right --- child node. Operation should update key of internal
+	 *     item pointing to @right.
+	 *
+	 *     @left --- left neighbor of @right. This parameter is optional.
+	 */
+
+	assert("nikita-902", op != NULL);
+	assert("nikita-903", todo != NULL);
+	assert("nikita-904", op->op == COP_UPDATE);
+
+	lchild = op->u.update.left;
+	rchild = op->node;
+
+	if (lchild != NULL) {
+		assert("nikita-1001", lchild->parent);
+		assert("nikita-1003", !lchild->left);
+		left = reiser4_carry_real(lchild);
+	} else
+		left = NULL;
+
+	tree = znode_get_tree(rchild->node);
+	read_lock_tree(tree);
+	right = znode_parent(rchild->node);
+	read_unlock_tree(tree);
+
+	if (right != NULL) {
+		result = update_delimiting_key(right,
+					       lchild ? lchild->node : NULL,
+					       rchild->node,
+					       doing, todo, &error_msg);
+	} else {
+		error_msg = "Cannot find node to update key in";
+		result = RETERR(-EIO);
+	}
+	/* operation will be reposted to the next level by the
+	   ->update_item_key() method of node plugin, if necessary. */
+
+	if (result != 0) {
+		warning("nikita-999", "Error updating delimiting key: %s (%i)",
+			error_msg ? : "", result);
+	}
+	return result;
+}
+
+/* move items from @node during carry */
+static int carry_shift_data(sideof side /* in what direction to move data */ ,
+			    coord_t * insert_coord	/* coord where new item
+							 * is to be inserted */ ,
+			    znode * node /* node which data are moved from */ ,
+			    carry_level * doing /* active carry queue */ ,
+			    carry_level * todo	/* carry queue where new
+						 * operations are to be put
+						 * in */ ,
+			    unsigned int including_insert_coord_p	/* true if
+									 * @insertion_coord
+									 * can be moved */ )
+{
+	int result;
+	znode *source;
+	carry_plugin_info info;
+	node_plugin *nplug;
+
+	source = insert_coord->node;
+
+	info.doing = doing;
+	info.todo = todo;
+
+	nplug = node_plugin_by_node(node);
+	result = nplug->shift(insert_coord, node,
+			      (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
+			      (int)including_insert_coord_p, &info);
+	/* the only error ->shift() method of node plugin can return is
+	   -ENOMEM due to carry node/operation allocation. */
+	assert("nikita-915", result >= 0 || result == -ENOMEM);
+	if (result > 0) {
+		/*
+		 * if some number of bytes was actually shifted, mark nodes
+		 * dirty, and carry level as non-restartable.
+		 */
+		doing->restartable = 0;
+		znode_make_dirty(source);
+		znode_make_dirty(node);
+	}
+
+	assert("nikita-2077", coord_check(insert_coord));
+	return 0;
+}
+
+typedef carry_node *(*carry_iterator) (carry_node * node);
+static carry_node *find_dir_carry(carry_node * node, carry_level * level,
+				  carry_iterator iterator);
+
+static carry_node *pool_level_list_prev(carry_node *node)
+{
+	return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
+}
+
+/* look for the left neighbor of given carry node in a carry queue.
+
+   This is used by find_left_neighbor(), but I am not sure that this
+   really gives any advantage. More statistics required.
+
+*/
+carry_node *find_left_carry(carry_node * node	/* node to find left neighbor
+						 * of */ ,
+			    carry_level * level /* level to scan */ )
+{
+	return find_dir_carry(node, level,
+			      (carry_iterator) pool_level_list_prev);
+}
+
+static carry_node *pool_level_list_next(carry_node *node)
+{
+	return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
+}
+
+/* look for the right neighbor of given carry node in a
+   carry queue.
+
+   This is used by find_right_neighbor(), but I am not sure that this
+   really gives any advantage. More statistics required.
+
+*/
+carry_node *find_right_carry(carry_node * node	/* node to find right neighbor
+						 * of */ ,
+			     carry_level * level /* level to scan */ )
+{
+	return find_dir_carry(node, level,
+			      (carry_iterator) pool_level_list_next);
+}
+
+/* look for the left or right neighbor of given carry node in a carry
+   queue.
+
+   Helper function used by find_{left|right}_carry().
+*/
+static carry_node *find_dir_carry(carry_node * node	/* node to start scanning
+							 * from */ ,
+				  carry_level * level /* level to scan */ ,
+				  carry_iterator iterator	/* operation to
+								 * move to the next
+								 * node */ )
+{
+	carry_node *neighbor;
+
+	assert("nikita-1059", node != NULL);
+	assert("nikita-1060", level != NULL);
+
+	/* scan list of carry nodes on this list dir-ward, skipping all
+	   carry nodes referencing the same znode. */
+	neighbor = node;
+	while (1) {
+		neighbor = iterator(neighbor);
+		if (carry_node_end(level, neighbor))
+			/* list head is reached */
+			return NULL;
+		if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
+			return neighbor;
+	}
+}
+
+/*
+ * Memory reservation estimation.
+ *
+ * Carry process proceeds through tree levels upwards. Carry assumes that it
+ * takes tree in consistent state (e.g., that search tree invariants hold),
+ * and leaves tree consistent after it finishes. This means that when some
+ * error occurs carry cannot simply return if there are pending carry
+ * operations. Generic solution for this problem is carry-undo either as
+ * transaction manager feature (requiring checkpoints and isolation), or
+ * through some carry specific mechanism.
+ *
+ * Our current approach is to panic if carry hits an error while tree is
+ * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
+ * this "memory reservation" mechanism was added.
+ *
+ * Memory reservation is implemented by perthread-pages.diff patch from
+ * core-patches. Its API is defined in <linux/gfp.h>
+ *
+ *     int  perthread_pages_reserve(int nrpages, gfp_t gfp);
+ *     void perthread_pages_release(int nrpages);
+ *     int  perthread_pages_count(void);
+ *
+ * carry estimates its worst case memory requirements at the entry, reserved
+ * enough memory, and released unused pages before returning.
+ *
+ * Code below estimates worst case memory requirements for a given carry
+ * queue. This is dome by summing worst case memory requirements for each
+ * operation in the queue.
+ *
+ */
+
+/*
+ * Memory memory requirements of many operations depends on the tree
+ * height. For example, item insertion requires new node to be inserted at
+ * each tree level in the worst case. What tree height should be used for
+ * estimation? Current tree height is wrong, because tree height can change
+ * between the time when estimation was done and the time when operation is
+ * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
+ * is also not desirable, because it would lead to the huge over-estimation
+ * all the time. Plausible solution is "capped tree height": if current tree
+ * height is less than some TREE_HEIGHT_CAP constant, capped tree height is
+ * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is
+ * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely
+ * to be increased even more during short interval of time.
+ */
+#define TREE_HEIGHT_CAP (5)
+
+/* return capped tree height for the @tree. See comment above. */
+static int cap_tree_height(reiser4_tree * tree)
+{
+	return max_t(int, tree->height, TREE_HEIGHT_CAP);
+}
+
+/* return capped tree height for the current tree. */
+static int capped_height(void)
+{
+	return cap_tree_height(current_tree);
+}
+
+/* return number of pages required to store given number of bytes */
+static int bytes_to_pages(int bytes)
+{
+	return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
+/* how many pages are required to allocate znodes during item insertion. */
+static int carry_estimate_znodes(void)
+{
+	/*
+	 * Note, that there we have some problem here: there is no way to
+	 * reserve pages specifically for the given slab. This means that
+	 * these pages can be hijacked for some other end.
+	 */
+
+	/* in the worst case we need 3 new znode on each tree level */
+	return bytes_to_pages(capped_height() * sizeof(znode) * 3);
+}
+
+/*
+ * how many pages are required to load bitmaps. One bitmap per level.
+ */
+static int carry_estimate_bitmaps(void)
+{
+	if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
+		int bytes;
+
+		bytes = capped_height() * (0 +	/* bnode should be added, but its is private to
+						 * bitmap.c, skip for now. */
+					   2 * sizeof(jnode));	/* working and commit jnodes */
+		return bytes_to_pages(bytes) + 2;	/* and their contents */
+	} else
+		/* bitmaps were pre-loaded during mount */
+		return 0;
+}
+
+/* worst case item insertion memory requirements */
+static int carry_estimate_insert(carry_op * op, carry_level * level)
+{
+	return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +	/* new atom */
+	    capped_height() +	/* new block on each level */
+	    1 +			/* and possibly extra new block at the leaf level */
+	    3;			/* loading of leaves into memory */
+}
+
+/* worst case item deletion memory requirements */
+static int carry_estimate_delete(carry_op * op, carry_level * level)
+{
+	return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +	/* new atom */
+	    3;			/* loading of leaves into memory */
+}
+
+/* worst case tree cut memory requirements */
+static int carry_estimate_cut(carry_op * op, carry_level * level)
+{
+	return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +	/* new atom */
+	    3;			/* loading of leaves into memory */
+}
+
+/* worst case memory requirements of pasting into item */
+static int carry_estimate_paste(carry_op * op, carry_level * level)
+{
+	return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +	/* new atom */
+	    capped_height() +	/* new block on each level */
+	    1 +			/* and possibly extra new block at the leaf level */
+	    3;			/* loading of leaves into memory */
+}
+
+/* worst case memory requirements of extent insertion */
+static int carry_estimate_extent(carry_op * op, carry_level * level)
+{
+	return carry_estimate_insert(op, level) +	/* insert extent */
+	    carry_estimate_delete(op, level);	/* kill leaf */
+}
+
+/* worst case memory requirements of key update */
+static int carry_estimate_update(carry_op * op, carry_level * level)
+{
+	return 0;
+}
+
+/* worst case memory requirements of flow insertion */
+static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
+{
+	int newnodes;
+
+	newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
+		       CARRY_FLOW_NEW_NODES_LIMIT);
+	/*
+	 * roughly estimate insert_flow as a sequence of insertions.
+	 */
+	return newnodes * carry_estimate_insert(op, level);
+}
+
+/* This is dispatch table for carry operations. It can be trivially
+   abstracted into useful plugin: tunable balancing policy is a good
+   thing. */
+carry_op_handler op_dispatch_table[COP_LAST_OP] = {
+	[COP_INSERT] = {
+			.handler = carry_insert,
+			.estimate = carry_estimate_insert}
+	,
+	[COP_DELETE] = {
+			.handler = carry_delete,
+			.estimate = carry_estimate_delete}
+	,
+	[COP_CUT] = {
+		     .handler = carry_cut,
+		     .estimate = carry_estimate_cut}
+	,
+	[COP_PASTE] = {
+		       .handler = carry_paste,
+		       .estimate = carry_estimate_paste}
+	,
+	[COP_EXTENT] = {
+			.handler = carry_extent,
+			.estimate = carry_estimate_extent}
+	,
+	[COP_UPDATE] = {
+			.handler = carry_update,
+			.estimate = carry_estimate_update}
+	,
+	[COP_INSERT_FLOW] = {
+			     .handler = carry_insert_flow,
+			     .estimate = carry_estimate_insert_flow}
+};
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/carry_ops.h linux-2.6.22/fs/reiser4/carry_ops.h
--- linux-2.6.22.orig/fs/reiser4/carry_ops.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/carry_ops.h	2007-07-29 00:25:34.828684053 +0400
@@ -0,0 +1,42 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* implementation of carry operations. See carry_ops.c for details. */
+
+#if !defined( __CARRY_OPS_H__ )
+#define __CARRY_OPS_H__
+
+#include "forward.h"
+#include "znode.h"
+#include "carry.h"
+
+/* carry operation handlers */
+typedef struct carry_op_handler {
+	/* perform operation */
+	int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
+	/* estimate memory requirements for @op */
+	int (*estimate) (carry_op * op, carry_level * level);
+} carry_op_handler;
+
+/* This is dispatch table for carry operations. It can be trivially
+   abstracted into useful plugin: tunable balancing policy is a good
+   thing. */
+extern carry_op_handler op_dispatch_table[COP_LAST_OP];
+
+unsigned int space_needed(const znode * node, const coord_t * coord,
+			  const reiser4_item_data * data, int inserting);
+extern carry_node *find_left_carry(carry_node * node, carry_level * level);
+extern carry_node *find_right_carry(carry_node * node, carry_level * level);
+
+/* __CARRY_OPS_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/context.c linux-2.6.22/fs/reiser4/context.c
--- linux-2.6.22.orig/fs/reiser4/context.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/context.c	2007-07-29 00:25:34.832685088 +0400
@@ -0,0 +1,288 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Manipulation of reiser4_context */
+
+/*
+ * global context used during system call. Variable of this type is allocated
+ * on the stack at the beginning of the reiser4 part of the system call and
+ * pointer to it is stored in the current->fs_context. This allows us to avoid
+ * passing pointer to current transaction and current lockstack (both in
+ * one-to-one mapping with threads) all over the call chain.
+ *
+ * It's kind of like those global variables the prof used to tell you not to
+ * use in CS1, except thread specific.;-) Nikita, this was a good idea.
+ *
+ * In some situations it is desirable to have ability to enter reiser4_context
+ * more than once for the same thread (nested contexts). For example, there
+ * are some functions that can be called either directly from VFS/VM or from
+ * already active reiser4 context (->writepage, for example).
+ *
+ * In such situations "child" context acts like dummy: all activity is
+ * actually performed in the top level context, and get_current_context()
+ * always returns top level context.
+ * Of course, reiser4_init_context()/reiser4_done_context() have to be properly
+ * nested any way.
+ *
+ * Note that there is an important difference between reiser4 uses
+ * ->fs_context and the way other file systems use it. Other file systems
+ * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
+ * (this is why ->fs_context was initially called ->journal_info). This means,
+ * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
+ * to the file system, they assume that some transaction is already underway,
+ * and usually bail out, because starting nested transaction would most likely
+ * lead to the deadlock. This gives false positives with reiser4, because we
+ * set ->fs_context before starting transaction.
+ */
+
+#include "debug.h"
+#include "super.h"
+#include "context.h"
+
+#include <linux/writeback.h>	/* balance_dirty_pages() */
+#include <linux/hardirq.h>
+
+static void _reiser4_init_context(reiser4_context * context,
+				  struct super_block *super)
+{
+	memset(context, 0, sizeof(*context));
+
+	context->super = super;
+	context->magic = context_magic;
+	context->outer = current->journal_info;
+	current->journal_info = (void *)context;
+	context->nr_children = 0;
+	context->gfp_mask = GFP_KERNEL;
+
+	init_lock_stack(&context->stack);
+
+	reiser4_txn_begin(context);
+
+	/* initialize head of tap list */
+	INIT_LIST_HEAD(&context->taps);
+#if REISER4_DEBUG
+	context->task = current;
+#endif
+	grab_space_enable();
+}
+
+/* initialize context and bind it to the current thread
+
+   This function should be called at the beginning of reiser4 part of
+   syscall.
+*/
+reiser4_context * reiser4_init_context(struct super_block * super)
+{
+	reiser4_context *context;
+
+	assert("nikita-2662", !in_interrupt() && !in_irq());
+	assert("nikita-3357", super != NULL);
+	assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
+
+	context = get_current_context_check();
+	if (context && context->super == super) {
+		context = (reiser4_context *) current->journal_info;
+		context->nr_children++;
+		return context;
+	}
+
+	context = kmalloc(sizeof(*context), GFP_KERNEL);
+	if (context == NULL)
+		return ERR_PTR(RETERR(-ENOMEM));
+
+	_reiser4_init_context(context, super);
+	return context;
+}
+
+/* this is used in scan_mgr which is called with spinlock held and in
+   reiser4_fill_super magic */
+void init_stack_context(reiser4_context *context, struct super_block *super)
+{
+	assert("nikita-2662", !in_interrupt() && !in_irq());
+	assert("nikita-3357", super != NULL);
+	assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
+	assert("vs-12", !is_in_reiser4_context());
+
+	_reiser4_init_context(context, super);
+	context->on_stack = 1;
+	return;
+}
+
+/* cast lock stack embedded into reiser4 context up to its container */
+reiser4_context *get_context_by_lock_stack(lock_stack * owner)
+{
+	return container_of(owner, reiser4_context, stack);
+}
+
+/* true if there is already _any_ reiser4 context for the current thread */
+int is_in_reiser4_context(void)
+{
+	reiser4_context *ctx;
+
+	ctx = current->journal_info;
+	return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
+}
+
+/*
+ * call balance dirty pages for the current context.
+ *
+ * File system is expected to call balance_dirty_pages_ratelimited() whenever
+ * it dirties a page. reiser4 does this for unformatted nodes (that is, during
+ * write---this covers vast majority of all dirty traffic), but we cannot do
+ * this immediately when formatted node is dirtied, because long term lock is
+ * usually held at that time. To work around this, dirtying of formatted node
+ * simply increases ->nr_marked_dirty counter in the current reiser4
+ * context. When we are about to leave this context,
+ * balance_dirty_pages_ratelimited() is called, if necessary.
+ *
+ * This introduces another problem: sometimes we do not want to run
+ * balance_dirty_pages_ratelimited() when leaving a context, for example
+ * because some important lock (like ->i_mutex on the parent directory) is
+ * held. To achieve this, ->nobalance flag can be set in the current context.
+ */
+static void balance_dirty_pages_at(reiser4_context *context)
+{
+	reiser4_super_info_data *sbinfo = get_super_private(context->super);
+
+	/*
+	 * call balance_dirty_pages_ratelimited() to process formatted nodes
+	 * dirtied during this system call. Do that only if we are not in mount
+	 * and there were nodes dirtied in this context and we are not in
+	 * writepage (to avoid deadlock) and not in pdflush
+	 */
+	if (sbinfo != NULL && sbinfo->fake != NULL &&
+	    context->nr_marked_dirty != 0 &&
+	    !(current->flags & PF_MEMALLOC) &&
+	    !current_is_pdflush())
+		balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
+}
+
+/* release resources associated with context.
+
+   This function should be called at the end of "session" with reiser4,
+   typically just before leaving reiser4 driver back to VFS.
+
+   This is good place to put some degugging consistency checks, like that
+   thread released all locks and closed transcrash etc.
+
+*/
+static void reiser4_done_context(reiser4_context * context /* context being released */ )
+{
+	assert("nikita-860", context != NULL);
+	assert("nikita-859", context->magic == context_magic);
+	assert("vs-646", (reiser4_context *) current->journal_info == context);
+	assert("zam-686", !in_interrupt() && !in_irq());
+
+	/* only do anything when leaving top-level reiser4 context. All nested
+	 * contexts are just dummies. */
+	if (context->nr_children == 0) {
+		assert("jmacd-673", context->trans == NULL);
+		assert("jmacd-1002", lock_stack_isclean(&context->stack));
+		assert("nikita-1936", reiser4_no_counters_are_held());
+		assert("nikita-2626", list_empty_careful(reiser4_taps_list()));
+		assert("zam-1004", ergo(get_super_private(context->super),
+					get_super_private(context->super)->delete_mutex_owner !=
+					current));
+
+		/* release all grabbed but as yet unused blocks */
+		if (context->grabbed_blocks != 0)
+			all_grabbed2free();
+
+		/*
+		 * synchronize against longterm_unlock_znode():
+		 * wake_up_requestor() wakes up requestors without holding
+		 * zlock (otherwise they will immediately bump into that lock
+		 * after wake up on another CPU). To work around (rare)
+		 * situation where requestor has been woken up asynchronously
+		 * and managed to run until completion (and destroy its
+		 * context and lock stack) before wake_up_requestor() called
+		 * wake_up() on it, wake_up_requestor() synchronize on lock
+		 * stack spin lock. It has actually been observed that spin
+		 * lock _was_ locked at this point, because
+		 * wake_up_requestor() took interrupt.
+		 */
+		spin_lock_stack(&context->stack);
+		spin_unlock_stack(&context->stack);
+
+		assert("zam-684", context->nr_children == 0);
+		/* restore original ->fs_context value */
+		current->journal_info = context->outer;
+		if (context->on_stack == 0)
+			kfree(context);
+	} else {
+		context->nr_children--;
+#if REISER4_DEBUG
+		assert("zam-685", context->nr_children >= 0);
+#endif
+	}
+}
+
+/*
+ * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close
+ * transaction. Call done_context() to do context related book-keeping.
+ */
+void reiser4_exit_context(reiser4_context * context)
+{
+	assert("nikita-3021", reiser4_schedulable());
+
+	if (context->nr_children == 0) {
+		if (!context->nobalance) {
+			reiser4_txn_restart(context);
+			balance_dirty_pages_at(context);
+		}
+
+		/* if filesystem is mounted with -o sync or -o dirsync - commit
+		   transaction.  FIXME: TXNH_DONT_COMMIT is used to avoid
+		   commiting on exit_context when inode semaphore is held and
+		   to have ktxnmgrd to do commit instead to get better
+		   concurrent filesystem accesses. But, when one mounts with -o
+		   sync, he cares more about reliability than about
+		   performance. So, for now we have this simple mount -o sync
+		   support. */
+		if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
+			txn_atom *atom;
+
+			atom = get_current_atom_locked_nocheck();
+			if (atom) {
+				atom->flags |= ATOM_FORCE_COMMIT;
+				context->trans->flags &= ~TXNH_DONT_COMMIT;
+				spin_unlock_atom(atom);
+			}
+		}
+		reiser4_txn_end(context);
+	}
+	reiser4_done_context(context);
+}
+
+void reiser4_ctx_gfp_mask_set(void)
+{
+	reiser4_context *ctx;
+
+	ctx = get_current_context();
+	if (ctx->entd == 0 &&
+	    list_empty(&ctx->stack.locks) &&
+	    ctx->trans->atom == NULL)
+		ctx->gfp_mask = GFP_KERNEL;
+	else
+		ctx->gfp_mask = GFP_NOFS;
+}
+
+void reiser4_ctx_gfp_mask_force (gfp_t mask)
+{
+	reiser4_context *ctx;
+	ctx = get_current_context();
+
+	assert("edward-1454", ctx != NULL);
+
+	ctx->gfp_mask = mask;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 120
+ * scroll-step: 1
+ * End:
+ */
diff -urN linux-2.6.22.orig/fs/reiser4/context.h linux-2.6.22/fs/reiser4/context.h
--- linux-2.6.22.orig/fs/reiser4/context.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/context.h	2007-07-29 00:25:34.832685088 +0400
@@ -0,0 +1,228 @@
+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Reiser4 context. See context.c for details. */
+
+#if !defined( __REISER4_CONTEXT_H__ )
+#define __REISER4_CONTEXT_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "tap.h"
+#include "lock.h"
+
+#include <linux/types.h>	/* for __u??  */
+#include <linux/fs.h>		/* for struct super_block  */
+#include <linux/spinlock.h>
+#include <linux/sched.h>	/* for struct task_struct */
+
+/* reiser4 per-thread context */
+struct reiser4_context {
+	/* magic constant. For identification of reiser4 contexts. */
+	__u32 magic;
+
+	/* current lock stack. See lock.[ch]. This is where list of all
+	   locks taken by current thread is kept. This is also used in
+	   deadlock detection. */
+	lock_stack stack;
+
+	/* current transcrash. */
+	txn_handle *trans;
+	/* transaction handle embedded into reiser4_context. ->trans points
+	 * here by default. */
+	txn_handle trans_in_ctx;
+
+	/* super block we are working with.  To get the current tree
+	   use &get_super_private (reiser4_get_current_sb ())->tree. */
+	struct super_block *super;
+
+	/* parent fs activation */
+	struct fs_activation *outer;
+
+	/* per-thread grabbed (for further allocation) blocks counter */
+	reiser4_block_nr grabbed_blocks;
+
+	/* list of taps currently monitored. See tap.c */
+	struct list_head taps;
+
+	/* grabbing space is enabled */
+	unsigned int grab_enabled:1;
+	/* should be set when we are write dirty nodes to disk in jnode_flush or
+	 * reiser4_write_logs() */
+	unsigned int writeout_mode:1;
+	/* true, if current thread is an ent thread */
+	unsigned int entd:1;
+	/* true, if balance_dirty_pages() should not be run when leaving this
+	 * context. This is used to avoid lengthly balance_dirty_pages()
+	 * operation when holding some important resource, like directory
+	 * ->i_mutex */
+	unsigned int nobalance:1;
+
+	/* this bit is used on reiser4_done_context to decide whether context is
+	   kmalloc-ed and has to be kfree-ed */
+	unsigned int on_stack:1;
+
+	/* count non-trivial jnode_set_dirty() calls */
+	unsigned long nr_marked_dirty;
+
+	/* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
+	 * reiser4_writepages for each of dirty inodes. Reiser4_writepages
+	 * captures pages. When number of pages captured in one
+	 * reiser4_sync_inodes reaches some threshold - some atoms get
+	 * flushed */
+	int nr_captured;
+	int nr_children;	/* number of child contexts */
+#if REISER4_DEBUG
+	/* debugging information about reiser4 locks held by the current
+	 * thread */
+	reiser4_lock_cnt_info locks;
+	struct task_struct *task;	/* so we can easily find owner of the stack */
+
+	/*
+	 * disk space grabbing debugging support
+	 */
+	/* how many disk blocks were grabbed by the first call to
+	 * reiser4_grab_space() in this context */
+	reiser4_block_nr grabbed_initially;
+
+	/* list of all threads doing flush currently */
+	struct list_head flushers_link;
+	/* information about last error encountered by reiser4 */
+	err_site err;
+#endif
+	void *vp;
+	gfp_t gfp_mask;
+};
+
+extern reiser4_context *get_context_by_lock_stack(lock_stack *);
+
+/* Debugging helps. */
+#if REISER4_DEBUG
+extern void print_contexts(void);
+#endif
+
+#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
+#define current_blocksize reiser4_get_current_sb()->s_blocksize
+#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
+
+extern reiser4_context *reiser4_init_context(struct super_block *);
+extern void init_stack_context(reiser4_context *, struct super_block *);
+extern void reiser4_exit_context(reiser4_context *);
+
+/* magic constant we store in reiser4_context allocated at the stack. Used to
+   catch accesses to staled or uninitialized contexts. */
+#define context_magic ((__u32) 0x4b1b5d0b)
+
+extern int is_in_reiser4_context(void);
+
+/*
+ * return reiser4_context for the thread @tsk
+ */
+static inline reiser4_context *get_context(const struct task_struct *tsk)
+{
+	assert("vs-1682",
+	       ((reiser4_context *) tsk->journal_info)->magic == context_magic);
+	return (reiser4_context *) tsk->journal_info;
+}
+
+/*
+ * return reiser4 context of the current thread, or NULL if there is none.
+ */
+static inline reiser4_context *get_current_context_check(void)
+{
+	if (is_in_reiser4_context())
+		return get_context(current);
+	else
+		return NULL;
+}
+
+static inline reiser4_context *get_current_context(void);	/* __attribute__((const)); */
+
+/* return context associated with current thread */
+static inline reiser4_context *get_current_context(void)
+{
+	return get_context(current);
+}
+
+static inline gfp_t reiser4_ctx_gfp_mask_get(void)
+{
+	reiser4_context *ctx;
+
+	ctx = get_current_context_check();
+	return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
+}
+
+void reiser4_ctx_gfp_mask_set(void);
+void reiser4_ctx_gfp_mask_force (gfp_t mask);
+
+/*
+ * true if current thread is in the write-out mode. Thread enters write-out
+ * mode during jnode_flush and reiser4_write_logs().
+ */
+static inline int is_writeout_mode(void)
+{
+	return get_current_context()->writeout_mode;
+}
+
+/*
+ * enter write-out mode
+ */
+static inline void writeout_mode_enable(void)
+{
+	assert("zam-941", !get_current_context()->writeout_mode);
+	get_current_context()->writeout_mode = 1;
+}
+
+/*
+ * leave write-out mode
+ */
+static inline void writeout_mode_disable(void)
+{
+	assert("zam-942", get_current_context()->writeout_mode);
+	get_current_context()->writeout_mode = 0;
+}
+
+static inline void grab_space_enable(void)
+{
+	get_current_context()->grab_enabled = 1;
+}
+
+static inline void grab_space_disable(void)
+{
+	get_current_context()->grab_enabled = 0;
+}
+
+static inline void grab_space_set_enabled(int enabled)
+{
+	get_current_context()->grab_enabled = enabled;
+}
+
+static inline int is_grab_enabled(reiser4_context * ctx)
+{
+	return ctx->grab_enabled;
+}
+
+/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or
+ * flush would be performed when it is closed. This is necessary when handle
+ * has to be closed under some coarse semaphore, like i_mutex of
+ * directory. Commit will be performed by ktxnmgrd. */
+static inline void context_set_commit_async(reiser4_context * context)
+{
+	context->nobalance = 1;
+	context->trans->flags |= TXNH_DONT_COMMIT;
+}
+
+/* __REISER4_CONTEXT_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/coord.c linux-2.6.22/fs/reiser4/coord.c
--- linux-2.6.22.orig/fs/reiser4/coord.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/coord.c	2007-07-29 00:25:34.832685088 +0400
@@ -0,0 +1,935 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "tree.h"
+#include "plugin/item/item.h"
+#include "znode.h"
+#include "coord.h"
+
+/* Internal constructor. */
+static inline void
+coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos,
+		  pos_in_node_t unit_pos, between_enum between)
+{
+	coord->node = (znode *) node;
+	coord_set_item_pos(coord, item_pos);
+	coord->unit_pos = unit_pos;
+	coord->between = between;
+	ON_DEBUG(coord->plug_v = 0);
+	ON_DEBUG(coord->body_v = 0);
+
+	/*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
+}
+
+/* after shifting of node content, coord previously set properly may become
+   invalid, try to "normalize" it. */
+void coord_normalize(coord_t * coord)
+{
+	znode *node;
+
+	node = coord->node;
+	assert("vs-683", node);
+
+	coord_clear_iplug(coord);
+
+	if (node_is_empty(node)) {
+		coord_init_first_unit(coord, node);
+	} else if ((coord->between == AFTER_ITEM)
+		   || (coord->between == AFTER_UNIT)) {
+		return;
+	} else if (coord->item_pos == coord_num_items(coord)
+		   && coord->between == BEFORE_ITEM) {
+		coord_dec_item_pos(coord);
+		coord->between = AFTER_ITEM;
+	} else if (coord->unit_pos == coord_num_units(coord)
+		   && coord->between == BEFORE_UNIT) {
+		coord->unit_pos--;
+		coord->between = AFTER_UNIT;
+	} else if (coord->item_pos == coord_num_items(coord)
+		   && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
+		coord_dec_item_pos(coord);
+		coord->unit_pos = 0;
+		coord->between = AFTER_ITEM;
+	}
+}
+
+/* Copy a coordinate. */
+void coord_dup(coord_t * coord, const coord_t * old_coord)
+{
+	assert("jmacd-9800", coord_check(old_coord));
+	coord_dup_nocheck(coord, old_coord);
+}
+
+/* Copy a coordinate without check. Useful when old_coord->node is not
+   loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
+{
+	coord->node = old_coord->node;
+	coord_set_item_pos(coord, old_coord->item_pos);
+	coord->unit_pos = old_coord->unit_pos;
+	coord->between = old_coord->between;
+	coord->iplugid = old_coord->iplugid;
+	ON_DEBUG(coord->plug_v = old_coord->plug_v);
+	ON_DEBUG(coord->body_v = old_coord->body_v);
+}
+
+/* Initialize an invalid coordinate. */
+void coord_init_invalid(coord_t * coord, const znode * node)
+{
+	coord_init_values(coord, node, 0, 0, INVALID_COORD);
+}
+
+void coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
+{
+	coord_init_values(coord, node, 0, 0, AT_UNIT);
+}
+
+/* Initialize a coordinate to point at the first unit of the first item.  If the node is
+   empty, it is positioned at the EMPTY_NODE. */
+void coord_init_first_unit(coord_t * coord, const znode * node)
+{
+	int is_empty = node_is_empty(node);
+
+	coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
+
+	assert("jmacd-9801", coord_check(coord));
+}
+
+/* Initialize a coordinate to point at the last unit of the last item.  If the node is
+   empty, it is positioned at the EMPTY_NODE. */
+void coord_init_last_unit(coord_t * coord, const znode * node)
+{
+	int is_empty = node_is_empty(node);
+
+	coord_init_values(coord, node,
+			  (is_empty ? 0 : node_num_items(node) - 1), 0,
+			  (is_empty ? EMPTY_NODE : AT_UNIT));
+	if (!is_empty)
+		coord->unit_pos = coord_last_unit_pos(coord);
+	assert("jmacd-9802", coord_check(coord));
+}
+
+/* Initialize a coordinate to before the first item.  If the node is empty, it is
+   positioned at the EMPTY_NODE. */
+void coord_init_before_first_item(coord_t * coord, const znode * node)
+{
+	int is_empty = node_is_empty(node);
+
+	coord_init_values(coord, node, 0, 0,
+			  (is_empty ? EMPTY_NODE : BEFORE_UNIT));
+
+	assert("jmacd-9803", coord_check(coord));
+}
+
+/* Initialize a coordinate to after the last item.  If the node is empty, it is positioned
+   at the EMPTY_NODE. */
+void coord_init_after_last_item(coord_t * coord, const znode * node)
+{
+	int is_empty = node_is_empty(node);
+
+	coord_init_values(coord, node,
+			  (is_empty ? 0 : node_num_items(node) - 1), 0,
+			  (is_empty ? EMPTY_NODE : AFTER_ITEM));
+
+	assert("jmacd-9804", coord_check(coord));
+}
+
+/* Initialize a coordinate to after last unit in the item. Coord must be set
+   already to existing item */
+void coord_init_after_item_end(coord_t * coord)
+{
+	coord->between = AFTER_UNIT;
+	coord->unit_pos = coord_last_unit_pos(coord);
+}
+
+/* Initialize a coordinate to before the item. Coord must be set already to existing item */
+void coord_init_before_item(coord_t * coord)
+{
+	coord->unit_pos = 0;
+	coord->between = BEFORE_ITEM;
+}
+
+/* Initialize a coordinate to after the item. Coord must be set already to existing item */
+void coord_init_after_item(coord_t * coord)
+{
+	coord->unit_pos = 0;
+	coord->between = AFTER_ITEM;
+}
+
+/* Initialize a coordinate by 0s. Used in places where init_coord was used and
+   it was not clear how actually */
+void coord_init_zero(coord_t * coord)
+{
+	memset(coord, 0, sizeof(*coord));
+}
+
+/* Return the number of units at the present item.  Asserts coord_is_existing_item(). */
+unsigned coord_num_units(const coord_t * coord)
+{
+	assert("jmacd-9806", coord_is_existing_item(coord));
+
+	return item_plugin_by_coord(coord)->b.nr_units(coord);
+}
+
+/* Returns true if the coord was initializewd by coord_init_invalid (). */
+/* Audited by: green(2002.06.15) */
+int coord_is_invalid(const coord_t * coord)
+{
+	return coord->between == INVALID_COORD;
+}
+
+/* Returns true if the coordinate is positioned at an existing item, not before or after
+   an item.  It may be placed at, before, or after any unit within the item, whether
+   existing or not. */
+int coord_is_existing_item(const coord_t * coord)
+{
+	switch (coord->between) {
+	case EMPTY_NODE:
+	case BEFORE_ITEM:
+	case AFTER_ITEM:
+	case INVALID_COORD:
+		return 0;
+
+	case BEFORE_UNIT:
+	case AT_UNIT:
+	case AFTER_UNIT:
+		return coord->item_pos < coord_num_items(coord);
+	}
+
+	impossible("jmacd-9900", "unreachable coord: %p", coord);
+	return 0;
+}
+
+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
+   unit. */
+/* Audited by: green(2002.06.15) */
+int coord_is_existing_unit(const coord_t * coord)
+{
+	switch (coord->between) {
+	case EMPTY_NODE:
+	case BEFORE_UNIT:
+	case AFTER_UNIT:
+	case BEFORE_ITEM:
+	case AFTER_ITEM:
+	case INVALID_COORD:
+		return 0;
+
+	case AT_UNIT:
+		return (coord->item_pos < coord_num_items(coord)
+			&& coord->unit_pos < coord_num_units(coord));
+	}
+
+	impossible("jmacd-9902", "unreachable");
+	return 0;
+}
+
+/* Returns true if the coordinate is positioned at the first unit of the first item.  Not
+   true for empty nodes nor coordinates positioned before the first item. */
+/* Audited by: green(2002.06.15) */
+int coord_is_leftmost_unit(const coord_t * coord)
+{
+	return (coord->between == AT_UNIT && coord->item_pos == 0
+		&& coord->unit_pos == 0);
+}
+
+#if REISER4_DEBUG
+/* For assertions only, checks for a valid coordinate. */
+int coord_check(const coord_t * coord)
+{
+	if (coord->node == NULL) {
+		return 0;
+	}
+	if (znode_above_root(coord->node))
+		return 1;
+
+	switch (coord->between) {
+	default:
+	case INVALID_COORD:
+		return 0;
+	case EMPTY_NODE:
+		if (!node_is_empty(coord->node)) {
+			return 0;
+		}
+		return coord->item_pos == 0 && coord->unit_pos == 0;
+
+	case BEFORE_UNIT:
+	case AFTER_UNIT:
+		if (node_is_empty(coord->node) && (coord->item_pos == 0)
+		    && (coord->unit_pos == 0))
+			return 1;
+	case AT_UNIT:
+		break;
+	case AFTER_ITEM:
+	case BEFORE_ITEM:
+		/* before/after item should not set unit_pos. */
+		if (coord->unit_pos != 0) {
+			return 0;
+		}
+		break;
+	}
+
+	if (coord->item_pos >= node_num_items(coord->node)) {
+		return 0;
+	}
+
+	/* FIXME-VS: we are going to check unit_pos. This makes no sense when
+	   between is set either AFTER_ITEM or BEFORE_ITEM */
+	if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
+		return 1;
+
+	if (coord_is_iplug_set(coord) &&
+	    coord->unit_pos >
+	    item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
+		return 0;
+	}
+	return 1;
+}
+#endif
+
+/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
+   Returns 1 if the new position is does not exist. */
+static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
+{
+	/* If the node is invalid, leave it. */
+	if (coord->between == INVALID_COORD) {
+		return 1;
+	}
+
+	/* If the node is empty, set it appropriately. */
+	if (items == 0) {
+		coord->between = EMPTY_NODE;
+		coord_set_item_pos(coord, 0);
+		coord->unit_pos = 0;
+		return 1;
+	}
+
+	/* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
+	if (coord->between == EMPTY_NODE) {
+		coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
+		coord_set_item_pos(coord, 0);
+		coord->unit_pos = 0;
+		return 0;
+	}
+
+	/* If the item_pos is out-of-range, set it appropriatly. */
+	if (coord->item_pos >= items) {
+		coord->between = AFTER_ITEM;
+		coord_set_item_pos(coord, items - 1);
+		coord->unit_pos = 0;
+		/* If is_next, return 1 (can't go any further). */
+		return is_next;
+	}
+
+	return 0;
+}
+
+/* Advances the coordinate by one unit to the right.  If empty, no change.  If
+   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is an
+   existing unit. */
+int coord_next_unit(coord_t * coord)
+{
+	unsigned items = coord_num_items(coord);
+
+	if (coord_adjust_items(coord, items, 1) == 1) {
+		return 1;
+	}
+
+	switch (coord->between) {
+	case BEFORE_UNIT:
+		/* Now it is positioned at the same unit. */
+		coord->between = AT_UNIT;
+		return 0;
+
+	case AFTER_UNIT:
+	case AT_UNIT:
+		/* If it was at or after a unit and there are more units in this item,
+		   advance to the next one. */
+		if (coord->unit_pos < coord_last_unit_pos(coord)) {
+			coord->unit_pos += 1;
+			coord->between = AT_UNIT;
+			return 0;
+		}
+
+		/* Otherwise, it is crossing an item boundary and treated as if it was
+		   after the current item. */
+		coord->between = AFTER_ITEM;
+		coord->unit_pos = 0;
+		/* FALLTHROUGH */
+
+	case AFTER_ITEM:
+		/* Check for end-of-node. */
+		if (coord->item_pos == items - 1) {
+			return 1;
+		}
+
+		coord_inc_item_pos(coord);
+		coord->unit_pos = 0;
+		coord->between = AT_UNIT;
+		return 0;
+
+	case BEFORE_ITEM:
+		/* The adjust_items checks ensure that we are valid here. */
+		coord->unit_pos = 0;
+		coord->between = AT_UNIT;
+		return 0;
+
+	case INVALID_COORD:
+	case EMPTY_NODE:
+		/* Handled in coord_adjust_items(). */
+		break;
+	}
+
+	impossible("jmacd-9902", "unreachable");
+	return 0;
+}
+
+/* Advances the coordinate by one item to the right.  If empty, no change.  If
+   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is
+   an existing item. */
+int coord_next_item(coord_t * coord)
+{
+	unsigned items = coord_num_items(coord);
+
+	if (coord_adjust_items(coord, items, 1) == 1) {
+		return 1;
+	}
+
+	switch (coord->between) {
+	case AFTER_UNIT:
+	case AT_UNIT:
+	case BEFORE_UNIT:
+	case AFTER_ITEM:
+		/* Check for end-of-node. */
+		if (coord->item_pos == items - 1) {
+			coord->between = AFTER_ITEM;
+			coord->unit_pos = 0;
+			coord_clear_iplug(coord);
+			return 1;
+		}
+
+		/* Anywhere in an item, go to the next one. */
+		coord->between = AT_UNIT;
+		coord_inc_item_pos(coord);
+		coord->unit_pos = 0;
+		return 0;
+
+	case BEFORE_ITEM:
+		/* The out-of-range check ensures that we are valid here. */
+		coord->unit_pos = 0;
+		coord->between = AT_UNIT;
+		return 0;
+	case INVALID_COORD:
+	case EMPTY_NODE:
+		/* Handled in coord_adjust_items(). */
+		break;
+	}
+
+	impossible("jmacd-9903", "unreachable");
+	return 0;
+}
+
+/* Advances the coordinate by one unit to the left.  If empty, no change.  If
+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
+   is an existing unit. */
+int coord_prev_unit(coord_t * coord)
+{
+	unsigned items = coord_num_items(coord);
+
+	if (coord_adjust_items(coord, items, 0) == 1) {
+		return 1;
+	}
+
+	switch (coord->between) {
+	case AT_UNIT:
+	case BEFORE_UNIT:
+		if (coord->unit_pos > 0) {
+			coord->unit_pos -= 1;
+			coord->between = AT_UNIT;
+			return 0;
+		}
+
+		if (coord->item_pos == 0) {
+			coord->between = BEFORE_ITEM;
+			return 1;
+		}
+
+		coord_dec_item_pos(coord);
+		coord->unit_pos = coord_last_unit_pos(coord);
+		coord->between = AT_UNIT;
+		return 0;
+
+	case AFTER_UNIT:
+		/* What if unit_pos is out-of-range? */
+		assert("jmacd-5442",
+		       coord->unit_pos <= coord_last_unit_pos(coord));
+		coord->between = AT_UNIT;
+		return 0;
+
+	case BEFORE_ITEM:
+		if (coord->item_pos == 0) {
+			return 1;
+		}
+
+		coord_dec_item_pos(coord);
+		/* FALLTHROUGH */
+
+	case AFTER_ITEM:
+		coord->between = AT_UNIT;
+		coord->unit_pos = coord_last_unit_pos(coord);
+		return 0;
+
+	case INVALID_COORD:
+	case EMPTY_NODE:
+		break;
+	}
+
+	impossible("jmacd-9904", "unreachable");
+	return 0;
+}
+
+/* Advances the coordinate by one item to the left.  If empty, no change.  If
+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
+   is an existing item. */
+int coord_prev_item(coord_t * coord)
+{
+	unsigned items = coord_num_items(coord);
+
+	if (coord_adjust_items(coord, items, 0) == 1) {
+		return 1;
+	}
+
+	switch (coord->between) {
+	case AT_UNIT:
+	case AFTER_UNIT:
+	case BEFORE_UNIT:
+	case BEFORE_ITEM:
+
+		if (coord->item_pos == 0) {
+			coord->between = BEFORE_ITEM;
+			coord->unit_pos = 0;
+			return 1;
+		}
+
+		coord_dec_item_pos(coord);
+		coord->unit_pos = 0;
+		coord->between = AT_UNIT;
+		return 0;
+
+	case AFTER_ITEM:
+		coord->between = AT_UNIT;
+		coord->unit_pos = 0;
+		return 0;
+
+	case INVALID_COORD:
+	case EMPTY_NODE:
+		break;
+	}
+
+	impossible("jmacd-9905", "unreachable");
+	return 0;
+}
+
+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
+void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
+{
+	assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
+	if (dir == LEFT_SIDE) {
+		coord_init_first_unit(coord, node);
+	} else {
+		coord_init_last_unit(coord, node);
+	}
+}
+
+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
+   argument. */
+/* Audited by: green(2002.06.15) */
+int coord_is_after_sideof_unit(coord_t * coord, sideof dir)
+{
+	assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
+	if (dir == LEFT_SIDE) {
+		return coord_is_before_leftmost(coord);
+	} else {
+		return coord_is_after_rightmost(coord);
+	}
+}
+
+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
+/* Audited by: green(2002.06.15) */
+int coord_sideof_unit(coord_t * coord, sideof dir)
+{
+	assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
+	if (dir == LEFT_SIDE) {
+		return coord_prev_unit(coord);
+	} else {
+		return coord_next_unit(coord);
+	}
+}
+
+#if REISER4_DEBUG
+int coords_equal(const coord_t * c1, const coord_t * c2)
+{
+	assert("nikita-2840", c1 != NULL);
+	assert("nikita-2841", c2 != NULL);
+
+	return
+	    c1->node == c2->node &&
+	    c1->item_pos == c2->item_pos &&
+	    c1->unit_pos == c2->unit_pos && c1->between == c2->between;
+}
+#endif  /*  REISER4_DEBUG  */
+
+/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if coord_is_after_leftmost
+   return NCOORD_ON_THE_LEFT, otherwise return NCOORD_INSIDE. */
+/* Audited by: green(2002.06.15) */
+coord_wrt_node coord_wrt(const coord_t * coord)
+{
+	if (coord_is_before_leftmost(coord)) {
+		return COORD_ON_THE_LEFT;
+	}
+
+	if (coord_is_after_rightmost(coord)) {
+		return COORD_ON_THE_RIGHT;
+	}
+
+	return COORD_INSIDE;
+}
+
+/* Returns true if the coordinate is positioned after the last item or after the last unit
+   of the last item or it is an empty node. */
+/* Audited by: green(2002.06.15) */
+int coord_is_after_rightmost(const coord_t * coord)
+{
+	assert("jmacd-7313", coord_check(coord));
+
+	switch (coord->between) {
+	case INVALID_COORD:
+	case AT_UNIT:
+	case BEFORE_UNIT:
+	case BEFORE_ITEM:
+		return 0;
+
+	case EMPTY_NODE:
+		return 1;
+
+	case AFTER_ITEM:
+		return (coord->item_pos == node_num_items(coord->node) - 1);
+
+	case AFTER_UNIT:
+		return ((coord->item_pos == node_num_items(coord->node) - 1) &&
+			coord->unit_pos == coord_last_unit_pos(coord));
+	}
+
+	impossible("jmacd-9908", "unreachable");
+	return 0;
+}
+
+/* Returns true if the coordinate is positioned before the first item or it is an empty
+   node. */
+int coord_is_before_leftmost(const coord_t * coord)
+{
+	/* FIXME-VS: coord_check requires node to be loaded whereas it is not
+	   necessary to check if coord is set before leftmost
+	   assert ("jmacd-7313", coord_check (coord)); */
+	switch (coord->between) {
+	case INVALID_COORD:
+	case AT_UNIT:
+	case AFTER_ITEM:
+	case AFTER_UNIT:
+		return 0;
+
+	case EMPTY_NODE:
+		return 1;
+
+	case BEFORE_ITEM:
+	case BEFORE_UNIT:
+		return (coord->item_pos == 0) && (coord->unit_pos == 0);
+	}
+
+	impossible("jmacd-9908", "unreachable");
+	return 0;
+}
+
+/* Returns true if the coordinate is positioned after a item, before a item, after the
+   last unit of an item, before the first unit of an item, or at an empty node. */
+/* Audited by: green(2002.06.15) */
+int coord_is_between_items(const coord_t * coord)
+{
+	assert("jmacd-7313", coord_check(coord));
+
+	switch (coord->between) {
+	case INVALID_COORD:
+	case AT_UNIT:
+		return 0;
+
+	case AFTER_ITEM:
+	case BEFORE_ITEM:
+	case EMPTY_NODE:
+		return 1;
+
+	case BEFORE_UNIT:
+		return coord->unit_pos == 0;
+
+	case AFTER_UNIT:
+		return coord->unit_pos == coord_last_unit_pos(coord);
+	}
+
+	impossible("jmacd-9908", "unreachable");
+	return 0;
+}
+
+#if REISER4_DEBUG
+/* Returns true if the coordinates are positioned at adjacent units, regardless of
+   before-after or item boundaries. */
+int coord_are_neighbors(coord_t * c1, coord_t * c2)
+{
+	coord_t *left;
+	coord_t *right;
+
+	assert("nikita-1241", c1 != NULL);
+	assert("nikita-1242", c2 != NULL);
+	assert("nikita-1243", c1->node == c2->node);
+	assert("nikita-1244", coord_is_existing_unit(c1));
+	assert("nikita-1245", coord_is_existing_unit(c2));
+
+	left = right = NULL;
+	switch (coord_compare(c1, c2)) {
+	case COORD_CMP_ON_LEFT:
+		left = c1;
+		right = c2;
+		break;
+	case COORD_CMP_ON_RIGHT:
+		left = c2;
+		right = c1;
+		break;
+	case COORD_CMP_SAME:
+		return 0;
+	default:
+		wrong_return_value("nikita-1246", "compare_coords()");
+	}
+	assert("vs-731", left && right);
+	if (left->item_pos == right->item_pos) {
+		return left->unit_pos + 1 == right->unit_pos;
+	} else if (left->item_pos + 1 == right->item_pos) {
+		return (left->unit_pos == coord_last_unit_pos(left))
+		    && (right->unit_pos == 0);
+	} else {
+		return 0;
+	}
+}
+#endif  /*  REISER4_DEBUG  */
+
+/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
+   COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2.  */
+/* Audited by: green(2002.06.15) */
+coord_cmp coord_compare(coord_t * c1, coord_t * c2)
+{
+	assert("vs-209", c1->node == c2->node);
+	assert("vs-194", coord_is_existing_unit(c1)
+	       && coord_is_existing_unit(c2));
+
+	if (c1->item_pos > c2->item_pos)
+		return COORD_CMP_ON_RIGHT;
+	if (c1->item_pos < c2->item_pos)
+		return COORD_CMP_ON_LEFT;
+	if (c1->unit_pos > c2->unit_pos)
+		return COORD_CMP_ON_RIGHT;
+	if (c1->unit_pos < c2->unit_pos)
+		return COORD_CMP_ON_LEFT;
+	return COORD_CMP_SAME;
+}
+
+/* If the coordinate is between items, shifts it to the right.  Returns 0 on success and
+   non-zero if there is no position to the right. */
+int coord_set_to_right(coord_t * coord)
+{
+	unsigned items = coord_num_items(coord);
+
+	if (coord_adjust_items(coord, items, 1) == 1) {
+		return 1;
+	}
+
+	switch (coord->between) {
+	case AT_UNIT:
+		return 0;
+
+	case BEFORE_ITEM:
+	case BEFORE_UNIT:
+		coord->between = AT_UNIT;
+		return 0;
+
+	case AFTER_UNIT:
+		if (coord->unit_pos < coord_last_unit_pos(coord)) {
+			coord->unit_pos += 1;
+			coord->between = AT_UNIT;
+			return 0;
+		} else {
+
+			coord->unit_pos = 0;
+
+			if (coord->item_pos == items - 1) {
+				coord->between = AFTER_ITEM;
+				return 1;
+			}
+
+			coord_inc_item_pos(coord);
+			coord->between = AT_UNIT;
+			return 0;
+		}
+
+	case AFTER_ITEM:
+		if (coord->item_pos == items - 1) {
+			return 1;
+		}
+
+		coord_inc_item_pos(coord);
+		coord->unit_pos = 0;
+		coord->between = AT_UNIT;
+		return 0;
+
+	case EMPTY_NODE:
+		return 1;
+
+	case INVALID_COORD:
+		break;
+	}
+
+	impossible("jmacd-9920", "unreachable");
+	return 0;
+}
+
+/* If the coordinate is between items, shifts it to the left.  Returns 0 on success and
+   non-zero if there is no position to the left. */
+int coord_set_to_left(coord_t * coord)
+{
+	unsigned items = coord_num_items(coord);
+
+	if (coord_adjust_items(coord, items, 0) == 1) {
+		return 1;
+	}
+
+	switch (coord->between) {
+	case AT_UNIT:
+		return 0;
+
+	case AFTER_UNIT:
+		coord->between = AT_UNIT;
+		return 0;
+
+	case AFTER_ITEM:
+		coord->between = AT_UNIT;
+		coord->unit_pos = coord_last_unit_pos(coord);
+		return 0;
+
+	case BEFORE_UNIT:
+		if (coord->unit_pos > 0) {
+			coord->unit_pos -= 1;
+			coord->between = AT_UNIT;
+			return 0;
+		} else {
+
+			if (coord->item_pos == 0) {
+				coord->between = BEFORE_ITEM;
+				return 1;
+			}
+
+			coord->unit_pos = coord_last_unit_pos(coord);
+			coord_dec_item_pos(coord);
+			coord->between = AT_UNIT;
+			return 0;
+		}
+
+	case BEFORE_ITEM:
+		if (coord->item_pos == 0) {
+			return 1;
+		}
+
+		coord_dec_item_pos(coord);
+		coord->unit_pos = coord_last_unit_pos(coord);
+		coord->between = AT_UNIT;
+		return 0;
+
+	case EMPTY_NODE:
+		return 1;
+
+	case INVALID_COORD:
+		break;
+	}
+
+	impossible("jmacd-9920", "unreachable");
+	return 0;
+}
+
+static const char *coord_tween_tostring(between_enum n)
+{
+	switch (n) {
+	case BEFORE_UNIT:
+		return "before unit";
+	case BEFORE_ITEM:
+		return "before item";
+	case AT_UNIT:
+		return "at unit";
+	case AFTER_UNIT:
+		return "after unit";
+	case AFTER_ITEM:
+		return "after item";
+	case EMPTY_NODE:
+		return "empty node";
+	case INVALID_COORD:
+		return "invalid";
+	default:
+	{
+		static char buf[30];
+
+		sprintf(buf, "unknown: %i", n);
+		return buf;
+	}
+	}
+}
+
+void print_coord(const char *mes, const coord_t * coord, int node)
+{
+	if (coord == NULL) {
+		printk("%s: null\n", mes);
+		return;
+	}
+	printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
+	       mes, coord->item_pos, coord->unit_pos,
+	       coord_tween_tostring(coord->between), coord->iplugid);
+}
+
+int
+item_utmost_child_real_block(const coord_t * coord, sideof side,
+			     reiser4_block_nr * blk)
+{
+	return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
+								      side,
+								      blk);
+}
+
+int item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
+{
+	return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
+}
+
+/* @count bytes of flow @f got written, update correspondingly f->length,
+   f->data and f->key */
+void move_flow_forward(flow_t * f, unsigned count)
+{
+	if (f->data)
+		f->data += count;
+	f->length -= count;
+	set_key_offset(&f->key, get_key_offset(&f->key) + count);
+}
+
+/*
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/coord.h linux-2.6.22/fs/reiser4/coord.h
--- linux-2.6.22.orig/fs/reiser4/coord.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/coord.h	2007-07-29 00:25:34.832685088 +0400
@@ -0,0 +1,389 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Coords */
+
+#if !defined( __REISER4_COORD_H__ )
+#define __REISER4_COORD_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "key.h"
+
+/* insertions happen between coords in the tree, so we need some means
+   of specifying the sense of betweenness. */
+typedef enum {
+	BEFORE_UNIT,		/* Note: we/init_coord depends on this value being zero. */
+	AT_UNIT,
+	AFTER_UNIT,
+	BEFORE_ITEM,
+	AFTER_ITEM,
+	INVALID_COORD,
+	EMPTY_NODE,
+} between_enum;
+
+/* location of coord w.r.t. its node */
+typedef enum {
+	COORD_ON_THE_LEFT = -1,
+	COORD_ON_THE_RIGHT = +1,
+	COORD_INSIDE = 0
+} coord_wrt_node;
+
+typedef enum {
+	COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
+} coord_cmp;
+
+struct coord {
+	/* node in a tree */
+	/*  0 */ znode *node;
+
+	/* position of item within node */
+	/*  4 */ pos_in_node_t item_pos;
+	/* position of unit within item */
+	/*  6 */ pos_in_node_t unit_pos;
+	/* optimization: plugin of item is stored in coord_t. Until this was
+	   implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid
+	   is invalidated (set to 0xff) on each modification of ->item_pos,
+	   and all such modifications are funneled through coord_*_item_pos()
+	   functions below.
+	 */
+	/*  8 */ char iplugid;
+	/* position of coord w.r.t. to neighboring items and/or units.
+	   Values are taken from &between_enum above.
+	 */
+	/*  9 */ char between;
+	/* padding. It will be added by the compiler anyway to conform to the
+	 * C language alignment requirements. We keep it here to be on the
+	 * safe side and to have a clear picture of the memory layout of this
+	 * structure. */
+	/* 10 */ __u16 pad;
+	/* 12 */ int offset;
+#if REISER4_DEBUG
+	unsigned long plug_v;
+	unsigned long body_v;
+#endif
+};
+
+#define INVALID_PLUGID  ((char)((1 << 8) - 1))
+#define INVALID_OFFSET -1
+
+static inline void coord_clear_iplug(coord_t * coord)
+{
+	assert("nikita-2835", coord != NULL);
+	coord->iplugid = INVALID_PLUGID;
+	coord->offset = INVALID_OFFSET;
+}
+
+static inline int coord_is_iplug_set(const coord_t * coord)
+{
+	assert("nikita-2836", coord != NULL);
+	return coord->iplugid != INVALID_PLUGID;
+}
+
+static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
+{
+	assert("nikita-2478", coord != NULL);
+	coord->item_pos = pos;
+	coord_clear_iplug(coord);
+}
+
+static inline void coord_dec_item_pos(coord_t * coord)
+{
+	assert("nikita-2480", coord != NULL);
+	--coord->item_pos;
+	coord_clear_iplug(coord);
+}
+
+static inline void coord_inc_item_pos(coord_t * coord)
+{
+	assert("nikita-2481", coord != NULL);
+	++coord->item_pos;
+	coord_clear_iplug(coord);
+}
+
+static inline void coord_add_item_pos(coord_t * coord, int delta)
+{
+	assert("nikita-2482", coord != NULL);
+	coord->item_pos += delta;
+	coord_clear_iplug(coord);
+}
+
+static inline void coord_invalid_item_pos(coord_t * coord)
+{
+	assert("nikita-2832", coord != NULL);
+	coord->item_pos = (unsigned short)~0;
+	coord_clear_iplug(coord);
+}
+
+/* Reverse a direction. */
+static inline sideof sideof_reverse(sideof side)
+{
+	return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
+}
+
+/* NOTE: There is a somewhat odd mixture of the following opposed terms:
+
+   "first" and "last"
+   "next" and "prev"
+   "before" and "after"
+   "leftmost" and "rightmost"
+
+   But I think the chosen names are decent the way they are.
+*/
+
+/* COORD INITIALIZERS */
+
+/* Initialize an invalid coordinate. */
+extern void coord_init_invalid(coord_t * coord, const znode * node);
+
+extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
+
+/* Initialize a coordinate to point at the first unit of the first item.  If the node is
+   empty, it is positioned at the EMPTY_NODE. */
+extern void coord_init_first_unit(coord_t * coord, const znode * node);
+
+/* Initialize a coordinate to point at the last unit of the last item.  If the node is
+   empty, it is positioned at the EMPTY_NODE. */
+extern void coord_init_last_unit(coord_t * coord, const znode * node);
+
+/* Initialize a coordinate to before the first item.  If the node is empty, it is
+   positioned at the EMPTY_NODE. */
+extern void coord_init_before_first_item(coord_t * coord, const znode * node);
+
+/* Initialize a coordinate to after the last item.  If the node is empty, it is positioned
+   at the EMPTY_NODE. */
+extern void coord_init_after_last_item(coord_t * coord, const znode * node);
+
+/* Initialize a coordinate to after last unit in the item. Coord must be set
+   already to existing item */
+void coord_init_after_item_end(coord_t * coord);
+
+/* Initialize a coordinate to before the item. Coord must be set already to existing item */
+void coord_init_before_item(coord_t *);
+/* Initialize a coordinate to after the item. Coord must be set already to existing item */
+void coord_init_after_item(coord_t *);
+
+/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
+extern void coord_init_sideof_unit(coord_t * coord, const znode * node,
+				   sideof dir);
+
+/* Initialize a coordinate by 0s. Used in places where init_coord was used and
+   it was not clear how actually
+   FIXME-VS: added by vs (2002, june, 8) */
+extern void coord_init_zero(coord_t * coord);
+
+/* COORD METHODS */
+
+/* after shifting of node content, coord previously set properly may become
+   invalid, try to "normalize" it. */
+void coord_normalize(coord_t * coord);
+
+/* Copy a coordinate. */
+extern void coord_dup(coord_t * coord, const coord_t * old_coord);
+
+/* Copy a coordinate without check. */
+void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
+
+unsigned coord_num_units(const coord_t * coord);
+
+/* Return the last valid unit number at the present item (i.e.,
+   coord_num_units() - 1). */
+static inline unsigned coord_last_unit_pos(const coord_t * coord)
+{
+	return coord_num_units(coord) - 1;
+}
+
+#if REISER4_DEBUG
+/* For assertions only, checks for a valid coordinate. */
+extern int coord_check(const coord_t * coord);
+
+extern unsigned long znode_times_locked(const znode * z);
+
+static inline void coord_update_v(coord_t * coord)
+{
+	coord->plug_v = coord->body_v = znode_times_locked(coord->node);
+}
+#endif
+
+extern int coords_equal(const coord_t * c1, const coord_t * c2);
+
+extern void print_coord(const char *mes, const coord_t * coord, int print_node);
+
+/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if coord_is_after_leftmost
+   return NCOORD_ON_THE_LEFT, otherwise return NCOORD_INSIDE. */
+extern coord_wrt_node coord_wrt(const coord_t * coord);
+
+/* Returns true if the coordinates are positioned at adjacent units, regardless of
+   before-after or item boundaries. */
+extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
+
+/* Assuming two coordinates are positioned in the same node, return NCOORD_CMP_ON_RIGHT,
+   NCOORD_CMP_ON_LEFT, or NCOORD_CMP_SAME depending on c1's position relative to c2.  */
+extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
+
+/* COORD PREDICATES */
+
+/* Returns true if the coord was initializewd by coord_init_invalid (). */
+extern int coord_is_invalid(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned at an existing item, not before or after
+   an item.  It may be placed at, before, or after any unit within the item, whether
+   existing or not.  If this is true you can call methods of the item plugin.  */
+extern int coord_is_existing_item(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned after a item, before a item, after the
+   last unit of an item, before the first unit of an item, or at an empty node. */
+extern int coord_is_between_items(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
+   unit. */
+extern int coord_is_existing_unit(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned at an empty node. */
+extern int coord_is_empty(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned at the first unit of the first item.  Not
+   true for empty nodes nor coordinates positioned before the first item. */
+extern int coord_is_leftmost_unit(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned after the last item or after the last unit
+   of the last item or it is an empty node. */
+extern int coord_is_after_rightmost(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned before the first item or it is an empty
+   node. */
+extern int coord_is_before_leftmost(const coord_t * coord);
+
+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
+   argument. */
+extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
+
+/* COORD MODIFIERS */
+
+/* Advances the coordinate by one unit to the right.  If empty, no change.  If
+   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is
+   an existing unit. */
+extern int coord_next_unit(coord_t * coord);
+
+/* Advances the coordinate by one item to the right.  If empty, no change.  If
+   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is
+   an existing item. */
+extern int coord_next_item(coord_t * coord);
+
+/* Advances the coordinate by one unit to the left.  If empty, no change.  If
+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
+   is an existing unit. */
+extern int coord_prev_unit(coord_t * coord);
+
+/* Advances the coordinate by one item to the left.  If empty, no change.  If
+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
+   is an existing item. */
+extern int coord_prev_item(coord_t * coord);
+
+/* If the coordinate is between items, shifts it to the right.  Returns 0 on success and
+   non-zero if there is no position to the right. */
+extern int coord_set_to_right(coord_t * coord);
+
+/* If the coordinate is between items, shifts it to the left.  Returns 0 on success and
+   non-zero if there is no position to the left. */
+extern int coord_set_to_left(coord_t * coord);
+
+/* If the coordinate is at an existing unit, set to after that unit.  Returns 0 on success
+   and non-zero if the unit did not exist. */
+extern int coord_set_after_unit(coord_t * coord);
+
+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
+extern int coord_sideof_unit(coord_t * coord, sideof dir);
+
+/* iterate over all units in @node */
+#define for_all_units( coord, node )					\
+	for( coord_init_before_first_item( ( coord ), ( node ) ) ; 	\
+	     coord_next_unit( coord ) == 0 ; )
+
+/* iterate over all items in @node */
+#define for_all_items( coord, node )					\
+	for( coord_init_before_first_item( ( coord ), ( node ) ) ; 	\
+	     coord_next_item( coord ) == 0 ; )
+
+/* COORD/ITEM METHODS */
+
+extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
+					reiser4_block_nr * blk);
+extern int item_utmost_child(const coord_t * coord, sideof side,
+			     jnode ** child);
+
+/* a flow is a sequence of bytes being written to or read from the tree.  The
+   tree will slice the flow into items while storing it into nodes, but all of
+   that is hidden from anything outside the tree.  */
+
+struct flow {
+	reiser4_key key;	/* key of start of flow's sequence of bytes */
+	loff_t length;		/* length of flow's sequence of bytes */
+	char *data;	        /* start of flow's sequence of bytes */
+	int user;		/* if 1 data is user space, 0 - kernel space */
+	rw_op op;		/* NIKITA-FIXME-HANS: comment is where?  */
+};
+
+void move_flow_forward(flow_t * f, unsigned count);
+
+/* &reiser4_item_data - description of data to be inserted or pasted
+
+   Q: articulate the reasons for the difference between this and flow.
+
+   A: Becides flow we insert into tree other things: stat data, directory
+   entry, etc.  To insert them into tree one has to provide this structure. If
+   one is going to insert flow - he can use insert_flow, where this structure
+   does not have to be created
+*/
+struct reiser4_item_data {
+	/* actual data to be inserted. If NULL, ->create_item() will not
+	   do xmemcpy itself, leaving this up to the caller. This can
+	   save some amount of unnecessary memory copying, for example,
+	   during insertion of stat data.
+
+	 */
+	char *data;
+	/* 1 if 'char * data' contains pointer to user space and 0 if it is
+	   kernel space */
+	int user;
+	/* amount of data we are going to insert or paste */
+	int length;
+	/* "Arg" is opaque data that is passed down to the
+	   ->create_item() method of node layout, which in turn
+	   hands it to the ->create_hook() of item being created. This
+	   arg is currently used by:
+
+	   .  ->create_hook() of internal item
+	   (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
+	   . ->paste() method of directory item.
+	   . ->create_hook() of extent item
+
+	   For internal item, this is left "brother" of new node being
+	   inserted and it is used to add new node into sibling list
+	   after parent to it was just inserted into parent.
+
+	   While ->arg does look somewhat of unnecessary compication,
+	   it actually saves a lot of headache in many places, because
+	   all data necessary to insert or paste new data into tree are
+	   collected in one place, and this eliminates a lot of extra
+	   argument passing and storing everywhere.
+
+	 */
+	void *arg;
+	/* plugin of item we are inserting */
+	item_plugin *iplug;
+};
+
+/* __REISER4_COORD_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/debug.c linux-2.6.22/fs/reiser4/debug.c
--- linux-2.6.22.orig/fs/reiser4/debug.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/debug.c	2007-07-29 00:25:34.836686123 +0400
@@ -0,0 +1,308 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Debugging facilities. */
+
+/*
+ * This file contains generic debugging functions used by reiser4. Roughly
+ * following:
+ *
+ *     panicking: reiser4_do_panic(), reiser4_print_prefix().
+ *
+ *     locking:
+ *     reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(),
+ *     reiser4_no_counters_are_held(), reiser4_commit_check_locks()
+ *
+ *     error code monitoring (see comment before RETERR macro):
+ *     reiser4_return_err(), reiser4_report_err().
+ *
+ *     stack back-tracing: fill_backtrace()
+ *
+ *     miscellaneous: reiser4_preempt_point(), call_on_each_assert(),
+ *     reiser4_debugtrap().
+ *
+ */
+
+#include "reiser4.h"
+#include "context.h"
+#include "super.h"
+#include "txnmgr.h"
+#include "znode.h"
+
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/vmalloc.h>
+#include <linux/ctype.h>
+#include <linux/sysctl.h>
+#include <linux/hardirq.h>
+
+#if 0
+#if REISER4_DEBUG
+static void reiser4_report_err(void);
+#else
+#define reiser4_report_err() noop
+#endif
+#endif  /*  0  */
+
+/*
+ * global buffer where message given to reiser4_panic is formatted.
+ */
+static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
+
+/*
+ * lock protecting consistency of panic_buf under concurrent panics
+ */
+static DEFINE_SPINLOCK(panic_guard);
+
+/* Your best friend. Call it on each occasion.  This is called by
+    fs/reiser4/debug.h:reiser4_panic(). */
+void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ )
+{
+	static int in_panic = 0;
+	va_list args;
+
+	/*
+	 * check for recursive panic.
+	 */
+	if (in_panic == 0) {
+		in_panic = 1;
+
+		spin_lock(&panic_guard);
+		va_start(args, format);
+		vsnprintf(panic_buf, sizeof(panic_buf), format, args);
+		va_end(args);
+		printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
+		spin_unlock(&panic_guard);
+
+		/*
+		 * if kernel debugger is configured---drop in. Early dropping
+		 * into kgdb is not always convenient, because panic message
+		 * is not yet printed most of the times. But:
+		 *
+		 *     (1) message can be extracted from printk_buf[]
+		 *     (declared static inside of printk()), and
+		 *
+		 *     (2) sometimes serial/kgdb combo dies while printing
+		 *     long panic message, so it's more prudent to break into
+		 *     debugger earlier.
+		 *
+		 */
+		DEBUGON(1);
+	}
+	/* to make gcc happy about noreturn attribute */
+	panic("%s", panic_buf);
+}
+
+#if 0
+void
+reiser4_print_prefix(const char *level, int reperr, const char *mid,
+		     const char *function, const char *file, int lineno)
+{
+	const char *comm;
+	int pid;
+
+	if (unlikely(in_interrupt() || in_irq())) {
+		comm = "interrupt";
+		pid = 0;
+	} else {
+		comm = current->comm;
+		pid = current->pid;
+	}
+	printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
+	       level, comm, pid, function, file, lineno, mid);
+	if (reperr)
+		reiser4_report_err();
+}
+#endif  /*  0  */
+
+/* Preemption point: this should be called periodically during long running
+   operations (carry, allocate, and squeeze are best examples) */
+int reiser4_preempt_point(void)
+{
+	assert("nikita-3008", reiser4_schedulable());
+	cond_resched();
+	return signal_pending(current);
+}
+
+#if REISER4_DEBUG
+/* Debugging aid: return struct where information about locks taken by current
+   thread is accumulated. This can be used to formulate lock ordering
+   constraints and various assertions.
+
+*/
+reiser4_lock_cnt_info *reiser4_lock_counters(void)
+{
+	reiser4_context *ctx = get_current_context();
+	assert("jmacd-1123", ctx != NULL);
+	return &ctx->locks;
+}
+
+/*
+ * print human readable information about locks held by the reiser4 context.
+ */
+static void print_lock_counters(const char *prefix,
+				const reiser4_lock_cnt_info * info)
+{
+	printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
+	       "jload: %i, "
+	       "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
+	       "ktxnmgrd: %i, fq: %i\n"
+	       "inode: %i, "
+	       "cbk_cache: %i (r:%i,w%i), "
+	       "eflush: %i, "
+	       "zlock: %i,\n"
+	       "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
+	       "d: %i, x: %i, t: %i\n", prefix,
+	       info->spin_locked_jnode,
+	       info->rw_locked_tree, info->read_locked_tree,
+	       info->write_locked_tree,
+	       info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
+	       info->spin_locked_jload,
+	       info->spin_locked_txnh,
+	       info->spin_locked_atom, info->spin_locked_stack,
+	       info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
+	       info->spin_locked_fq,
+	       info->spin_locked_inode,
+	       info->rw_locked_cbk_cache,
+	       info->read_locked_cbk_cache,
+	       info->write_locked_cbk_cache,
+	       info->spin_locked_super_eflush,
+	       info->spin_locked_zlock,
+	       info->spin_locked,
+	       info->long_term_locked_znode,
+	       info->inode_sem_r, info->inode_sem_w,
+	       info->d_refs, info->x_refs, info->t_refs);
+}
+
+/* check that no spinlocks are held */
+int reiser4_schedulable(void)
+{
+	if (get_current_context_check() != NULL) {
+		if (!LOCK_CNT_NIL(spin_locked)) {
+			print_lock_counters("in atomic", reiser4_lock_counters());
+			return 0;
+		}
+	}
+	might_sleep();
+	return 1;
+}
+/*
+ * return true, iff no locks are held.
+ */
+int reiser4_no_counters_are_held(void)
+{
+	reiser4_lock_cnt_info *counters;
+
+	counters = reiser4_lock_counters();
+	return
+	    (counters->spin_locked_zlock == 0) &&
+	    (counters->spin_locked_jnode == 0) &&
+	    (counters->rw_locked_tree == 0) &&
+	    (counters->read_locked_tree == 0) &&
+	    (counters->write_locked_tree == 0) &&
+	    (counters->rw_locked_dk == 0) &&
+	    (counters->read_locked_dk == 0) &&
+	    (counters->write_locked_dk == 0) &&
+	    (counters->spin_locked_txnh == 0) &&
+	    (counters->spin_locked_atom == 0) &&
+	    (counters->spin_locked_stack == 0) &&
+	    (counters->spin_locked_txnmgr == 0) &&
+	    (counters->spin_locked_inode == 0) &&
+	    (counters->spin_locked == 0) &&
+	    (counters->long_term_locked_znode == 0) &&
+	    (counters->inode_sem_r == 0) &&
+	    (counters->inode_sem_w == 0) && (counters->d_refs == 0);
+}
+
+/*
+ * return true, iff transaction commit can be done under locks held by the
+ * current thread.
+ */
+int reiser4_commit_check_locks(void)
+{
+	reiser4_lock_cnt_info *counters;
+	int inode_sem_r;
+	int inode_sem_w;
+	int result;
+
+	/*
+	 * inode's read/write semaphore is the only reiser4 lock that can be
+	 * held during commit.
+	 */
+
+	counters = reiser4_lock_counters();
+	inode_sem_r = counters->inode_sem_r;
+	inode_sem_w = counters->inode_sem_w;
+
+	counters->inode_sem_r = counters->inode_sem_w = 0;
+	result = reiser4_no_counters_are_held();
+	counters->inode_sem_r = inode_sem_r;
+	counters->inode_sem_w = inode_sem_w;
+	return result;
+}
+
+/*
+ * fill "error site" in the current reiser4 context. See comment before RETERR
+ * macro for more details.
+ */
+void reiser4_return_err(int code, const char *file, int line)
+{
+	if (code < 0 && is_in_reiser4_context()) {
+		reiser4_context *ctx = get_current_context();
+
+		if (ctx != NULL) {
+			ctx->err.code = code;
+			ctx->err.file = file;
+			ctx->err.line = line;
+		}
+	}
+}
+
+#if 0
+/*
+ * report error information recorder by reiser4_return_err().
+ */
+static void reiser4_report_err(void)
+{
+	reiser4_context *ctx = get_current_context_check();
+
+	if (ctx != NULL) {
+		if (ctx->err.code != 0) {
+			printk("code: %i at %s:%i\n",
+			       ctx->err.code, ctx->err.file, ctx->err.line);
+		}
+	}
+}
+#endif  /*  0  */
+
+#endif				/* REISER4_DEBUG */
+
+#if KERNEL_DEBUGGER
+
+/*
+ * this functions just drops into kernel debugger. It is a convenient place to
+ * put breakpoint in.
+ */
+void reiser4_debugtrap(void)
+{
+	/* do nothing. Put break point here. */
+#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
+	extern void breakpoint(void);
+	breakpoint();
+#endif
+}
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/debug.h linux-2.6.22/fs/reiser4/debug.h
--- linux-2.6.22.orig/fs/reiser4/debug.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/debug.h	2007-07-29 00:25:34.836686123 +0400
@@ -0,0 +1,350 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Declarations of debug macros. */
+
+#if !defined( __FS_REISER4_DEBUG_H__ )
+#define __FS_REISER4_DEBUG_H__
+
+#include "forward.h"
+#include "reiser4.h"
+
+/* generic function to produce formatted output, decorating it with
+   whatever standard prefixes/postfixes we want. "Fun" is a function
+   that will be actually called, can be printk, panic etc.
+   This is for use by other debugging macros, not by users. */
+#define DCALL(lev, fun, reperr, label, format, ...)			\
+({									\
+	fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" ,	\
+	    current->comm, current->pid, __FUNCTION__,			\
+	    __FILE__, __LINE__, label, ## __VA_ARGS__);			\
+})
+
+/*
+ * cause kernel to crash
+ */
+#define reiser4_panic(mid, format, ...)				\
+	DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
+
+/* print message with indication of current process, file, line and
+   function */
+#define reiser4_log(label, format, ...) 				\
+	DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
+
+/* Assertion checked during compilation.
+    If "cond" is false (0) we get duplicate case label in switch.
+    Use this to check something like famous
+       cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
+    in 3.x journal.c. If cassertion fails you get compiler error,
+    so no "maintainer-id".
+*/
+#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
+
+#define noop   do {;} while(0)
+
+#if REISER4_DEBUG
+/* version of info that only actually prints anything when _d_ebugging
+    is on */
+#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
+/* macro to catch logical errors. Put it into `default' clause of
+    switch() statement. */
+#define impossible(label, format, ...) 			\
+         reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
+/* assert assures that @cond is true. If it is not, reiser4_panic() is
+   called. Use this for checking logical consistency and _never_ call
+   this to check correctness of external data: disk blocks and user-input . */
+#define assert(label, cond)							\
+({										\
+	/* call_on_each_assert(); */						\
+	if (cond) {								\
+		/* put negated check to avoid using !(cond) that would lose	\
+		 * warnings for things like assert(a = b); */			\
+		;								\
+	} else {								\
+		DEBUGON(1);							\
+		reiser4_panic(label, "assertion failed: %s", #cond);		\
+	}									\
+})
+
+/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
+#define check_me( label, expr )	assert( label, ( expr ) )
+
+#define ON_DEBUG( exp ) exp
+
+extern int reiser4_schedulable(void);
+extern void call_on_each_assert(void);
+
+#else
+
+#define dinfo( format, args... ) noop
+#define impossible( label, format, args... ) noop
+#define assert( label, cond ) noop
+#define check_me( label, expr )	( ( void ) ( expr ) )
+#define ON_DEBUG( exp )
+#define reiser4_schedulable() might_sleep()
+
+/* REISER4_DEBUG */
+#endif
+
+#if REISER4_DEBUG
+/* per-thread information about lock acquired by this thread. Used by lock
+ * ordering checking in spin_macros.h */
+typedef struct reiser4_lock_cnt_info {
+	int rw_locked_tree;
+	int read_locked_tree;
+	int write_locked_tree;
+
+	int rw_locked_dk;
+	int read_locked_dk;
+	int write_locked_dk;
+
+	int rw_locked_cbk_cache;
+	int read_locked_cbk_cache;
+	int write_locked_cbk_cache;
+
+	int spin_locked_zlock;
+	int spin_locked_jnode;
+	int spin_locked_jload;
+	int spin_locked_txnh;
+	int spin_locked_atom;
+	int spin_locked_stack;
+	int spin_locked_txnmgr;
+	int spin_locked_ktxnmgrd;
+	int spin_locked_fq;
+	int spin_locked_inode;
+	int spin_locked_super_eflush;
+	int spin_locked;
+	int long_term_locked_znode;
+
+	int inode_sem_r;
+	int inode_sem_w;
+
+	int d_refs;
+	int x_refs;
+	int t_refs;
+} reiser4_lock_cnt_info;
+
+extern struct reiser4_lock_cnt_info *reiser4_lock_counters(void);
+#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
+
+/* increment lock-counter @counter, if present */
+#define LOCK_CNT_INC(counter)					\
+	IN_CONTEXT(++(reiser4_lock_counters()->counter), 0)
+
+/* decrement lock-counter @counter, if present */
+#define LOCK_CNT_DEC(counter)					\
+	IN_CONTEXT(--(reiser4_lock_counters()->counter), 0)
+
+/* check that lock-counter is zero. This is for use in assertions */
+#define LOCK_CNT_NIL(counter)					\
+	IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1)
+
+/* check that lock-counter is greater than zero. This is for use in
+ * assertions */
+#define LOCK_CNT_GTZ(counter)					\
+	IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1)
+#define LOCK_CNT_LT(counter,n)					\
+	IN_CONTEXT(reiser4_lock_counters()->counter < n, 1)
+
+#else				/* REISER4_DEBUG */
+
+/* no-op versions on the above */
+
+typedef struct reiser4_lock_cnt_info {
+} reiser4_lock_cnt_info;
+
+#define reiser4_lock_counters() ((reiser4_lock_cnt_info *)NULL)
+#define LOCK_CNT_INC(counter) noop
+#define LOCK_CNT_DEC(counter) noop
+#define LOCK_CNT_NIL(counter) (1)
+#define LOCK_CNT_GTZ(counter) (1)
+#define LOCK_CNT_LT(counter,n) (1)
+
+#endif				/* REISER4_DEBUG */
+
+#define assert_spin_not_locked(lock) BUG_ON(0)
+#define assert_rw_write_locked(lock) BUG_ON(0)
+#define assert_rw_read_locked(lock) BUG_ON(0)
+#define assert_rw_locked(lock) BUG_ON(0)
+#define assert_rw_not_write_locked(lock) BUG_ON(0)
+#define assert_rw_not_read_locked(lock) BUG_ON(0)
+#define assert_rw_not_locked(lock) BUG_ON(0)
+
+/* flags controlling debugging behavior. Are set through debug_flags=N mount
+   option. */
+typedef enum {
+	/* print a lot of information during panic. When this is on all jnodes
+	 * are listed. This can be *very* large output. Usually you don't want
+	 * this. Especially over serial line. */
+	REISER4_VERBOSE_PANIC = 0x00000001,
+	/* print a lot of information during umount */
+	REISER4_VERBOSE_UMOUNT = 0x00000002,
+	/* print gathered statistics on umount */
+	REISER4_STATS_ON_UMOUNT = 0x00000004,
+	/* check node consistency */
+	REISER4_CHECK_NODE = 0x00000008
+} reiser4_debug_flags;
+
+extern int is_in_reiser4_context(void);
+
+/*
+ * evaluate expression @e only if with reiser4 context
+ */
+#define ON_CONTEXT(e)	do {			\
+	if(is_in_reiser4_context()) {		\
+		e;				\
+	} } while(0)
+
+/*
+ * evaluate expression @e only when within reiser4_context and debugging is
+ * on.
+ */
+#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
+
+/*
+ * complain about unexpected function result and crash. Used in "default"
+ * branches of switch statements and alike to assert that invalid results are
+ * not silently ignored.
+ */
+#define wrong_return_value( label, function )				\
+	impossible( label, "wrong return value from " function )
+
+/* Issue different types of reiser4 messages to the console */
+#define warning( label, format, ... )					\
+	DCALL( KERN_WARNING, 						\
+	       printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
+#define notice( label, format, ... )					\
+	DCALL( KERN_NOTICE, 						\
+	       printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ )
+
+/* mark not yet implemented functionality */
+#define not_yet( label, format, ... )				\
+	reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
+
+extern void reiser4_do_panic(const char *format, ...)
+    __attribute__ ((noreturn, format(printf, 1, 2)));
+
+extern int reiser4_preempt_point(void);
+extern void reiser4_print_stats(void);
+
+#if REISER4_DEBUG
+extern int reiser4_no_counters_are_held(void);
+extern int reiser4_commit_check_locks(void);
+#else
+#define reiser4_no_counters_are_held() (1)
+#define reiser4_commit_check_locks() (1)
+#endif
+
+/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
+#define IS_POW(i) 				\
+({						\
+	typeof(i) __i;				\
+						\
+	__i = (i);				\
+	!(__i & (__i - 1));			\
+})
+
+#define KERNEL_DEBUGGER (1)
+
+#if KERNEL_DEBUGGER
+
+extern void reiser4_debugtrap(void);
+
+/*
+ * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
+ * kgdb is not compiled in, do nothing.
+ */
+#define DEBUGON(cond)					\
+({	               					\
+	if (unlikely(cond))				\
+		reiser4_debugtrap();			\
+})
+#else
+#define DEBUGON(cond) noop
+#endif
+
+/*
+ * Error code tracing facility. (Idea is borrowed from XFS code.)
+ *
+ * Suppose some strange and/or unexpected code is returned from some function
+ * (for example, write(2) returns -EEXIST). It is possible to place a
+ * breakpoint in the reiser4_write(), but it is too late here. How to find out
+ * in what particular place -EEXIST was generated first?
+ *
+ * In reiser4 all places where actual error codes are produced (that is,
+ * statements of the form
+ *
+ *     return -EFOO;        // (1), or
+ *
+ *     result = -EFOO;      // (2)
+ *
+ * are replaced with
+ *
+ *     return RETERR(-EFOO);        // (1a), and
+ *
+ *     result = RETERR(-EFOO);      // (2a) respectively
+ *
+ * RETERR() macro fills a backtrace in reiser4_context. This back-trace is
+ * printed in error and warning messages. Moreover, it's possible to put a
+ * conditional breakpoint in reiser4_return_err (low-level function called
+ * by RETERR() to do the actual work) to break into debugger immediately
+ * when particular error happens.
+ *
+ */
+
+#if REISER4_DEBUG
+
+/*
+ * data-type to store information about where error happened ("error site").
+ */
+typedef struct err_site {
+	int code;		/* error code */
+	const char *file;	/* source file, filled by __FILE__ */
+	int line;		/* source file line, filled by __LINE__ */
+} err_site;
+
+extern void reiser4_return_err(int code, const char *file, int line);
+
+/*
+ * fill &get_current_context()->err_site with error information.
+ */
+#define RETERR(code)					\
+({						        \
+	typeof(code) __code;				\
+							\
+	__code = (code);				\
+	reiser4_return_err(__code, __FILE__, __LINE__);	\
+	__code;						\
+})
+
+#else
+
+/*
+ * no-op versions of the above
+ */
+
+typedef struct err_site {
+} err_site;
+#define RETERR(code) code
+#endif
+
+#if REISER4_LARGE_KEY
+/*
+ * conditionally compile arguments only if REISER4_LARGE_KEY is on.
+ */
+#define ON_LARGE_KEY(...) __VA_ARGS__
+#else
+#define ON_LARGE_KEY(...)
+#endif
+
+/* __FS_REISER4_DEBUG_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/dformat.h linux-2.6.22/fs/reiser4/dformat.h
--- linux-2.6.22.orig/fs/reiser4/dformat.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/dformat.h	2007-07-29 00:25:34.836686123 +0400
@@ -0,0 +1,70 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Formats of on-disk data and conversion functions. */
+
+/* put all item formats in the files describing the particular items,
+   our model is, everything you need to do to add an item to reiser4,
+   (excepting the changes to the plugin that uses the item which go
+   into the file defining that plugin), you put into one file. */
+/* Data on disk are stored in little-endian format.
+   To declare fields of on-disk structures, use d8, d16, d32 and d64.
+   d??tocpu() and cputod??() to convert. */
+
+#if !defined( __FS_REISER4_DFORMAT_H__ )
+#define __FS_REISER4_DFORMAT_H__
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include <linux/types.h>
+
+typedef __u8 d8;
+typedef __le16 d16;
+typedef __le32 d32;
+typedef __le64 d64;
+
+#define PACKED __attribute__((packed))
+
+/* data-type for block number */
+typedef __u64 reiser4_block_nr;
+
+/* data-type for block number on disk, disk format */
+typedef __le64 reiser4_dblock_nr;
+
+/**
+ * disk_addr_eq - compare disk addresses
+ * @b1: pointer to block number ot compare
+ * @b2: pointer to block number ot compare
+ *
+ * Returns true if if disk addresses are the same
+ */
+static inline int disk_addr_eq(const reiser4_block_nr *b1,
+			       const reiser4_block_nr * b2)
+{
+	assert("nikita-1033", b1 != NULL);
+	assert("nikita-1266", b2 != NULL);
+
+	return !memcmp(b1, b2, sizeof *b1);
+}
+
+/* structure of master reiser4 super block */
+typedef struct reiser4_master_sb {
+	char magic[16];		/* "ReIsEr4" */
+	__le16 disk_plugin_id;	/* id of disk layout plugin */
+	__le16 blocksize;
+	char uuid[16];		/* unique id */
+	char label[16];		/* filesystem label */
+	__le64 diskmap;		/* location of the diskmap. 0 if not present */
+} reiser4_master_sb;
+
+/* __FS_REISER4_DFORMAT_H__ */
+#endif
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN linux-2.6.22.orig/fs/reiser4/dscale.c linux-2.6.22/fs/reiser4/dscale.c
--- linux-2.6.22.orig/fs/reiser4/dscale.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/dscale.c	2007-07-29 00:25:34.836686123 +0400
@@ -0,0 +1,174 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Scalable on-disk integers */
+
+/*
+ * Various on-disk structures contain integer-like structures. Stat-data
+ * contain [yes, "data" is plural, check the dictionary] file size, link
+ * count; extent unit contains extent width etc. To accommodate for general
+ * case enough space is reserved to keep largest possible value. 64 bits in
+ * all cases above. But in overwhelming majority of cases numbers actually
+ * stored in these fields will be comparatively small and reserving 8 bytes is
+ * a waste of precious disk bandwidth.
+ *
+ * Scalable integers are one way to solve this problem. dscale_write()
+ * function stores __u64 value in the given area consuming from 1 to 9 bytes,
+ * depending on the magnitude of the value supplied. dscale_read() reads value
+ * previously stored by dscale_write().
+ *
+ * dscale_write() produces format not completely unlike of UTF: two highest
+ * bits of the first byte are used to store "tag". One of 4 possible tag
+ * values is chosen depending on the number being encoded:
+ *
+ *           0 ... 0x3f               => 0           [table 1]
+ *        0x40 ... 0x3fff             => 1
+ *      0x4000 ... 0x3fffffff         => 2
+ *  0x40000000 ... 0xffffffffffffffff => 3
+ *
+ * (see dscale_range() function)
+ *
+ * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
+ * to be stored, so in this case there is no place in the first byte to store
+ * tag. For such values tag is stored in an extra 9th byte.
+ *
+ * As _highest_ bits are used for the test (which is natural) scaled integers
+ * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
+ * uses LITTLE-ENDIAN.
+ *
+ */
+
+#include "debug.h"
+#include "dscale.h"
+
+/* return tag of scaled integer stored at @address */
+static int gettag(const unsigned char *address)
+{
+	/* tag is stored in two highest bits */
+	return (*address) >> 6;
+}
+
+/* clear tag from value. Clear tag embedded into @value. */
+static void cleartag(__u64 * value, int tag)
+{
+	/*
+	 * W-w-what ?!
+	 *
+	 * Actually, this is rather simple: @value passed here was read by
+	 * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
+	 * zeroes. Tag is still stored in the highest (arithmetically)
+	 * non-zero bits of @value, but relative position of tag within __u64
+	 * depends on @tag.
+	 *
+	 * For example if @tag is 0, it's stored 2 highest bits of lowest
+	 * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits.
+	 *
+	 * If tag is 1, it's stored in two highest bits of 2nd lowest byte,
+	 * and it's offset if (2 * 8) - 2 == 14 bits.
+	 *
+	 * See table 1 above for details.
+	 *
+	 * All these cases are captured by the formula:
+	 */
+	*value &= ~(3 << (((1 << tag) << 3) - 2));
+	/*
+	 * That is, clear two (3 == 0t11) bits at the offset
+	 *
+	 *                  8 * (2 ^ tag) - 2,
+	 *
+	 * that is, two highest bits of (2 ^ tag)-th byte of @value.
+	 */
+}
+
+/* return tag for @value. See table 1 above for details. */
+static int dscale_range(__u64 value)
+{
+	if (value > 0x3fffffff)
+		return 3;
+	if (value > 0x3fff)
+		return 2;
+	if (value > 0x3f)
+		return 1;
+	return 0;
+}
+
+/* restore value stored at @adderss by dscale_write() and return number of
+ * bytes consumed */
+int dscale_read(unsigned char *address, __u64 * value)
+{
+	int tag;
+
+	/* read tag */
+	tag = gettag(address);
+	switch (tag) {
+	case 3:
+		/* In this case tag is stored in an extra byte, skip this byte
+		 * and decode value stored in the next 8 bytes.*/
+		*value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
+		/* worst case: 8 bytes for value itself plus one byte for
+		 * tag. */
+		return 9;
+	case 0:
+		*value = get_unaligned(address);
+		break;
+	case 1:
+		*value = __be16_to_cpu(get_unaligned((__be16 *)address));
+		break;
+	case 2:
+		*value = __be32_to_cpu(get_unaligned((__be32 *)address));
+		break;
+	default:
+		return RETERR(-EIO);
+	}
+	/* clear tag embedded into @value */
+	cleartag(value, tag);
+	/* number of bytes consumed is (2 ^ tag)---see table 1. */
+	return 1 << tag;
+}
+
+/* store @value at @address and return number of bytes consumed */
+int dscale_write(unsigned char *address, __u64 value)
+{
+	int tag;
+	int shift;
+	__be64 v;
+	unsigned char *valarr;
+
+	tag = dscale_range(value);
+	v = __cpu_to_be64(value);
+	valarr = (unsigned char *)&v;
+	shift = (tag == 3) ? 1 : 0;
+	memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
+	*address |= (tag << 6);
+	return shift + (1 << tag);
+}
+
+/* number of bytes required to store @value */
+int dscale_bytes(__u64 value)
+{
+	int bytes;
+
+	bytes = 1 << dscale_range(value);
+	if (bytes == 8)
+		++bytes;
+	return bytes;
+}
+
+/* returns true if @value and @other require the same number of bytes to be
+ * stored. Used by detect when data structure (like stat-data) has to be
+ * expanded or contracted. */
+int dscale_fit(__u64 value, __u64 other)
+{
+	return dscale_range(value) == dscale_range(other);
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/dscale.h linux-2.6.22/fs/reiser4/dscale.h
--- linux-2.6.22.orig/fs/reiser4/dscale.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/dscale.h	2007-07-29 00:25:34.836686123 +0400
@@ -0,0 +1,27 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Scalable on-disk integers. See dscale.h for details. */
+
+#if !defined( __FS_REISER4_DSCALE_H__ )
+#define __FS_REISER4_DSCALE_H__
+
+#include "dformat.h"
+
+extern int dscale_read(unsigned char *address, __u64 * value);
+extern int dscale_write(unsigned char *address, __u64 value);
+extern int dscale_bytes(__u64 value);
+extern int dscale_fit(__u64 value, __u64 other);
+
+/* __FS_REISER4_DSCALE_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/entd.c linux-2.6.22/fs/reiser4/entd.c
--- linux-2.6.22.orig/fs/reiser4/entd.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/entd.c	2007-07-29 00:25:34.840687159 +0400
@@ -0,0 +1,335 @@
+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Ent daemon. */
+
+#include "debug.h"
+#include "txnmgr.h"
+#include "tree.h"
+#include "entd.h"
+#include "super.h"
+#include "context.h"
+#include "reiser4.h"
+#include "vfs_ops.h"
+#include "page_cache.h"
+#include "inode.h"
+
+#include <linux/sched.h>	/* struct task_struct */
+#include <linux/suspend.h>
+#include <linux/kernel.h>
+#include <linux/writeback.h>
+#include <linux/time.h>		/* INITIAL_JIFFIES */
+#include <linux/backing-dev.h>	/* bdi_write_congested */
+#include <linux/wait.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#define DEF_PRIORITY 12
+#define MAX_ENTD_ITERS 10
+
+static void entd_flush(struct super_block *, struct wbq *);
+static int entd(void *arg);
+
+/*
+ * set ->comm field of end thread to make its state visible to the user level
+ */
+#define entd_set_comm(state)					\
+	snprintf(current->comm, sizeof(current->comm),	\
+	         "ent:%s%s", super->s_id, (state))
+
+/**
+ * reiser4_init_entd - initialize entd context and start kernel daemon
+ * @super: super block to start ent thread for
+ *
+ * Creates entd contexts, starts kernel thread and waits until it
+ * initializes.
+ */
+int reiser4_init_entd(struct super_block *super)
+{
+	entd_context *ctx;
+
+	assert("nikita-3104", super != NULL);
+
+	ctx = get_entd_context(super);
+
+	memset(ctx, 0, sizeof *ctx);
+	spin_lock_init(&ctx->guard);
+	init_waitqueue_head(&ctx->wait);
+#if REISER4_DEBUG
+	INIT_LIST_HEAD(&ctx->flushers_list);
+#endif
+	/* lists of writepage requests */
+	INIT_LIST_HEAD(&ctx->todo_list);
+	INIT_LIST_HEAD(&ctx->done_list);
+	/* start entd */
+	ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
+	if (IS_ERR(ctx->tsk))
+		return PTR_ERR(ctx->tsk);
+	return 0;
+}
+
+static void put_wbq(struct wbq *rq)
+{
+	iput(rq->mapping->host);
+	complete(&rq->completion);
+}
+
+/* ent should be locked */
+static struct wbq *__get_wbq(entd_context * ent)
+{
+	struct wbq *wbq;
+
+	if (list_empty(&ent->todo_list))
+		return NULL;
+
+	ent->nr_todo_reqs --;
+	wbq = list_entry(ent->todo_list.next, struct wbq, link);
+	list_del_init(&wbq->link);
+	return wbq;
+}
+
+/* ent thread function */
+static int entd(void *arg)
+{
+	struct super_block *super;
+	entd_context *ent;
+	int done = 0;
+
+	super = arg;
+	/* do_fork() just copies task_struct into the new
+	   thread. ->fs_context shouldn't be copied of course. This shouldn't
+	   be a problem for the rest of the code though.
+	 */
+	current->journal_info = NULL;
+
+	ent = get_entd_context(super);
+
+	while (!done) {
+		try_to_freeze();
+
+		spin_lock(&ent->guard);
+		while (ent->nr_todo_reqs != 0) {
+			struct wbq *rq;
+
+			assert("", list_empty(&ent->done_list));
+
+			/* take request from the queue head */
+			rq = __get_wbq(ent);
+			assert("", rq != NULL);
+			ent->cur_request = rq;
+			spin_unlock(&ent->guard);
+
+			entd_set_comm("!");
+			entd_flush(super, rq);
+
+			put_wbq(rq);
+
+			/*
+			 * wakeup all requestors and iput their inodes
+			 */
+			spin_lock(&ent->guard);
+			while (!list_empty(&ent->done_list)) {
+				rq = list_entry(ent->done_list.next, struct wbq, link);
+				list_del_init(&rq->link);
+				ent->nr_done_reqs --;
+				spin_unlock(&ent->guard);
+				assert("", rq->written == 1);
+				put_wbq(rq);
+				spin_lock(&ent->guard);
+			}
+		}
+		spin_unlock(&ent->guard);
+
+		entd_set_comm(".");
+
+		{
+			DEFINE_WAIT(__wait);
+
+			do {
+				prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
+				if (kthread_should_stop()) {
+					done = 1;
+					break;
+				}
+				if (ent->nr_todo_reqs != 0)
+					break;
+				schedule();
+			} while (0);
+			finish_wait(&ent->wait, &__wait);
+		}
+	}
+	BUG_ON(ent->nr_todo_reqs != 0);
+	return 0;
+}
+
+/**
+ * reiser4_done_entd - stop entd kernel thread
+ * @super: super block to stop ent thread for
+ *
+ * It is called on umount. Sends stop signal to entd and wait until it handles
+ * it.
+ */
+void reiser4_done_entd(struct super_block *super)
+{
+	entd_context *ent;
+
+	assert("nikita-3103", super != NULL);
+
+	ent = get_entd_context(super);
+	assert("zam-1055", ent->tsk != NULL);
+	kthread_stop(ent->tsk);
+}
+
+/* called at the beginning of jnode_flush to register flusher thread with ent
+ * daemon */
+void reiser4_enter_flush(struct super_block *super)
+{
+	entd_context *ent;
+
+	assert("zam-1029", super != NULL);
+	ent = get_entd_context(super);
+
+	assert("zam-1030", ent != NULL);
+
+	spin_lock(&ent->guard);
+	ent->flushers++;
+#if REISER4_DEBUG
+	list_add(&get_current_context()->flushers_link, &ent->flushers_list);
+#endif
+	spin_unlock(&ent->guard);
+}
+
+/* called at the end of jnode_flush */
+void reiser4_leave_flush(struct super_block *super)
+{
+	entd_context *ent;
+	int wake_up_ent;
+
+	assert("zam-1027", super != NULL);
+	ent = get_entd_context(super);
+
+	assert("zam-1028", ent != NULL);
+
+	spin_lock(&ent->guard);
+	ent->flushers--;
+	wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
+#if REISER4_DEBUG
+	list_del_init(&get_current_context()->flushers_link);
+#endif
+	spin_unlock(&ent->guard);
+	if (wake_up_ent)
+		wake_up(&ent->wait);
+}
+
+#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
+
+static void entd_flush(struct super_block *super, struct wbq *rq)
+{
+	reiser4_context ctx;
+	int tmp;
+
+	init_stack_context(&ctx, super);
+	ctx.entd = 1;
+	ctx.gfp_mask = GFP_NOFS;
+
+	rq->wbc->range_start = page_offset(rq->page);
+	rq->wbc->range_end = rq->wbc->range_start +
+		(ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT);
+	tmp = rq->wbc->nr_to_write;
+	rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
+
+	if (rq->wbc->nr_to_write > 0) {
+		rq->wbc->range_start = 0;
+		rq->wbc->range_end = LLONG_MAX;
+		generic_sync_sb_inodes(super, rq->wbc);
+	}
+	rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
+	reiser4_writeout(super, rq->wbc);
+
+	context_set_commit_async(&ctx);
+	reiser4_exit_context(&ctx);
+}
+
+/**
+ * write_page_by_ent - ask entd thread to flush this page as part of slum
+ * @page: page to be written
+ * @wbc: writeback control passed to reiser4_writepage
+ *
+ * Creates a request, puts it on entd list of requests, wakeups entd if
+ * necessary, waits until entd completes with the request.
+ */
+int write_page_by_ent(struct page *page, struct writeback_control *wbc)
+{
+	struct super_block *sb;
+	struct inode *inode;
+	entd_context *ent;
+	struct wbq rq;
+
+	assert("", PageLocked(page));
+	assert("", page->mapping != NULL);
+
+	sb = page->mapping->host->i_sb;
+	ent = get_entd_context(sb);
+	assert("", ent && ent->done == 0);
+
+	/*
+	 * we are going to unlock page and ask ent thread to write the
+	 * page. Re-dirty page before unlocking so that if ent thread fails to
+	 * write it - it will remain dirty
+	 */
+	reiser4_set_page_dirty_internal(page);
+
+	/*
+	 * pin inode in memory, unlock page, entd_flush will iput. We can not
+	 * iput here becasue we can not allow delete_inode to be called here
+	 */
+	inode = igrab(page->mapping->host);
+	unlock_page(page);
+	if (inode == NULL)
+		/* inode is getting freed */
+		return 0;
+
+	/* init wbq */
+	INIT_LIST_HEAD(&rq.link);
+	rq.magic = WBQ_MAGIC;
+	rq.wbc = wbc;
+	rq.page = page;
+	rq.mapping = inode->i_mapping;
+	rq.node = NULL;
+	rq.written = 0;
+	init_completion(&rq.completion);
+
+	/* add request to entd's list of writepage requests */
+	spin_lock(&ent->guard);
+	ent->nr_todo_reqs++;
+	list_add_tail(&rq.link, &ent->todo_list);
+	if (ent->nr_todo_reqs == 1)
+		wake_up(&ent->wait);
+
+	spin_unlock(&ent->guard);
+
+	/* wait until entd finishes */
+	wait_for_completion(&rq.completion);
+
+	if (rq.written)
+		/* Eventually ENTD has written the page to disk. */
+		return 0;
+	return 0;
+}
+
+int wbq_available(void)
+{
+	struct super_block *sb = reiser4_get_current_sb();
+	entd_context *ent = get_entd_context(sb);
+	return ent->nr_todo_reqs;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN linux-2.6.22.orig/fs/reiser4/entd.h linux-2.6.22/fs/reiser4/entd.h
--- linux-2.6.22.orig/fs/reiser4/entd.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/entd.h	2007-07-29 00:25:34.840687159 +0400
@@ -0,0 +1,90 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Ent daemon. */
+
+#ifndef __ENTD_H__
+#define __ENTD_H__
+
+#include "context.h"
+
+#include <linux/fs.h>
+#include <linux/completion.h>
+#include <linux/wait.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>	/* for struct task_struct */
+
+#define WBQ_MAGIC 0x7876dc76
+
+/* write-back request. */
+struct wbq {
+	int magic;
+	struct list_head link; /* list head of this list is in entd context */
+	struct writeback_control *wbc;
+	struct page *page;
+	struct address_space *mapping;
+	struct completion completion;
+	jnode *node; /* set if ent thread captured requested page */
+	int written; /* set if ent thread wrote requested page */
+};
+
+/* ent-thread context. This is used to synchronize starting/stopping ent
+ * threads. */
+typedef struct entd_context {
+	 /* wait queue that ent thread waits on for more work. It's
+	  * signaled by write_page_by_ent(). */
+	wait_queue_head_t wait;
+	/* spinlock protecting other fields */
+	spinlock_t guard;
+	/* ent thread */
+	struct task_struct *tsk;
+	/* set to indicate that ent thread should leave. */
+	int done;
+	/* counter of active flushers */
+	int flushers;
+	/*
+	 * when reiser4_writepage asks entd to write a page - it adds struct
+	 * wbq to this list
+	 */
+	struct list_head todo_list;
+	/* number of elements on the above list */
+	int nr_todo_reqs;
+
+	struct wbq *cur_request;
+	/*
+	 * when entd writes a page it moves write-back request from todo_list
+	 * to done_list. This list is used at the end of entd iteration to
+	 * wakeup requestors and iput inodes.
+	 */
+	struct list_head done_list;
+	/* number of elements on the above list */
+	int nr_done_reqs;
+
+#if REISER4_DEBUG
+	/* list of all active flushers */
+	struct list_head flushers_list;
+#endif
+} entd_context;
+
+extern int  reiser4_init_entd(struct super_block *);
+extern void reiser4_done_entd(struct super_block *);
+
+extern void reiser4_enter_flush(struct super_block *);
+extern void reiser4_leave_flush(struct super_block *);
+
+extern int write_page_by_ent(struct page *, struct writeback_control *);
+extern int wbq_available(void);
+extern void ent_writes_page(struct super_block *, struct page *);
+
+extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
+/* __ENTD_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/eottl.c linux-2.6.22/fs/reiser4/eottl.c
--- linux-2.6.22.orig/fs/reiser4/eottl.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/eottl.c	2007-07-29 00:25:34.840687159 +0400
@@ -0,0 +1,509 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "forward.h"
+#include "debug.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/node/node.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree_walk.h"
+#include "tree_mod.h"
+#include "carry.h"
+#include "tree.h"
+#include "super.h"
+
+#include <linux/types.h>	/* for __u??  */
+
+/*
+ * Extents on the twig level (EOTTL) handling.
+ *
+ * EOTTL poses some problems to the tree traversal, that are better explained
+ * by example.
+ *
+ * Suppose we have block B1 on the twig level with the following items:
+ *
+ * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
+ * offset)
+ * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
+ * 2. internal item I2 with key (10:0:0:0)
+ *
+ * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
+ * then intra-node lookup is done. This lookup finished on the E1, because the
+ * key we are looking for is larger than the key of E1 and is smaller than key
+ * the of I2.
+ *
+ * Here search is stuck.
+ *
+ * After some thought it is clear what is wrong here: extents on the twig level
+ * break some basic property of the *search* tree (on the pretext, that they
+ * restore property of balanced tree).
+ *
+ * Said property is the following: if in the internal node of the search tree
+ * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
+ * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
+ * through the Pointer.
+ *
+ * This is not true, when Pointer is Extent-Pointer, simply because extent
+ * cannot expand indefinitely to the right to include any item with
+ *
+ *   Key1 <= Key <= Key2.
+ *
+ * For example, our E1 extent is only responsible for the data with keys
+ *
+ *   (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
+ *
+ * so, key range
+ *
+ *   ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
+ *
+ * is orphaned: there is no way to get there from the tree root.
+ *
+ * In other words, extent pointers are different than normal child pointers as
+ * far as search tree is concerned, and this creates such problems.
+ *
+ * Possible solution for this problem is to insert our item into node pointed
+ * to by I2. There are some problems through:
+ *
+ * (1) I2 can be in a different node.
+ * (2) E1 can be immediately followed by another extent E2.
+ *
+ * (1) is solved by calling reiser4_get_right_neighbor() and accounting
+ * for locks/coords as necessary.
+ *
+ * (2) is more complex. Solution here is to insert new empty leaf node and
+ * insert internal item between E1 and E2 pointing to said leaf node. This is
+ * further complicated by possibility that E2 is in a different node, etc.
+ *
+ * Problems:
+ *
+ * (1) if there was internal item I2 immediately on the right of an extent E1
+ * we and we decided to insert new item S1 into node N2 pointed to by I2, then
+ * key of S1 will be less than smallest key in the N2. Normally, search key
+ * checks that key we are looking for is in the range of keys covered by the
+ * node key is being looked in. To work around of this situation, while
+ * preserving useful consistency check new flag CBK_TRUST_DK was added to the
+ * cbk falgs bitmask. This flag is automatically set on entrance to the
+ * coord_by_key() and is only cleared when we are about to enter situation
+ * described above.
+ *
+ * (2) If extent E1 is immediately followed by another extent E2 and we are
+ * searching for the key that is between E1 and E2 we only have to insert new
+ * empty leaf node when coord_by_key was called for insertion, rather than just
+ * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to
+ * the cbk falgs bitmask. This flag is automatically set by coord_by_key calls
+ * performed by insert_by_key() and friends.
+ *
+ * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
+ * case it requires modification of node content which is only possible under
+ * write lock. It may well happen that we only have read lock on the node where
+ * new internal pointer is to be inserted (common case: lookup of non-existent
+ * stat-data that fells between two extents). If only read lock is held, tree
+ * traversal is restarted with lock_level modified so that next time we hit
+ * this problem, write lock will be held. Once we have write lock, balancing
+ * will be performed.
+ */
+
+/**
+ * is_next_item_internal - check whether next item is internal
+ * @coord: coordinate of extent item in twig node
+ * @key: search key
+ * @lh: twig node lock handle
+ *
+ * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
+ * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
+ * to that node, @coord is set to its first unit. If next item is not internal
+ * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
+ * is returned if search restart has to be done.
+ */
+static int
+is_next_item_internal(coord_t *coord, const reiser4_key *key,
+		      lock_handle *lh)
+{
+	coord_t next;
+	lock_handle rn;
+	int result;
+
+	coord_dup(&next, coord);
+	if (coord_next_unit(&next) == 0) {
+		/* next unit is in this node */
+		if (item_is_internal(&next)) {
+			coord_dup(coord, &next);
+			return 1;
+		}
+		assert("vs-3", item_is_extent(&next));
+		return 0;
+	}
+
+	/*
+	 * next unit either does not exist or is in right neighbor. If it is in
+	 * right neighbor we have to check right delimiting key because
+	 * concurrent thread could get their first and insert item with a key
+	 * smaller than @key
+	 */
+	read_lock_dk(current_tree);
+	result = keycmp(key, znode_get_rd_key(coord->node));
+	read_unlock_dk(current_tree);
+	assert("vs-6", result != EQUAL_TO);
+	if (result == GREATER_THAN)
+		return 2;
+
+	/* lock right neighbor */
+	init_lh(&rn);
+	result = reiser4_get_right_neighbor(&rn, coord->node,
+					    znode_is_wlocked(coord->node) ?
+					    ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
+					    GN_CAN_USE_UPPER_LEVELS);
+	if (result == -E_NO_NEIGHBOR) {
+		/* we are on the rightmost edge of the tree */
+		done_lh(&rn);
+		return 0;
+	}
+
+	if (result) {
+		assert("vs-4", result < 0);
+		done_lh(&rn);
+		return result;
+	}
+
+	/*
+	 * check whether concurrent thread managed to insert item with a key
+	 * smaller than @key
+	 */
+	read_lock_dk(current_tree);
+	result = keycmp(key, znode_get_ld_key(rn.node));
+	read_unlock_dk(current_tree);
+	assert("vs-6", result != EQUAL_TO);
+	if (result == GREATER_THAN) {
+		done_lh(&rn);
+		return 2;
+	}
+
+	result = zload(rn.node);
+	if (result) {
+		assert("vs-5", result < 0);
+		done_lh(&rn);
+		return result;
+	}
+
+	coord_init_first_unit(&next, rn.node);
+	if (item_is_internal(&next)) {
+		/*
+		 * next unit is in right neighbor and it is an unit of internal
+		 * item. Unlock coord->node. Move @lh to right neighbor. @coord
+		 * is set to the first unit of right neighbor.
+		 */
+		coord_dup(coord, &next);
+		zrelse(rn.node);
+		done_lh(lh);
+		move_lh(lh, &rn);
+		return 1;
+	}
+
+	/*
+	 * next unit is unit of extent item. Return without chaning @lh and
+	 * @coord.
+	 */
+	assert("vs-6", item_is_extent(&next));
+	zrelse(rn.node);
+	done_lh(&rn);
+	return 0;
+}
+
+/**
+ * rd_key - calculate key of an item next to the given one
+ * @coord: position in a node
+ * @key: storage for result key
+ *
+ * @coord is set between items or after the last item in a node. Calculate key
+ * of item to the right of @coord.
+ */
+static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
+{
+	coord_t dup;
+
+	assert("nikita-2281", coord_is_between_items(coord));
+	coord_dup(&dup, coord);
+
+	if (coord_set_to_right(&dup) == 0)
+		/* next item is in this node. Return its key. */
+		unit_key_by_coord(&dup, key);
+	else {
+		/*
+		 * next item either does not exist or is in right
+		 * neighbor. Return znode's right delimiting key.
+		 */
+		read_lock_dk(current_tree);
+		*key = *znode_get_rd_key(coord->node);
+		read_unlock_dk(current_tree);
+	}
+	return key;
+}
+
+/**
+ * add_empty_leaf - insert empty leaf between two extents
+ * @insert_coord: position in twig node between two extents
+ * @lh: twig node lock handle
+ * @key: left delimiting key of new node
+ * @rdkey: right delimiting key of new node
+ *
+ * Inserts empty leaf node between two extent items. It is necessary when we
+ * have to insert an item on leaf level between two extents (items on the twig
+ * level).
+ */
+static int
+add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
+	       const reiser4_key *key, const reiser4_key *rdkey)
+{
+	int result;
+	carry_pool *pool;
+	carry_level *todo;
+	reiser4_item_data *item;
+	carry_insert_data *cdata;
+	carry_op *op;
+	znode *node;
+	reiser4_tree *tree;
+
+	assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
+	tree = znode_get_tree(insert_coord->node);
+	node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
+	if (IS_ERR(node))
+		return PTR_ERR(node);
+
+	/* setup delimiting keys for node being inserted */
+	write_lock_dk(tree);
+	znode_set_ld_key(node, key);
+	znode_set_rd_key(node, rdkey);
+	ON_DEBUG(node->creator = current);
+	ON_DEBUG(node->first_key = *key);
+	write_unlock_dk(tree);
+
+	ZF_SET(node, JNODE_ORPHAN);
+
+	/*
+	 * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
+	 * carry_insert_data
+	 */
+	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
+			       sizeof(*item) + sizeof(*cdata));
+	if (IS_ERR(pool))
+		return PTR_ERR(pool);
+	todo = (carry_level *) (pool + 1);
+	init_carry_level(todo, pool);
+
+	item = (reiser4_item_data *) (todo + 3);
+	cdata = (carry_insert_data *) (item + 1);
+
+	op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
+	if (!IS_ERR(op)) {
+		cdata->coord = insert_coord;
+		cdata->key = key;
+		cdata->data = item;
+		op->u.insert.d = cdata;
+		op->u.insert.type = COPT_ITEM_DATA;
+		build_child_ptr_data(node, item);
+		item->arg = NULL;
+		/* have @insert_coord to be set at inserted item after
+		   insertion is done */
+		todo->track_type = CARRY_TRACK_CHANGE;
+		todo->tracked = lh;
+
+		result = reiser4_carry(todo, NULL);
+		if (result == 0) {
+			/*
+			 * pin node in memory. This is necessary for
+			 * znode_make_dirty() below.
+			 */
+			result = zload(node);
+			if (result == 0) {
+				lock_handle local_lh;
+
+				/*
+				 * if we inserted new child into tree we have
+				 * to mark it dirty so that flush will be able
+				 * to process it.
+				 */
+				init_lh(&local_lh);
+				result = longterm_lock_znode(&local_lh, node,
+							     ZNODE_WRITE_LOCK,
+							     ZNODE_LOCK_LOPRI);
+				if (result == 0) {
+					znode_make_dirty(node);
+
+					/*
+					 * when internal item pointing to @node
+					 * was inserted into twig node
+					 * create_hook_internal did not connect
+					 * it properly because its right
+					 * neighbor was not known. Do it
+					 * here
+					 */
+					write_lock_tree(tree);
+					assert("nikita-3312",
+					       znode_is_right_connected(node));
+					assert("nikita-2984",
+					       node->right == NULL);
+					ZF_CLR(node, JNODE_RIGHT_CONNECTED);
+					write_unlock_tree(tree);
+					result =
+					    connect_znode(insert_coord, node);
+					ON_DEBUG(if (result == 0) check_dkeys(node););
+
+					done_lh(lh);
+					move_lh(lh, &local_lh);
+					assert("vs-1676", node_is_empty(node));
+					coord_init_first_unit(insert_coord,
+							      node);
+				} else {
+					warning("nikita-3136",
+						"Cannot lock child");
+				}
+				done_lh(&local_lh);
+				zrelse(node);
+			}
+		}
+	} else
+		result = PTR_ERR(op);
+	zput(node);
+	done_carry_pool(pool);
+	return result;
+}
+
+/**
+ * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
+ * @h: search handle
+ * @outcome: flag saying whether search has to restart or is done
+ *
+ * Handles search on twig level. If this function completes search itself then
+ * it returns 1. If search has to go one level down then 0 is returned. If
+ * error happens then LOOKUP_DONE is returned via @outcome and error code is saved
+ * in @h->result.
+ */
+int handle_eottl(cbk_handle *h, int *outcome)
+{
+	int result;
+	reiser4_key key;
+	coord_t *coord;
+
+	coord = h->coord;
+
+	if (h->level != TWIG_LEVEL ||
+	    (coord_is_existing_item(coord) && item_is_internal(coord))) {
+		/* Continue to traverse tree downward. */
+		return 0;
+	}
+
+	/*
+	 * make sure that @h->coord is set to twig node and that it is either
+	 * set to extent item or after extent item
+	 */
+	assert("vs-356", h->level == TWIG_LEVEL);
+	assert("vs-357", ( {
+			  coord_t lcoord;
+			  coord_dup(&lcoord, coord);
+			  check_me("vs-733", coord_set_to_left(&lcoord) == 0);
+			  item_is_extent(&lcoord);
+			  }
+	       ));
+
+	if (*outcome == NS_FOUND) {
+		/* we have found desired key on twig level in extent item */
+		h->result = CBK_COORD_FOUND;
+		*outcome = LOOKUP_DONE;
+		return 1;
+	}
+
+	if (!(h->flags & CBK_FOR_INSERT)) {
+		/* tree traversal is not for insertion. Just return
+		   CBK_COORD_NOTFOUND. */
+		h->result = CBK_COORD_NOTFOUND;
+		*outcome = LOOKUP_DONE;
+		return 1;
+	}
+
+	/* take a look at the item to the right of h -> coord */
+	result = is_next_item_internal(coord, h->key, h->active_lh);
+	if (unlikely(result < 0)) {
+		h->error = "get_right_neighbor failed";
+		h->result = result;
+		*outcome = LOOKUP_DONE;
+		return 1;
+	}
+	if (result == 0) {
+		/*
+		 * item to the right is also an extent one. Allocate a new node
+		 * and insert pointer to it after item h -> coord.
+		 *
+		 * This is a result of extents being located at the twig
+		 * level. For explanation, see comment just above
+		 * is_next_item_internal().
+		 */
+		znode *loaded;
+
+		if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
+			/*
+			 * we got node read locked, restart coord_by_key to
+			 * have write lock on twig level
+			 */
+			h->lock_level = TWIG_LEVEL;
+			h->lock_mode = ZNODE_WRITE_LOCK;
+			*outcome = LOOKUP_REST;
+			return 1;
+		}
+
+		loaded = coord->node;
+		result =
+		    add_empty_leaf(coord, h->active_lh, h->key,
+				   rd_key(coord, &key));
+		if (result) {
+			h->error = "could not add empty leaf";
+			h->result = result;
+			*outcome = LOOKUP_DONE;
+			return 1;
+		}
+		/* added empty leaf is locked (h->active_lh), its parent node
+		   is unlocked, h->coord is set as EMPTY */
+		assert("vs-13", coord->between == EMPTY_NODE);
+		assert("vs-14", znode_is_write_locked(coord->node));
+		assert("vs-15",
+		       WITH_DATA(coord->node, node_is_empty(coord->node)));
+		assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
+		assert("vs-17", coord->node == h->active_lh->node);
+		*outcome = LOOKUP_DONE;
+		h->result = CBK_COORD_NOTFOUND;
+		return 1;
+	} else if (result == 1) {
+		/*
+		 * this is special case mentioned in the comment on
+		 * tree.h:cbk_flags. We have found internal item immediately on
+		 * the right of extent, and we are going to insert new item
+		 * there. Key of item we are going to insert is smaller than
+		 * leftmost key in the node pointed to by said internal item
+		 * (otherwise search wouldn't come to the extent in the first
+		 * place).
+		 *
+		 * This is a result of extents being located at the twig
+		 * level. For explanation, see comment just above
+		 * is_next_item_internal().
+		 */
+		h->flags &= ~CBK_TRUST_DK;
+	} else {
+		assert("vs-8", result == 2);
+		*outcome = LOOKUP_REST;
+		return 1;
+	}
+	assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
+	return 0;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 120
+ * scroll-step: 1
+ * End:
+ */
diff -urN linux-2.6.22.orig/fs/reiser4/estimate.c linux-2.6.22/fs/reiser4/estimate.c
--- linux-2.6.22.orig/fs/reiser4/estimate.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/estimate.c	2007-07-29 00:25:34.840687159 +0400
@@ -0,0 +1,120 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "debug.h"
+#include "dformat.h"
+#include "tree.h"
+#include "carry.h"
+#include "inode.h"
+#include "plugin/cluster.h"
+#include "plugin/item/ctail.h"
+
+/* this returns how many nodes might get dirty and added nodes if @children nodes are dirtied
+
+   Amount of internals which will get dirty or get allocated we estimate as 5% of the childs + 1 balancing. 1 balancing
+   is 2 neighbours, 2 new blocks and the current block on the leaf level, 2 neighbour nodes + the current (or 1
+   neighbour and 1 new and the current) on twig level, 2 neighbour nodes on upper levels and 1 for a new root. So 5 for
+   leaf level, 3 for twig level, 2 on upper + 1 for root.
+
+   Do not calculate the current node of the lowest level here - this is overhead only.
+
+   children is almost always 1 here. Exception is flow insertion
+*/
+static reiser4_block_nr
+max_balance_overhead(reiser4_block_nr childen, tree_level tree_height)
+{
+	reiser4_block_nr ten_percent;
+
+	ten_percent = ((103 * childen) >> 10);
+
+	/* If we have too many balancings at the time, tree height can raise on more
+	   then 1. Assume that if tree_height is 5, it can raise on 1 only. */
+	return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
+}
+
+/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to
+   perform insertion of one item into the tree */
+/* it is only called when tree height changes, or gets initialized */
+reiser4_block_nr calc_estimate_one_insert(tree_level height)
+{
+	return 1 + max_balance_overhead(1, height);
+}
+
+reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
+{
+	return tree->estimate_one_insert;
+}
+
+/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to
+   perform insertion of one unit into an item in the tree */
+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
+{
+	/* estimate insert into item just like item insertion */
+	return tree->estimate_one_insert;
+}
+
+reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
+{
+	/* on item removal reiser4 does not try to pack nodes more complact, so, only one node may be dirtied on leaf
+	   level */
+	return tree->estimate_one_insert;
+}
+
+/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and
+   both its neighbors). Max_balance_overhead should estimate number of blocks which may change/get added on internal
+   levels */
+reiser4_block_nr estimate_insert_flow(tree_level height)
+{
+	return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
+								     CARRY_FLOW_NEW_NODES_LIMIT,
+								     height);
+}
+
+/* returnes max number of nodes can be occupied by disk cluster */
+static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped)
+{
+	int per_cluster;
+	per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
+	return 3 + per_cluster +
+		max_balance_overhead(3 + per_cluster,
+				     REISER4_MAX_ZTREE_HEIGHT);
+}
+
+/* how many nodes might get dirty and added
+   during insertion of a disk cluster */
+reiser4_block_nr estimate_insert_cluster(struct inode * inode)
+{
+	return estimate_cluster(inode, 1); /* 24 */
+}
+
+/* how many nodes might get dirty and added
+   during update of a (prepped or unprepped) disk cluster */
+reiser4_block_nr estimate_update_cluster(struct inode * inode)
+{
+	return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
+}
+
+/* How many nodes occupied by a disk cluster might get dirty.
+   Note that this estimation is not precise (i.e. disk cluster
+   can occupy more nodes).
+   Q: Why we don't use precise estimation?
+   A: 1.Because precise estimation is fairly bad: 65536 nodes
+        for 64K logical cluster, it means 256M of dead space on
+	a partition
+      2.It is a very rare case when disk cluster occupies more
+        nodes then this estimation returns.
+*/
+reiser4_block_nr estimate_dirty_cluster(struct inode * inode)
+{
+	return cluster_nrpages(inode) + 4;
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/export_ops.c linux-2.6.22/fs/reiser4/export_ops.c
--- linux-2.6.22.orig/fs/reiser4/export_ops.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/export_ops.c	2007-07-29 00:25:34.840687159 +0400
@@ -0,0 +1,295 @@
+/* Copyright 2005 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+#include "inode.h"
+#include "plugin/plugin.h"
+
+/*
+ * Supported file-handle types
+ */
+typedef enum {
+	FH_WITH_PARENT = 0x10,	/* file handle with parent */
+	FH_WITHOUT_PARENT = 0x11	/* file handle without parent */
+} reiser4_fhtype;
+
+#define NFSERROR (255)
+
+/* initialize place-holder for object */
+static void object_on_wire_init(reiser4_object_on_wire *o)
+{
+	o->plugin = NULL;
+}
+
+/* finish with @o */
+static void object_on_wire_done(reiser4_object_on_wire *o)
+{
+	if (o->plugin != NULL)
+		o->plugin->wire.done(o);
+}
+
+/*
+ * read serialized object identity from @addr and store information about
+ * object in @obj. This is dual to encode_inode().
+ */
+static char *decode_inode(struct super_block *s, char *addr,
+			  reiser4_object_on_wire * obj)
+{
+	file_plugin *fplug;
+
+	/* identifier of object plugin is stored in the first two bytes,
+	 * followed by... */
+	fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr);
+	if (fplug != NULL) {
+		addr += sizeof(d16);
+		obj->plugin = fplug;
+		assert("nikita-3520", fplug->wire.read != NULL);
+		/* plugin specific encoding of object identity. */
+		addr = fplug->wire.read(addr, obj);
+	} else
+		addr = ERR_PTR(RETERR(-EINVAL));
+	return addr;
+}
+
+/**
+ * reiser4_decode_fh - decode_fh of export operations
+ * @super: super block
+ * @fh: nfsd file handle
+ * @len: length of file handle
+ * @fhtype: type of file handle
+ * @acceptable: acceptability testing function
+ * @context: argument for @acceptable
+ *
+ * Returns dentry referring to the same file as @fh.
+ */
+static struct dentry *reiser4_decode_fh(struct super_block *super, __u32 *fh,
+					int len, int fhtype,
+					int (*acceptable) (void *context,
+							   struct dentry *de),
+					void *context)
+{
+	reiser4_context *ctx;
+	reiser4_object_on_wire object;
+	reiser4_object_on_wire parent;
+	char *addr;
+	int with_parent;
+
+	ctx = reiser4_init_context(super);
+	if (IS_ERR(ctx))
+		return (struct dentry *)ctx;
+
+	assert("vs-1482",
+	       fhtype == FH_WITH_PARENT || fhtype == FH_WITHOUT_PARENT);
+
+	with_parent = (fhtype == FH_WITH_PARENT);
+
+	addr = (char *)fh;
+
+	object_on_wire_init(&object);
+	object_on_wire_init(&parent);
+
+	addr = decode_inode(super, addr, &object);
+	if (!IS_ERR(addr)) {
+		if (with_parent)
+			addr = decode_inode(super, addr, &parent);
+		if (!IS_ERR(addr)) {
+			struct dentry *d;
+			typeof(super->s_export_op->find_exported_dentry) fn;
+
+			fn = super->s_export_op->find_exported_dentry;
+			assert("nikita-3521", fn != NULL);
+			d = fn(super, &object, with_parent ? &parent : NULL,
+			       acceptable, context);
+			if (d != NULL && !IS_ERR(d))
+				/* FIXME check for -ENOMEM */
+			  	reiser4_get_dentry_fsdata(d)->stateless = 1;
+			addr = (char *)d;
+		}
+	}
+
+	object_on_wire_done(&object);
+	object_on_wire_done(&parent);
+
+	reiser4_exit_context(ctx);
+	return (void *)addr;
+}
+
+/*
+ * Object serialization support.
+ *
+ * To support knfsd file system provides export_operations that are used to
+ * construct and interpret NFS file handles. As a generalization of this,
+ * reiser4 object plugins have serialization support: it provides methods to
+ * create on-wire representation of identity of reiser4 object, and
+ * re-create/locate object given its on-wire identity.
+ *
+ */
+
+/*
+ * return number of bytes that on-wire representation of @inode's identity
+ * consumes.
+ */
+static int encode_inode_size(struct inode *inode)
+{
+	assert("nikita-3514", inode != NULL);
+	assert("nikita-3515", inode_file_plugin(inode) != NULL);
+	assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
+
+	return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
+}
+
+/*
+ * store on-wire representation of @inode's identity at the area beginning at
+ * @start.
+ */
+static char *encode_inode(struct inode *inode, char *start)
+{
+	assert("nikita-3517", inode != NULL);
+	assert("nikita-3518", inode_file_plugin(inode) != NULL);
+	assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
+
+	/*
+	 * first, store two-byte identifier of object plugin, then
+	 */
+	save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
+		       (d16 *) start);
+	start += sizeof(d16);
+	/*
+	 * call plugin to serialize object's identity
+	 */
+	return inode_file_plugin(inode)->wire.write(inode, start);
+}
+
+/* this returns number of 32 bit long numbers encoded in @lenp. 255 is
+ * returned if file handle can not be stored */
+/**
+ * reiser4_encode_fh - encode_fh of export operations
+ * @dentry:
+ * @fh:
+ * @lenp:
+ * @need_parent:
+ *
+ */
+static int
+reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
+		  int need_parent)
+{
+	struct inode *inode;
+	struct inode *parent;
+	char *addr;
+	int need;
+	int delta;
+	int result;
+	reiser4_context *ctx;
+
+	/*
+	 * knfsd asks as to serialize object in @dentry, and, optionally its
+	 * parent (if need_parent != 0).
+	 *
+	 * encode_inode() and encode_inode_size() is used to build
+	 * representation of object and its parent. All hard work is done by
+	 * object plugins.
+	 */
+	inode = dentry->d_inode;
+	parent = dentry->d_parent->d_inode;
+
+	addr = (char *)fh;
+
+	need = encode_inode_size(inode);
+	if (need < 0)
+		return NFSERROR;
+	if (need_parent) {
+		delta = encode_inode_size(parent);
+		if (delta < 0)
+			return NFSERROR;
+		need += delta;
+	}
+
+	ctx = reiser4_init_context(dentry->d_inode->i_sb);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	if (need <= sizeof(__u32) * (*lenp)) {
+		addr = encode_inode(inode, addr);
+		if (need_parent)
+			addr = encode_inode(parent, addr);
+
+		/* store in lenp number of 32bit words required for file
+		 * handle. */
+		*lenp = (need + sizeof(__u32) - 1) >> 2;
+		result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
+	} else
+		/* no enough space in file handle */
+		result = NFSERROR;
+	reiser4_exit_context(ctx);
+	return result;
+}
+
+/**
+ * reiser4_get_dentry_parent - get_parent of export operations
+ * @child:
+ *
+ */
+static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
+{
+	struct inode *dir;
+	dir_plugin *dplug;
+
+	assert("nikita-3527", child != NULL);
+	/* see comment in reiser4_get_dentry() about following assertion */
+	assert("nikita-3528", is_in_reiser4_context());
+
+	dir = child->d_inode;
+	assert("nikita-3529", dir != NULL);
+	dplug = inode_dir_plugin(dir);
+	assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
+	if (dplug != NULL)
+		return dplug->get_parent(dir);
+	else
+		return ERR_PTR(RETERR(-ENOTDIR));
+}
+
+/**
+ * reiser4_get_dentry - get_dentry of export operations
+ * @super:
+ * @data:
+ *
+ *
+ */
+static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
+{
+	reiser4_object_on_wire *o;
+
+	assert("nikita-3522", super != NULL);
+	assert("nikita-3523", data != NULL);
+	/*
+	 * this is only supposed to be called by
+	 *
+	 *     reiser4_decode_fh->find_exported_dentry
+	 *
+	 * so, reiser4_context should be here already.
+	 */
+	assert("nikita-3526", is_in_reiser4_context());
+
+	o = (reiser4_object_on_wire *)data;
+	assert("nikita-3524", o->plugin != NULL);
+	assert("nikita-3525", o->plugin->wire.get != NULL);
+
+	return o->plugin->wire.get(super, o);
+}
+
+struct export_operations reiser4_export_operations = {
+	.encode_fh = reiser4_encode_fh,
+	.decode_fh = reiser4_decode_fh,
+	.get_parent = reiser4_get_dentry_parent,
+	.get_dentry = reiser4_get_dentry
+};
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN linux-2.6.22.orig/fs/reiser4/flush.c linux-2.6.22/fs/reiser4/flush.c
--- linux-2.6.22.orig/fs/reiser4/flush.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/flush.c	2007-07-29 00:25:34.000000000 +0400
@@ -0,0 +1,3625 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/plugin.h"
+#include "plugin/object.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree_walk.h"
+#include "carry.h"
+#include "tree.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "page_cache.h"
+#include "wander.h"
+#include "super.h"
+#include "entd.h"
+#include "reiser4.h"
+#include "flush.h"
+#include "writeout.h"
+
+#include <asm/atomic.h>
+#include <linux/fs.h>		/* for struct super_block  */
+#include <linux/mm.h>		/* for struct page */
+#include <linux/bio.h>		/* for struct bio */
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+
+/* IMPLEMENTATION NOTES */
+
+/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
+   order to the nodes of the tree in which the parent is placed before its children, which
+   are ordered (recursively) in left-to-right order.  When we speak of a "parent-first preceder", it
+   describes the node that "came before in forward parent-first order".  When we speak of a
+   "parent-first follower", it describes the node that "comes next in parent-first
+   order" (alternatively the node that "came before in reverse parent-first order").
+
+   The following pseudo-code prints the nodes of a tree in forward parent-first order:
+
+   void parent_first (node)
+   {
+     print_node (node);
+     if (node->level > leaf) {
+       for (i = 0; i < num_children; i += 1) {
+         parent_first (node->child[i]);
+       }
+     }
+   }
+*/
+
+/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE?  The idea is to optimize block allocation so
+   that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
+   can be accomplished with sequential reads, which results in reading nodes in their
+   parent-first order.  This is a read-optimization aspect of the flush algorithm, and
+   there is also a write-optimization aspect, which is that we wish to make large
+   sequential writes to the disk by allocating or reallocating blocks so that they can be
+   written in sequence.  Sometimes the read-optimization and write-optimization goals
+   conflict with each other, as we discuss in more detail below.
+*/
+
+/* STATE BITS: The flush code revolves around the state of the jnodes it covers.  Here are
+   the relevant jnode->state bits and their relevence to flush:
+
+     JNODE_DIRTY: If a node is dirty, it must be flushed.  But in order to be written it
+     must be allocated first.  In order to be considered allocated, the jnode must have
+     exactly one of { JNODE_OVRWR, JNODE_RELOC } set.  These two bits are exclusive, and
+     all dirtied jnodes eventually have one of these bits set during each transaction.
+
+     JNODE_CREATED: The node was freshly created in its transaction and has no previous
+     block address, so it is unconditionally assigned to be relocated, although this is
+     mainly for code-convenience.  It is not being 'relocated' from anything, but in
+     almost every regard it is treated as part of the relocate set.  The JNODE_CREATED bit
+     remains set even after JNODE_RELOC is set, so the actual relocate can be
+     distinguished from the created-and-allocated set easily: relocate-set members
+     (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
+     have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
+
+     JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
+     decision to maintain the pre-existing location for this node and it will be written
+     to the wandered-log.
+
+     JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
+     not created, see note above).  A block with JNODE_RELOC set is eligible for
+     early-flushing and may be submitted during flush_empty_queues.  When the JNODE_RELOC
+     bit is set on a znode, the parent node's internal item is modified and the znode is
+     rehashed.
+
+     JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
+     and calls plugin->f.squeeze() method for its items. By this technology we update disk
+     clusters of cryptcompress objects. Also if leftmost point that was found by flush scan
+     has this flag (races with write(), rare case) the flush algorythm makes the decision
+     to pass it to squalloc() in spite of its flushprepped status for squeezing, not for
+     repeated allocation.
+
+     JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
+     flush queue.  This means the jnode is not on any clean or dirty list, instead it is
+     moved to one of the flush queue (see flush_queue.h) object private list. This
+     prevents multiple concurrent flushes from attempting to start flushing from the
+     same node.
+
+     (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
+     squeeze-and-allocate on a node while its children are actively being squeezed and
+     allocated.  This flag was created to avoid submitting a write request for a node
+     while its children are still being allocated and squeezed. Then flush queue was
+     re-implemented to allow unlimited number of nodes be queued. This flag support was
+     commented out in source code because we decided that there was no reason to submit
+     queued nodes before jnode_flush() finishes.  However, current code calls fq_write()
+     during a slum traversal and may submit "busy nodes" to disk. Probably we can
+     re-enable the JNODE_FLUSH_BUSY bit support in future.
+
+   With these state bits, we describe a test used frequently in the code below,
+   jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()).  The
+   test for "flushprepped" returns true if any of the following are true:
+
+     - The node is not dirty
+     - The node has JNODE_RELOC set
+     - The node has JNODE_OVRWR set
+
+   If either the node is not dirty or it has already been processed by flush (and assigned
+   JNODE_OVRWR or JNODE_RELOC), then it is prepped.  If jnode_is_flushprepped() returns
+   true then flush has work to do on that node.
+*/
+
+/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
+   flushprepped twice (unless an explicit call to flush_unprep is made as described in
+   detail below).  For example a node is dirtied, allocated, and then early-flushed to
+   disk and set clean.  Before the transaction commits, the page is dirtied again and, due
+   to memory pressure, the node is flushed again.  The flush algorithm will not relocate
+   the node to a new disk location, it will simply write it to the same, previously
+   relocated position again.
+*/
+
+/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
+   start at a leaf node and allocate in parent-first order by iterating to the right.  At
+   each step of the iteration, we check for the right neighbor.  Before advancing to the
+   right neighbor, we check if the current position and the right neighbor share the same
+   parent.  If they do not share the same parent, the parent is allocated before the right
+   neighbor.
+
+   This process goes recursively up the tree and squeeze nodes level by level as long as
+   the right neighbor and the current position have different parents, then it allocates
+   the right-neighbors-with-different-parents on the way back down.  This process is
+   described in more detail in flush_squalloc_changed_ancestor and the recursive function
+   squalloc_one_changed_ancestor.  But the purpose here is not to discuss the
+   specifics of the bottom-up approach as it is to contrast the bottom-up and top-down
+   approaches.
+
+   The top-down algorithm was implemented earlier (April-May 2002).  In the top-down
+   approach, we find a starting point by scanning left along each level past dirty nodes,
+   then going up and repeating the process until the left node and the parent node are
+   clean.  We then perform a parent-first traversal from the starting point, which makes
+   allocating in parent-first order trivial.  After one subtree has been allocated in this
+   manner, we move to the right, try moving upward, then repeat the parent-first
+   traversal.
+
+   Both approaches have problems that need to be addressed.  Both are approximately the
+   same amount of code, but the bottom-up approach has advantages in the order it acquires
+   locks which, at the very least, make it the better approach.  At first glance each one
+   makes the other one look simpler, so it is important to remember a few of the problems
+   with each one.
+
+   Main problem with the top-down approach: When you encounter a clean child during the
+   parent-first traversal, what do you do?  You would like to avoid searching through a
+   large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
+   obvious solution.  One of the advantages of the top-down approach is that during the
+   parent-first traversal you check every child of a parent to see if it is dirty.  In
+   this way, the top-down approach easily handles the main problem of the bottom-up
+   approach: unallocated children.
+
+   The unallocated children problem is that before writing a node to disk we must make
+   sure that all of its children are allocated.  Otherwise, the writing the node means
+   extra I/O because the node will have to be written again when the child is finally
+   allocated.
+
+   WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM.  Except for bugs, this
+   should not cause any file system corruption, it only degrades I/O performance because a
+   node may be written when it is sure to be written at least one more time in the same
+   transaction when the remaining children are allocated.  What follows is a description
+   of how we will solve the problem.
+*/
+
+/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
+   proceeding in parent first order, allocate some of its left-children, then encounter a
+   clean child in the middle of the parent.  We do not allocate the clean child, but there
+   may remain unallocated (dirty) children to the right of the clean child.  If we were to
+   stop flushing at this moment and write everything to disk, the parent might still
+   contain unallocated children.
+
+   We could try to allocate all the descendents of every node that we allocate, but this
+   is not necessary.  Doing so could result in allocating the entire tree: if the root
+   node is allocated then every unallocated node would have to be allocated before
+   flushing.  Actually, we do not have to write a node just because we allocate it.  It is
+   possible to allocate but not write a node during flush, when it still has unallocated
+   children.  However, this approach is probably not optimal for the following reason.
+
+   The flush algorithm is designed to allocate nodes in parent-first order in an attempt
+   to optimize reads that occur in the same order.  Thus we are read-optimizing for a
+   left-to-right scan through all the leaves in the system, and we are hoping to
+   write-optimize at the same time because those nodes will be written together in batch.
+   What happens, however, if we assign a block number to a node in its read-optimized
+   order but then avoid writing it because it has unallocated children?  In that
+   situation, we lose out on the write-optimization aspect because a node will have to be
+   written again to the its location on the device, later, which likely means seeking back
+   to that location.
+
+   So there are tradeoffs. We can choose either:
+
+   A. Allocate all unallocated children to preserve both write-optimization and
+   read-optimization, but this is not always desirable because it may mean having to
+   allocate and flush very many nodes at once.
+
+   B. Defer writing nodes with unallocated children, keep their read-optimized locations,
+   but sacrifice write-optimization because those nodes will be written again.
+
+   C. Defer writing nodes with unallocated children, but do not keep their read-optimized
+   locations.  Instead, choose to write-optimize them later, when they are written.  To
+   facilitate this, we "undo" the read-optimized allocation that was given to the node so
+   that later it can be write-optimized, thus "unpreparing" the flush decision.  This is a
+   case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above.  By a
+   call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
+   if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
+   location, and set the JNODE_CREATED bit, effectively setting the node back to an
+   unallocated state.
+
+   We will take the following approach in v4.0: for twig nodes we will always finish
+   allocating unallocated children (A).  For nodes with (level > TWIG) we will defer
+   writing and choose write-optimization (C).
+
+   To summarize, there are several parts to a solution that avoids the problem with
+   unallocated children:
+
+   FIXME-ZAM: Still no one approach is implemented to eliminate the "UNALLOCATED CHILDREN"
+   problem because there was an experiment which was done showed that we have 1-2 nodes
+   with unallocated children for thousands of written nodes.  The experiment was simple
+   like coping / deletion of linux kernel sources.  However the problem can arise in more
+   complex tests.  I think we have jnode_io_hook to insert a check for unallocated
+   children and see what kind of problem we have.
+
+   1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
+   squeeze-and-allocate on any remaining unallocated children.  FIXME: Difficulty to
+   implement: should be simple -- amounts to adding a while loop to jnode_flush, see
+   comments in that function.
+
+   2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
+   have unallocated children.  If the twig level has unallocated children it is an
+   assertion failure.  If a higher-level node has unallocated children, then it should be
+   explicitly de-allocated by a call to flush_unprep().  FIXME: Difficulty to implement:
+   should be simple.
+
+   3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
+   CPU cycles than we would like, and it is possible (but medium complexity) to optimize
+   this somewhat in the case where large sub-trees are flushed.  The following observation
+   helps: if both the left- and right-neighbor of a node are processed by the flush
+   algorithm then the node itself is guaranteed to have all of its children allocated.
+   However, the cost of this check may not be so expensive after all: it is not needed for
+   leaves and flush can guarantee this property for twigs.  That leaves only (level >
+   TWIG) nodes that have to be checked, so this optimization only helps if at least three
+   (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
+   there are many more (level > TWIG) nodes.  But if there are many (level > TWIG) nodes
+   then the number of blocks being written will be very large, so the savings may be
+   insignificant.  That said, the idea is to maintain both the left and right edges of
+   nodes that are processed in flush.  When flush_empty_queue() is called, a relatively
+   simple test will tell whether the (level > TWIG) node is on the edge.  If it is on the
+   edge, the slow check is necessary, but if it is in the interior then it can be assumed
+   to have all of its children allocated.  FIXME: medium complexity to implement, but
+   simple to verify given that we must have a slow check anyway.
+
+   4. (Optional) This part is optional, not for v4.0--flush should work independently of
+   whether this option is used or not.  Called RAPID_SCAN, the idea is to amend the
+   left-scan operation to take unallocated children into account.  Normally, the left-scan
+   operation goes left as long as adjacent nodes are dirty up until some large maximum
+   value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing.  But scan-left
+   may stop at a position where there are unallocated children to the left with the same
+   parent.  When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
+   FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then procedes
+   with a rapid scan.  The rapid scan skips all the interior children of a node--if the
+   leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
+   twig to the left).  If the left neighbor of the leftmost child is also dirty, then
+   continue the scan at the left twig and repeat.  This option will cause flush to
+   allocate more twigs in a single pass, but it also has the potential to write many more
+   nodes than would otherwise be written without the RAPID_SCAN option.  RAPID_SCAN
+   was partially implemented, code removed August 12, 2002 by JMACD.
+*/
+
+/* FLUSH CALLED ON NON-LEAF LEVEL.  Most of our design considerations assume that the
+   starting point for flush is a leaf node, but actually the flush code cares very little
+   about whether or not this is true.  It is possible that all the leaf nodes are flushed
+   and dirty parent nodes still remain, in which case jnode_flush() is called on a
+   non-leaf argument.  Flush doesn't care--it treats the argument node as if it were a
+   leaf, even when it is not.  This is a simple approach, and there may be a more optimal
+   policy but until a problem with this approach is discovered, simplest is probably best.
+
+   NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
+   the leaves.  This is done as a matter of simplicity and there is only one (shaky)
+   justification.  When an atom commits, it flushes all leaf level nodes first, followed
+   by twigs, and so on.  With flushing done in this order, if flush is eventually called
+   on a non-leaf node it means that (somehow) we reached a point where all leaves are
+   clean and only internal nodes need to be flushed.  If that it the case, then it means
+   there were no leaves that were the parent-first preceder/follower of the parent.  This
+   is expected to be a rare case, which is why we do nothing special about it.  However,
+   memory pressure may pass an internal node to flush when there are still dirty leaf
+   nodes that need to be flushed, which could prove our original assumptions
+   "inoperative".  If this needs to be fixed, then scan_left/right should have
+   special checks for the non-leaf levels.  For example, instead of passing from a node to
+   the left neighbor, it should pass from the node to the left neighbor's rightmost
+   descendent (if dirty).
+
+*/
+
+/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING.  We walk the tree in 4MB-16MB chunks, dirtying everything and putting
+   it into a transaction.  We tell the allocator to allocate the blocks as far as possible towards one end of the
+   logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
+   device if we are walking from right to left.  We then make passes in alternating directions, and as we do this the
+   device becomes sorted such that tree order and block number order fully correlate.
+
+   Resizing is done by shifting everything either all the way to the left or all the way
+   to the right, and then reporting the last block.
+*/
+
+/* RELOCATE DECISIONS: The code makes a decision to relocate in several places.  This
+   descibes the policy from the highest level:
+
+   The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
+   leaf level during flush-scan (right, left), then we unconditionally decide to relocate
+   leaf nodes.
+
+   Otherwise, there are two contexts in which we make a decision to relocate:
+
+   1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
+   During the initial stages of flush, after scan-right completes, we want to ask the
+   question: should we relocate this leaf node and thus dirty the parent node.  Then if
+   the node is a leftmost child its parent is its own parent-first preceder, thus we repeat
+   the question at the next level up, and so on.  In these cases we are moving in the
+   reverse-parent first direction.
+
+   There is another case which is considered the reverse direction, which comes at the end
+   of a twig in reverse_relocate_end_of_twig().  As we finish processing a twig we may
+   reach a point where there is a clean twig to the right with a dirty leftmost child.  In
+   this case, we may wish to relocate the child by testing if it should be relocated
+   relative to its parent.
+
+   2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
+   allocate_znode.  What distinguishes the forward parent-first case from the
+   reverse-parent first case is that the preceder has already been allocated in the
+   forward case, whereas in the reverse case we don't know what the preceder is until we
+   finish "going in reverse".  That simplifies the forward case considerably, and there we
+   actually use the block allocator to determine whether, e.g., a block closer to the
+   preceder is available.
+*/
+
+/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration.  The idea is, once we
+   finish scan-left and find a starting point, if the parent's left neighbor is dirty then
+   squeeze the parent's left neighbor and the parent.  This may change the
+   flush-starting-node's parent.  Repeat until the child's parent is stable.  If the child
+   is a leftmost child, repeat this left-edge squeezing operation at the next level up.
+   Note that we cannot allocate extents during this or they will be out of parent-first
+   order.  There is also some difficult coordinate maintenence issues.  We can't do a tree
+   search to find coordinates again (because we hold locks), we have to determine them
+   from the two nodes being squeezed.  Looks difficult, but has potential to increase
+   space utilization. */
+
+/* Flush-scan helper functions. */
+static void scan_init(flush_scan * scan);
+static void scan_done(flush_scan * scan);
+
+/* Flush-scan algorithm. */
+static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
+		     unsigned limit);
+static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
+static int scan_common(flush_scan * scan, flush_scan * other);
+static int scan_formatted(flush_scan * scan);
+static int scan_unformatted(flush_scan * scan, flush_scan * other);
+static int scan_by_coord(flush_scan * scan);
+
+/* Initial flush-point ancestor allocation. */
+static int alloc_pos_and_ancestors(flush_pos_t * pos);
+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
+
+/* Main flush algorithm.  Note on abbreviation: "squeeze and allocate" == "squalloc". */
+static int squalloc(flush_pos_t * pos);
+
+/* Flush squeeze implementation. */
+static int squeeze_right_non_twig(znode * left, znode * right);
+static int shift_one_internal_unit(znode * left, znode * right);
+
+/* Flush reverse parent-first relocation routines. */
+static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
+					    const reiser4_block_nr * nblk);
+static int reverse_relocate_test(jnode * node, const coord_t * parent_coord,
+				 flush_pos_t * pos);
+static int reverse_relocate_check_dirty_parent(jnode * node,
+					       const coord_t * parent_coord,
+					       flush_pos_t * pos);
+
+/* Flush allocate write-queueing functions: */
+static int allocate_znode(znode * node, const coord_t * parent_coord,
+			  flush_pos_t * pos);
+static int allocate_znode_update(znode * node, const coord_t * parent_coord,
+				 flush_pos_t * pos);
+static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
+
+/* Flush helper functions: */
+static int jnode_lock_parent_coord(jnode * node,
+				   coord_t * coord,
+				   lock_handle * parent_lh,
+				   load_count * parent_zh,
+				   znode_lock_mode mode, int try);
+static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
+			    znode_lock_mode mode, int check_dirty, int expected);
+static int znode_same_parents(znode * a, znode * b);
+
+static int znode_check_flushprepped(znode * node)
+{
+	return jnode_check_flushprepped(ZJNODE(node));
+}
+
+/* Flush position functions */
+static void pos_init(flush_pos_t * pos);
+static int pos_valid(flush_pos_t * pos);
+static void pos_done(flush_pos_t * pos);
+static int pos_stop(flush_pos_t * pos);
+
+/* check that @org is first jnode extent unit, if extent is unallocated,
+ * because all jnodes of unallocated extent are dirty and of the same atom. */
+#define checkchild(scan)						\
+assert("nikita-3435",							\
+       ergo(scan->direction == LEFT_SIDE &&				\
+            (scan->parent_coord.node->level == TWIG_LEVEL) &&           \
+	    jnode_is_unformatted(scan->node) &&				\
+	    extent_is_unallocated(&scan->parent_coord),			\
+	    extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
+
+/* This flush_cnt variable is used to track the number of concurrent flush operations,
+   useful for debugging.  It is initialized in txnmgr.c out of laziness (because flush has
+   no static initializer function...) */
+ON_DEBUG(atomic_t flush_cnt;
+    )
+
+/* check fs backing device for write congestion */
+static int check_write_congestion(void)
+{
+	struct super_block *sb;
+	struct backing_dev_info *bdi;
+
+	sb = reiser4_get_current_sb();
+	bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info;
+	return bdi_write_congested(bdi);
+}
+
+/* conditionally write flush queue */
+static int write_prepped_nodes(flush_pos_t * pos)
+{
+	int ret;
+
+	assert("zam-831", pos);
+	assert("zam-832", pos->fq);
+
+	if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
+		return 0;
+
+	if (check_write_congestion())
+		return 0;
+
+	ret = reiser4_write_fq(pos->fq, pos->nr_written,
+		       WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
+	return ret;
+}
+
+/* Proper release all flush pos. resources then move flush position to new
+   locked node */
+static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock,
+			   load_count * new_load, const coord_t * new_coord)
+{
+	assert("zam-857", new_lock->node == new_load->node);
+
+	if (new_coord) {
+		assert("zam-858", new_coord->node == new_lock->node);
+		coord_dup(&pos->coord, new_coord);
+	} else {
+		coord_init_first_unit(&pos->coord, new_lock->node);
+	}
+
+	if (pos->child) {
+		jput(pos->child);
+		pos->child = NULL;
+	}
+
+	move_load_count(&pos->load, new_load);
+	done_lh(&pos->lock);
+	move_lh(&pos->lock, new_lock);
+}
+
+/* delete empty node which link from the parent still exists. */
+static int delete_empty_node(znode * node)
+{
+	reiser4_key smallest_removed;
+
+	assert("zam-1019", node != NULL);
+	assert("zam-1020", node_is_empty(node));
+	assert("zam-1023", znode_is_wlocked(node));
+
+	return reiser4_delete_node(node, &smallest_removed, NULL, 1);
+}
+
+/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
+static int prepare_flush_pos(flush_pos_t * pos, jnode * org)
+{
+	int ret;
+	load_count load;
+	lock_handle lock;
+
+	init_lh(&lock);
+	init_load_count(&load);
+
+	if (jnode_is_znode(org)) {
+		ret = longterm_lock_znode(&lock, JZNODE(org),
+					  ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
+		if (ret)
+			return ret;
+
+		ret = incr_load_count_znode(&load, JZNODE(org));
+		if (ret)
+			return ret;
+
+		pos->state =
+		    (jnode_get_level(org) ==
+		     LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
+		move_flush_pos(pos, &lock, &load, NULL);
+	} else {
+		coord_t parent_coord;
+		ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
+					      &load, ZNODE_WRITE_LOCK, 0);
+		if (ret)
+			goto done;
+		if (!item_is_extent(&parent_coord)) {
+			/* file was converted to tail, org became HB, we found internal
+			   item */
+			ret = -EAGAIN;
+			goto done;
+		}
+
+		pos->state = POS_ON_EPOINT;
+		move_flush_pos(pos, &lock, &load, &parent_coord);
+		pos->child = jref(org);
+		if (extent_is_unallocated(&parent_coord)
+		    && extent_unit_index(&parent_coord) != index_jnode(org)) {
+			/* @org is not first child of its parent unit. This may happen
+			   because longerm lock of its parent node was released between
+			   scan_left and scan_right. For now work around this having flush to repeat */
+			ret = -EAGAIN;
+		}
+	}
+
+      done:
+	done_load_count(&load);
+	done_lh(&lock);
+	return ret;
+}
+
+/* TODO LIST (no particular order): */
+/* I have labelled most of the legitimate FIXME comments in this file with letters to
+   indicate which issue they relate to.  There are a few miscellaneous FIXMEs with
+   specific names mentioned instead that need to be inspected/resolved. */
+/* B. There is an issue described in reverse_relocate_test having to do with an
+   imprecise is_preceder? check having to do with partially-dirty extents.  The code that
+   sets preceder hints and computes the preceder is basically untested.  Careful testing
+   needs to be done that preceder calculations are done correctly, since if it doesn't
+   affect correctness we will not catch this stuff during regular testing. */
+/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling.  It is unclear which of these are
+   considered expected but unlikely conditions.  Flush currently returns 0 (i.e., success
+   but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
+   Many of the calls that may produce one of these return values (i.e.,
+   longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
+   values themselves and, for instance, stop flushing instead of resulting in a restart.
+   If any of these results are true error conditions then flush will go into a busy-loop,
+   as we noticed during testing when a corrupt tree caused find_child_ptr to return
+   ENOENT.  It needs careful thought and testing of corner conditions.
+*/
+/* D. Atomicity of flush_prep against deletion and flush concurrency.  Suppose a created
+   block is assigned a block number then early-flushed to disk.  It is dirtied again and
+   flush is called again.  Concurrently, that block is deleted, and the de-allocation of
+   its block number does not need to be deferred, since it is not part of the preserve set
+   (i.e., it didn't exist before the transaction).  I think there may be a race condition
+   where flush writes the dirty, created block after the non-deferred deallocated block
+   number is re-allocated, making it possible to write deleted data on top of non-deleted
+   data.  Its just a theory, but it needs to be thought out. */
+/* F. bio_alloc() failure is not handled gracefully. */
+/* G. Unallocated children. */
+/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
+/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
+
+/* JNODE_FLUSH: MAIN ENTRY POINT */
+/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty
+   neighborhood is named "slum").  Jnode_flush() is called if reiser4 has to write dirty
+   blocks to disk, it happens when Linux VM decides to reduce number of dirty pages or as
+   a part of transaction commit.
+
+   Our objective here is to prep and flush the slum the jnode belongs to. We want to
+   squish the slum together, and allocate the nodes in it as we squish because allocation
+   of children affects squishing of parents.
+
+   The "argument" @node tells flush where to start.  From there, flush finds the left edge
+   of the slum, and calls squalloc (in which nodes are squeezed and allocated).  To find a
+   "better place" to start squalloc first we perform a flush_scan.
+
+   Flush-scanning may be performed in both left and right directions, but for different
+   purposes.  When scanning to the left, we are searching for a node that precedes a
+   sequence of parent-first-ordered nodes which we will then flush in parent-first order.
+   During flush-scanning, we also take the opportunity to count the number of consecutive
+   leaf nodes.  If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
+   make a decision to reallocate leaf nodes (thus favoring write-optimization).
+
+   Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
+   also be dirty nodes to the right of the argument.  If the scan-left operation does not
+   count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
+   operation to see whether there is, in fact, enough nodes to meet the relocate
+   threshold.  Each right- and left-scan operation uses a single flush_scan object.
+
+   After left-scan and possibly right-scan, we prepare a flush_position object with the
+   starting flush point or parent coordinate, which was determined using scan-left.
+
+   Next we call the main flush routine, squalloc, which iterates along the
+   leaf level, squeezing and allocating nodes (and placing them into the flush queue).
+
+   After squalloc returns we take extra steps to ensure that all the children
+   of the final twig node are allocated--this involves repeating squalloc
+   until we finish at a twig with no unallocated children.
+
+   Finally, we call flush_empty_queue to submit write-requests to disk.  If we encounter
+   any above-twig nodes during flush_empty_queue that still have unallocated children, we
+   flush_unprep them.
+
+   Flush treats several "failure" cases as non-failures, essentially causing them to start
+   over.  E_DEADLOCK is one example.  FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
+   probably be handled properly rather than restarting, but there are a bunch of cases to
+   audit.
+*/
+
+static int
+jnode_flush(jnode * node, long nr_to_write, long *nr_written,
+	    flush_queue_t * fq, int flags)
+{
+	long ret = 0;
+	flush_scan *right_scan;
+	flush_scan *left_scan;
+	flush_pos_t *flush_pos;
+	int todo;
+	struct super_block *sb;
+	reiser4_super_info_data *sbinfo;
+	jnode *leftmost_in_slum = NULL;
+
+	assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
+	assert("nikita-3022", reiser4_schedulable());
+
+	assert("nikita-3185",
+	       get_current_super_private()->delete_mutex_owner != current);
+
+	/* allocate right_scan, left_scan and flush_pos */
+	right_scan =
+	    kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos),
+		    reiser4_ctx_gfp_mask_get());
+	if (right_scan == NULL)
+		return RETERR(-ENOMEM);
+	left_scan = right_scan + 1;
+	flush_pos = (flush_pos_t *) (left_scan + 1);
+
+	sb = reiser4_get_current_sb();
+	sbinfo = get_super_private(sb);
+
+	/* Flush-concurrency debug code */
+#if REISER4_DEBUG
+	atomic_inc(&flush_cnt);
+#endif
+
+	reiser4_enter_flush(sb);
+
+	/* Initialize a flush position. */
+	pos_init(flush_pos);
+
+	flush_pos->nr_written = nr_written;
+	flush_pos->fq = fq;
+	flush_pos->flags = flags;
+	flush_pos->nr_to_write = nr_to_write;
+
+	scan_init(right_scan);
+	scan_init(left_scan);
+
+	/* First scan left and remember the leftmost scan position.  If the leftmost
+	   position is unformatted we remember its parent_coord.  We scan until counting
+	   FLUSH_SCAN_MAXNODES.
+
+	   If starting @node is unformatted, at the beginning of left scan its
+	   parent (twig level node, containing extent item) will be long term
+	   locked and lock handle will be stored in the
+	   @right_scan->parent_lock. This lock is used to start the rightward
+	   scan without redoing the tree traversal (necessary to find parent)
+	   and, hence, is kept during leftward scan. As a result, we have to
+	   use try-lock when taking long term locks during the leftward scan.
+	 */
+	ret = scan_left(left_scan, right_scan,
+			node, sbinfo->flush.scan_maxnodes);
+	if (ret != 0)
+		goto failed;
+
+	leftmost_in_slum = jref(left_scan->node);
+	scan_done(left_scan);
+
+	/* Then possibly go right to decide if we will use a policy of relocating leaves.
+	   This is only done if we did not scan past (and count) enough nodes during the
+	   leftward scan.  If we do scan right, we only care to go far enough to establish
+	   that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed.  The
+	   scan limit is the difference between left_scan.count and the threshold. */
+
+	todo = sbinfo->flush.relocate_threshold - left_scan->count;
+	/* scan right is inherently deadlock prone, because we are
+	 * (potentially) holding a lock on the twig node at this moment.
+	 * FIXME: this is incorrect comment: lock is not held */
+	if (todo > 0) {
+		ret = scan_right(right_scan, node, (unsigned)todo);
+		if (ret != 0)
+			goto failed;
+	}
+
+	/* Only the right-scan count is needed, release any rightward locks right away. */
+	scan_done(right_scan);
+
+	/* ... and the answer is: we should relocate leaf nodes if at least
+	   FLUSH_RELOCATE_THRESHOLD nodes were found. */
+	flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
+	    (left_scan->count + right_scan->count >=
+	     sbinfo->flush.relocate_threshold);
+
+	/* Funny business here.  We set the 'point' in the flush_position at prior to
+	   starting squalloc regardless of whether the first point is
+	   formatted or unformatted.  Without this there would be an invariant, in the
+	   rest of the code, that if the flush_position is unformatted then
+	   flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
+	   and if the flush_position is formatted then flush_position->point is non-NULL
+	   and no parent info is set.
+
+	   This seems lazy, but it makes the initial calls to reverse_relocate_test
+	   (which ask "is it the pos->point the leftmost child of its parent") much easier
+	   because we know the first child already.  Nothing is broken by this, but the
+	   reasoning is subtle.  Holding an extra reference on a jnode during flush can
+	   cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
+	   removed from sibling lists until they have zero reference count.  Flush would
+	   never observe a HEARD_BANSHEE node on the left-edge of flush, nodes are only
+	   deleted to the right.  So if nothing is broken, why fix it?
+
+	   NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
+	   point and in any moment, because of the concurrent file system
+	   activity (for example, truncate). */
+
+	/* Check jnode state after flush_scan completed. Having a lock on this
+	   node or its parent (in case of unformatted) helps us in case of
+	   concurrent flushing. */
+	if (jnode_check_flushprepped(leftmost_in_slum)
+	    && !jnode_convertible(leftmost_in_slum)) {
+		ret = 0;
+		goto failed;
+	}
+
+	/* Now setup flush_pos using scan_left's endpoint. */
+	ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
+	if (ret)
+		goto failed;
+
+	if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
+	    && node_is_empty(flush_pos->coord.node)) {
+		znode *empty = flush_pos->coord.node;
+
+		assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
+		ret = delete_empty_node(empty);
+		goto failed;
+	}
+
+	if (jnode_check_flushprepped(leftmost_in_slum)
+	    && !jnode_convertible(leftmost_in_slum)) {
+		ret = 0;
+		goto failed;
+	}
+
+	/* Set pos->preceder and (re)allocate pos and its ancestors if it is needed  */
+	ret = alloc_pos_and_ancestors(flush_pos);
+	if (ret)
+		goto failed;
+
+	/* Do the main rightward-bottom-up squeeze and allocate loop. */
+	ret = squalloc(flush_pos);
+	pos_stop(flush_pos);
+	if (ret)
+		goto failed;
+
+	/* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
+	   First, the pos_stop() and pos_valid() routines should be modified
+	   so that pos_stop() sets a flush_position->stop flag to 1 without
+	   releasing the current position immediately--instead release it in
+	   pos_done().  This is a better implementation than the current one anyway.
+
+	   It is not clear that all fields of the flush_position should not be released,
+	   but at the very least the parent_lock, parent_coord, and parent_load should
+	   remain held because they are hold the last twig when pos_stop() is
+	   called.
+
+	   When we reach this point in the code, if the parent_coord is set to after the
+	   last item then we know that flush reached the end of a twig (and according to
+	   the new flush queueing design, we will return now).  If parent_coord is not
+	   past the last item, we should check if the current twig has any unallocated
+	   children to the right (we are not concerned with unallocated children to the
+	   left--in that case the twig itself should not have been allocated).  If the
+	   twig has unallocated children to the right, set the parent_coord to that
+	   position and then repeat the call to squalloc.
+
+	   Testing for unallocated children may be defined in two ways: if any internal
+	   item has a fake block number, it is unallocated; if any extent item is
+	   unallocated then all of its children are unallocated.  But there is a more
+	   aggressive approach: if there are any dirty children of the twig to the right
+	   of the current position, we may wish to relocate those nodes now.  Checking for
+	   potential relocation is more expensive as it requires knowing whether there are
+	   any dirty children that are not unallocated.  The extent_needs_allocation
+	   should be used after setting the correct preceder.
+
+	   When we reach the end of a twig at this point in the code, if the flush can
+	   continue (when the queue is ready) it will need some information on the future
+	   starting point.  That should be stored away in the flush_handle using a seal, I
+	   believe.  Holding a jref() on the future starting point may break other code
+	   that deletes that node.
+	 */
+
+	/* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
+	   above the twig level.  If the VM calls flush above the twig level, do nothing
+	   and return (but figure out why this happens).  The txnmgr should be modified to
+	   only flush its leaf-level dirty list.  This will do all the necessary squeeze
+	   and allocate steps but leave unallocated branches and possibly unallocated
+	   twigs (when the twig's leftmost child is not dirty).  After flushing the leaf
+	   level, the remaining unallocated nodes should be given write-optimized
+	   locations.  (Possibly, the remaining unallocated twigs should be allocated just
+	   before their leftmost child.)
+	 */
+
+	/* Any failure reaches this point. */
+      failed:
+
+	switch (ret) {
+	case -E_REPEAT:
+	case -EINVAL:
+	case -E_DEADLOCK:
+	case -E_NO_NEIGHBOR:
+	case -ENOENT:
+		/* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
+		   in each case.  They already are handled in many cases. */
+		/* Something bad happened, but difficult to avoid...  Try again! */
+		ret = 0;
+	}
+
+	if (leftmost_in_slum)
+		jput(leftmost_in_slum);
+
+	pos_done(flush_pos);
+	scan_done(left_scan);
+	scan_done(right_scan);
+	kfree(right_scan);
+
+	ON_DEBUG(atomic_dec(&flush_cnt));
+
+	reiser4_leave_flush(sb);
+
+	return ret;
+}
+
+/* The reiser4 flush subsystem can be turned into "rapid flush mode" means that
+ * flusher should submit all prepped nodes immediately without keeping them in
+ * flush queues for long time.  The reason for rapid flush mode is to free
+ * memory as fast as possible. */
+
+#if REISER4_USE_RAPID_FLUSH
+
+/**
+ * submit all prepped nodes if rapid flush mode is set,
+ * turn rapid flush mode off.
+ */
+
+static int rapid_flush(flush_pos_t * pos)
+{
+	if (!wbq_available())
+		return 0;
+
+	return write_prepped_nodes(pos);
+}
+
+#else
+
+#define rapid_flush(pos) (0)
+
+#endif				/* REISER4_USE_RAPID_FLUSH */
+
+static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom,
+				     flush_queue_t *fq, int *nr_queued,
+				     int flags)
+{
+	jnode * node;
+
+	if (start != NULL) {
+		spin_lock_jnode(start);
+		if (!jnode_is_flushprepped(start)) {
+			assert("zam-1056", start->atom == atom);
+			node = start;
+			goto enter;
+		}
+		spin_unlock_jnode(start);
+	}
+	/*
+	 * In this loop we process all already prepped (RELOC or OVRWR) and dirtied again
+	 * nodes. The atom spin lock is not released until all dirty nodes processed or
+	 * not prepped node found in the atom dirty lists.
+	 */
+	while ((node = find_first_dirty_jnode(atom, flags))) {
+		spin_lock_jnode(node);
+	enter:
+		assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
+		assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
+
+		if (JF_ISSET(node, JNODE_WRITEBACK)) {
+			/* move node to the end of atom's writeback list */
+			list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
+
+			/*
+			 * jnode is not necessarily on dirty list: if it was dirtied when
+			 * it was on flush queue - it does not get moved to dirty list
+			 */
+			ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
+					     WB_LIST, 1));
+
+		} else if (jnode_is_znode(node)
+			   && znode_above_root(JZNODE(node))) {
+			/*
+			 * A special case for znode-above-root.  The above-root (fake)
+			 * znode is captured and dirtied when the tree height changes or
+			 * when the root node is relocated.  This causes atoms to fuse so
+			 * that changes at the root are serialized.  However, this node is
+			 * never flushed.  This special case used to be in lock.c to
+			 * prevent the above-root node from ever being captured, but now
+			 * that it is captured we simply prevent it from flushing.  The
+			 * log-writer code relies on this to properly log superblock
+			 * modifications of the tree height.
+			 */
+			jnode_make_wander_nolock(node);
+		} else if (JF_ISSET(node, JNODE_RELOC)) {
+			queue_jnode(fq, node);
+			++(*nr_queued);
+		} else
+			break;
+
+		spin_unlock_jnode(node);
+	}
+	return node;
+}
+
+/* Flush some nodes of current atom, usually slum, return -E_REPEAT if there are more nodes
+ * to flush, return 0 if atom's dirty lists empty and keep current atom locked, return
+ * other errors as they are. */
+int
+flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
+		   txn_atom ** atom, jnode *start)
+{
+	reiser4_super_info_data *sinfo = get_current_super_private();
+	flush_queue_t *fq = NULL;
+	jnode *node;
+	int nr_queued;
+	int ret;
+
+	assert("zam-889", atom != NULL && *atom != NULL);
+	assert_spin_locked(&((*atom)->alock));
+	assert("zam-892", get_current_context()->trans->atom == *atom);
+
+	nr_to_write = LONG_MAX;
+	while (1) {
+		ret = reiser4_fq_by_atom(*atom, &fq);
+		if (ret != -E_REPEAT)
+			break;
+		*atom = get_current_atom_locked();
+	}
+	if (ret)
+		return ret;
+
+	assert_spin_locked(&((*atom)->alock));
+
+	/* parallel flushers limit */
+	if (sinfo->tmgr.atom_max_flushers != 0) {
+		while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
+			/* An reiser4_atom_send_event() call is inside
+			   reiser4_fq_put_nolock() which is called when flush is
+			   finished and nr_flushers is decremented. */
+			reiser4_atom_wait_event(*atom);
+			*atom = get_current_atom_locked();
+		}
+	}
+
+	/* count ourself as a flusher */
+	(*atom)->nr_flushers++;
+
+	writeout_mode_enable();
+
+	nr_queued = 0;
+	node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
+
+	if (node == NULL) {
+		if (nr_queued == 0) {
+			(*atom)->nr_flushers--;
+			reiser4_fq_put_nolock(fq);
+			reiser4_atom_send_event(*atom);
+			/* current atom remains locked */
+			writeout_mode_disable();
+			return 0;
+		}
+		spin_unlock_atom(*atom);
+	} else {
+		jref(node);
+		BUG_ON((*atom)->super != node->tree->super);
+		spin_unlock_atom(*atom);
+		spin_unlock_jnode(node);
+		BUG_ON(nr_to_write == 0);
+		ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
+		jput(node);
+	}
+
+	ret =
+	    reiser4_write_fq(fq, nr_submitted,
+		     WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
+
+	*atom = get_current_atom_locked();
+	(*atom)->nr_flushers--;
+	reiser4_fq_put_nolock(fq);
+	reiser4_atom_send_event(*atom);
+	spin_unlock_atom(*atom);
+
+	writeout_mode_disable();
+
+	if (ret == 0)
+		ret = -E_REPEAT;
+
+	return ret;
+}
+
+/* REVERSE PARENT-FIRST RELOCATION POLICIES */
+
+/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the
+   reverse parent-first relocate context.  Here all we know is the preceder and the block
+   number.  Since we are going in reverse, the preceder may still be relocated as well, so
+   we can't ask the block allocator "is there a closer block available to relocate?" here.
+   In the _forward_ parent-first relocate context (not here) we actually call the block
+   allocator to try and find a closer location. */
+static int
+reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
+				 const reiser4_block_nr * nblk)
+{
+	reiser4_block_nr dist;
+
+	assert("jmacd-7710", *pblk != 0 && *nblk != 0);
+	assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk));
+	assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk));
+
+	/* Distance is the absolute value. */
+	dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
+
+	/* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder
+	   block, do not relocate. */
+	if (dist <= get_current_super_private()->flush.relocate_distance) {
+		return 0;
+	}
+
+	return 1;
+}
+
+/* This function is a predicate that tests for relocation.  Always called in the
+   reverse-parent-first context, when we are asking whether the current node should be
+   relocated in order to expand the flush by dirtying the parent level (and thus
+   proceeding to flush that level).  When traversing in the forward parent-first direction
+   (not here), relocation decisions are handled in two places: allocate_znode() and
+   extent_needs_allocation(). */
+static int
+reverse_relocate_test(jnode * node, const coord_t * parent_coord,
+		      flush_pos_t * pos)
+{
+	reiser4_block_nr pblk = 0;
+	reiser4_block_nr nblk = 0;
+
+	assert("jmacd-8989", !jnode_is_root(node));
+
+	/*
+	 * This function is called only from the
+	 * reverse_relocate_check_dirty_parent() and only if the parent
+	 * node is clean. This implies that the parent has the real (i.e., not
+	 * fake) block number, and, so does the child, because otherwise the
+	 * parent would be dirty.
+	 */
+
+	/* New nodes are treated as if they are being relocated. */
+	if (JF_ISSET (node, JNODE_CREATED) ||
+	    (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) {
+		return 1;
+	}
+
+	/* Find the preceder.  FIXME(B): When the child is an unformatted, previously
+	   existing node, the coord may be leftmost even though the child is not the
+	   parent-first preceder of the parent.  If the first dirty node appears somewhere
+	   in the middle of the first extent unit, this preceder calculation is wrong.
+	   Needs more logic in here. */
+	if (coord_is_leftmost_unit(parent_coord)) {
+		pblk = *znode_get_block(parent_coord->node);
+	} else {
+		pblk = pos->preceder.blk;
+	}
+	check_preceder(pblk);
+
+	/* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */
+	if (pblk == 0) {
+		return 1;
+	}
+
+	nblk = *jnode_get_block(node);
+
+	if (reiser4_blocknr_is_fake(&nblk))
+		/* child is unallocated, mark parent dirty */
+		return 1;
+
+	return reverse_relocate_if_close_enough(&pblk, &nblk);
+}
+
+/* This function calls reverse_relocate_test to make a reverse-parent-first
+   relocation decision and then, if yes, it marks the parent dirty. */
+static int
+reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord,
+				    flush_pos_t * pos)
+{
+	int ret;
+
+	if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
+
+		ret = reverse_relocate_test(node, parent_coord, pos);
+		if (ret < 0) {
+			return ret;
+		}
+
+		/* FIXME-ZAM
+		   if parent is already relocated - we do not want to grab space, right? */
+		if (ret == 1) {
+			int grabbed;
+
+			grabbed = get_current_context()->grabbed_blocks;
+			if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
+			    0)
+				reiser4_panic("umka-1250",
+					      "No space left during flush.");
+
+			assert("jmacd-18923",
+			       znode_is_write_locked(parent_coord->node));
+			znode_make_dirty(parent_coord->node);
+			grabbed2free_mark(grabbed);
+		}
+	}
+
+	return 0;
+}
+
+/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD
+   PARENT-FIRST LOOP BEGINS) */
+
+/* Get the leftmost child for given coord. */
+static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child)
+{
+	int ret;
+
+	ret = item_utmost_child(coord, LEFT_SIDE, child);
+
+	if (ret)
+		return ret;
+
+	if (IS_ERR(*child))
+		return PTR_ERR(*child);
+
+	return 0;
+}
+
+/* This step occurs after the left- and right-scans are completed, before starting the
+   forward parent-first traversal.  Here we attempt to allocate ancestors of the starting
+   flush point, which means continuing in the reverse parent-first direction to the
+   parent, grandparent, and so on (as long as the child is a leftmost child).  This
+   routine calls a recursive process, alloc_one_ancestor, which does the real work,
+   except there is special-case handling here for the first ancestor, which may be a twig.
+   At each level (here and alloc_one_ancestor), we check for relocation and then, if
+   the child is a leftmost child, repeat at the next level.  On the way back down (the
+   recursion), we allocate the ancestors in parent-first order. */
+static int alloc_pos_and_ancestors(flush_pos_t * pos)
+{
+	int ret = 0;
+	lock_handle plock;
+	load_count pload;
+	coord_t pcoord;
+
+	if (znode_check_flushprepped(pos->lock.node))
+		return 0;
+
+	coord_init_invalid(&pcoord, NULL);
+	init_lh(&plock);
+	init_load_count(&pload);
+
+	if (pos->state == POS_ON_EPOINT) {
+		/* a special case for pos on twig level, where we already have
+		   a lock on parent node. */
+		/* The parent may not be dirty, in which case we should decide
+		   whether to relocate the child now. If decision is made to
+		   relocate the child, the parent is marked dirty. */
+		ret =
+		    reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
+							pos);
+		if (ret)
+			goto exit;
+
+		/* FIXME_NFQUCMPD: We only need to allocate the twig (if child
+		   is leftmost) and the leaf/child, so recursion is not needed.
+		   Levels above the twig will be allocated for
+		   write-optimization before the transaction commits.  */
+
+		/* Do the recursive step, allocating zero or more of our
+		 * ancestors. */
+		ret = alloc_one_ancestor(&pos->coord, pos);
+
+	} else {
+		if (!znode_is_root(pos->lock.node)) {
+			/* all formatted nodes except tree root */
+			ret =
+			    reiser4_get_parent(&plock, pos->lock.node,
+					       ZNODE_WRITE_LOCK);
+			if (ret)
+				goto exit;
+
+			ret = incr_load_count_znode(&pload, plock.node);
+			if (ret)
+				goto exit;
+
+			ret =
+			    find_child_ptr(plock.node, pos->lock.node, &pcoord);
+			if (ret)
+				goto exit;
+
+			ret =
+			    reverse_relocate_check_dirty_parent(ZJNODE
+								(pos->lock.
+								 node), &pcoord,
+								pos);
+			if (ret)
+				goto exit;
+
+			ret = alloc_one_ancestor(&pcoord, pos);
+			if (ret)
+				goto exit;
+		}
+
+		ret = allocate_znode(pos->lock.node, &pcoord, pos);
+	}
+      exit:
+	done_load_count(&pload);
+	done_lh(&plock);
+	return ret;
+}
+
+/* This is the recursive step described in alloc_pos_and_ancestors, above.  Ignoring the
+   call to set_preceder, which is the next function described, this checks if the
+   child is a leftmost child and returns if it is not.  If the child is a leftmost child
+   it checks for relocation, possibly dirtying the parent.  Then it performs the recursive
+   step. */
+static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos)
+{
+	int ret = 0;
+	lock_handle alock;
+	load_count aload;
+	coord_t acoord;
+
+	/* As we ascend at the left-edge of the region to flush, take this opportunity at
+	   the twig level to find our parent-first preceder unless we have already set
+	   it. */
+	if (pos->preceder.blk == 0) {
+		ret = set_preceder(coord, pos);
+		if (ret != 0)
+			return ret;
+	}
+
+	/* If the ancestor is clean or already allocated, or if the child is not a
+	   leftmost child, stop going up, even leaving coord->node not flushprepped. */
+	if (znode_check_flushprepped(coord->node)
+	    || !coord_is_leftmost_unit(coord))
+		return 0;
+
+	init_lh(&alock);
+	init_load_count(&aload);
+	coord_init_invalid(&acoord, NULL);
+
+	/* Only ascend to the next level if it is a leftmost child, but write-lock the
+	   parent in case we will relocate the child. */
+	if (!znode_is_root(coord->node)) {
+
+		ret =
+		    jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
+					    &alock, &aload, ZNODE_WRITE_LOCK,
+					    0);
+		if (ret != 0) {
+			/* FIXME(C): check EINVAL, E_DEADLOCK */
+			goto exit;
+		}
+
+		ret =
+		    reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
+							&acoord, pos);
+		if (ret != 0) {
+			goto exit;
+		}
+
+		/* Recursive call. */
+		if (!znode_check_flushprepped(acoord.node)) {
+			ret = alloc_one_ancestor(&acoord, pos);
+			if (ret)
+				goto exit;
+		}
+	}
+
+	/* Note: we call allocate with the parent write-locked (except at the root) in
+	   case we relocate the child, in which case it will modify the parent during this
+	   call. */
+	ret = allocate_znode(coord->node, &acoord, pos);
+
+      exit:
+	done_load_count(&aload);
+	done_lh(&alock);
+	return ret;
+}
+
+/* During the reverse parent-first alloc_pos_and_ancestors process described above there is
+   a call to this function at the twig level.  During alloc_pos_and_ancestors we may ask:
+   should this node be relocated (in reverse parent-first context)?  We repeat this
+   process as long as the child is the leftmost child, eventually reaching an ancestor of
+   the flush point that is not a leftmost child.  The preceder of that ancestors, which is
+   not a leftmost child, is actually on the leaf level.  The preceder of that block is the
+   left-neighbor of the flush point.  The preceder of that block is the rightmost child of
+   the twig on the left.  So, when alloc_pos_and_ancestors passes upward through the twig
+   level, it stops momentarily to remember the block of the rightmost child of the twig on
+   the left and sets it to the flush_position's preceder_hint.
+
+   There is one other place where we may set the flush_position's preceder hint, which is
+   during scan-left.
+*/
+static int set_preceder(const coord_t * coord_in, flush_pos_t * pos)
+{
+	int ret;
+	coord_t coord;
+	lock_handle left_lock;
+	load_count left_load;
+
+	coord_dup(&coord, coord_in);
+
+	init_lh(&left_lock);
+	init_load_count(&left_load);
+
+	/* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test.
+	   coord_is_leftmost_unit is not the right test if the unformatted child is in the
+	   middle of the first extent unit. */
+	if (!coord_is_leftmost_unit(&coord)) {
+		coord_prev_unit(&coord);
+	} else {
+		ret =
+		    reiser4_get_left_neighbor(&left_lock, coord.node,
+					      ZNODE_READ_LOCK, GN_SAME_ATOM);
+		if (ret) {
+			/* If we fail for any reason it doesn't matter because the
+			   preceder is only a hint.  We are low-priority at this point, so
+			   this must be the case. */
+			if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
+			    ret == -ENOENT || ret == -EINVAL
+			    || ret == -E_DEADLOCK) {
+				ret = 0;
+			}
+			goto exit;
+		}
+
+		ret = incr_load_count_znode(&left_load, left_lock.node);
+		if (ret)
+			goto exit;
+
+		coord_init_last_unit(&coord, left_lock.node);
+	}
+
+	ret =
+	    item_utmost_child_real_block(&coord, RIGHT_SIDE,
+					 &pos->preceder.blk);
+      exit:
+	check_preceder(pos->preceder.blk);
+	done_load_count(&left_load);
+	done_lh(&left_lock);
+	return ret;
+}
+
+/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
+
+/* This procedure implements the outer loop of the flush algorithm.  To put this in
+   context, here is the general list of steps taken by the flush routine as a whole:
+
+   1. Scan-left
+   2. Scan-right (maybe)
+   3. Allocate initial flush position and its ancestors
+   4. <handle extents>
+   5. <squeeze and next position and its ancestors to-the-right,
+       then update position to-the-right>
+   6. <repeat from #4 until flush is stopped>
+
+   This procedure implements the loop in steps 4 through 6 in the above listing.
+
+   Step 4: if the current flush position is an extent item (position on the twig level),
+   it allocates the extent (allocate_extent_item_in_place) then shifts to the next
+   coordinate.  If the next coordinate's leftmost child needs flushprep, we will continue.
+   If the next coordinate is an internal item, we descend back to the leaf level,
+   otherwise we repeat a step #4 (labeled ALLOC_EXTENTS below).  If the "next coordinate"
+   brings us past the end of the twig level, then we call
+   reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to
+   step #5 which moves to the right.
+
+   Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the
+   tree to allocate any ancestors of the next-right flush position that are not also
+   ancestors of the current position.  Those ancestors (in top-down order) are the next in
+   parent-first order.  We squeeze adjacent nodes on the way up until the right node and
+   current node share the same parent, then allocate on the way back down.  Finally, this
+   step sets the flush position to the next-right node.  Then repeat steps 4 and 5.
+*/
+
+/* SQUEEZE CODE */
+
+/* squalloc_right_twig helper function, cut a range of extent items from
+   cut node to->node from the beginning up to coord @to. */
+static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key,
+				   znode * left)
+{
+	coord_t from;
+	reiser4_key from_key;
+
+	coord_init_first_unit(&from, to->node);
+	item_key_by_coord(&from, &from_key);
+
+	return cut_node_content(&from, to, &from_key, to_key, NULL);
+}
+
+/* Copy as much of the leading extents from @right to @left, allocating
+   unallocated extents as they are copied.  Returns SQUEEZE_TARGET_FULL or
+   SQUEEZE_SOURCE_EMPTY when no more can be shifted.  If the next item is an
+   internal item it calls shift_one_internal_unit and may then return
+   SUBTREE_MOVED. */
+static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos)
+{
+	int ret = SUBTREE_MOVED;
+	coord_t coord;		/* used to iterate over items */
+	reiser4_key stop_key;
+
+	assert("jmacd-2008", !node_is_empty(right));
+	coord_init_first_unit(&coord, right);
+
+	/* FIXME: can be optimized to cut once */
+	while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
+		ON_DEBUG(void *vp);
+
+		assert("vs-1468", coord_is_leftmost_unit(&coord));
+		ON_DEBUG(vp = shift_check_prepare(left, coord.node));
+
+		/* stop_key is used to find what was copied and what to cut */
+		stop_key = *reiser4_min_key();
+		ret = squalloc_extent(left, &coord, pos, &stop_key);
+		if (ret != SQUEEZE_CONTINUE) {
+			ON_DEBUG(kfree(vp));
+			break;
+		}
+		assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));
+
+		/* Helper function to do the cutting. */
+		set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
+		check_me("vs-1466",
+			 squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
+
+		ON_DEBUG(shift_check(vp, left, coord.node));
+	}
+
+	if (node_is_empty(coord.node))
+		ret = SQUEEZE_SOURCE_EMPTY;
+
+	if (ret == SQUEEZE_TARGET_FULL) {
+		goto out;
+	}
+
+	if (node_is_empty(right)) {
+		/* The whole right node was copied into @left. */
+		assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
+		goto out;
+	}
+
+	coord_init_first_unit(&coord, right);
+
+	if (!item_is_internal(&coord)) {
+		/* we do not want to squeeze anything else to left neighbor because "slum"
+		   is over */
+		ret = SQUEEZE_TARGET_FULL;
+		goto out;
+	}
+	assert("jmacd-433", item_is_internal(&coord));
+
+	/* Shift an internal unit.  The child must be allocated before shifting any more
+	   extents, so we stop here. */
+	ret = shift_one_internal_unit(left, right);
+
+      out:
+	assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
+	       || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
+
+	if (ret == SQUEEZE_TARGET_FULL) {
+		/* We submit prepped nodes here and expect that this @left twig
+		 * will not be modified again during this jnode_flush() call. */
+		int ret1;
+
+		/* NOTE: seems like io is done under long term locks. */
+		ret1 = write_prepped_nodes(pos);
+		if (ret1 < 0)
+			return ret1;
+	}
+
+	return ret;
+}
+
+#if REISER4_DEBUG
+static void item_convert_invariant(flush_pos_t * pos)
+{
+	assert("edward-1225", coord_is_existing_item(&pos->coord));
+	if (chaining_data_present(pos)) {
+		item_plugin *iplug = item_convert_plug(pos);
+
+		assert("edward-1000",
+		       iplug == item_plugin_by_coord(&pos->coord));
+		assert("edward-1001", iplug->f.convert != NULL);
+	} else
+		assert("edward-1226", pos->child == NULL);
+}
+#else
+
+#define item_convert_invariant(pos) noop
+
+#endif
+
+/* Scan node items starting from the first one and apply for each
+   item its flush ->convert() method (if any). This method may
+   resize/kill the item so the tree will be changed.
+*/
+static int convert_node(flush_pos_t * pos, znode * node)
+{
+	int ret = 0;
+	item_plugin *iplug;
+
+	assert("edward-304", pos != NULL);
+	assert("edward-305", pos->child == NULL);
+	assert("edward-475", znode_convertible(node));
+	assert("edward-669", znode_is_wlocked(node));
+	assert("edward-1210", !node_is_empty(node));
+
+	if (znode_get_level(node) != LEAF_LEVEL)
+		/* unsupported */
+		goto exit;
+
+	coord_init_first_unit(&pos->coord, node);
+
+	while (1) {
+		ret = 0;
+		coord_set_to_left(&pos->coord);
+		item_convert_invariant(pos);
+
+		iplug = item_plugin_by_coord(&pos->coord);
+		assert("edward-844", iplug != NULL);
+
+		if (iplug->f.convert) {
+			ret = iplug->f.convert(pos);
+			if (ret)
+				goto exit;
+		}
+		assert("edward-307", pos->child == NULL);
+
+		if (coord_next_item(&pos->coord)) {
+			/* node is over */
+
+			if (!chaining_data_present(pos))
+				/* finished this node */
+				break;
+			if (should_chain_next_node(pos)) {
+				/* go to next node */
+				move_chaining_data(pos, 0 /* to next node */ );
+				break;
+			}
+			/* repeat this node */
+			move_chaining_data(pos, 1 /* this node */ );
+			continue;
+		}
+		/* Node is not over.
+		   Check if there is attached convert data.
+		   If so roll one item position back and repeat
+		   on this node
+		 */
+		if (chaining_data_present(pos)) {
+
+			if (iplug != item_plugin_by_coord(&pos->coord))
+				set_item_convert_count(pos, 0);
+
+			ret = coord_prev_item(&pos->coord);
+			assert("edward-1003", !ret);
+
+			move_chaining_data(pos, 1 /* this node */ );
+		}
+	}
+	JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
+	znode_make_dirty(node);
+      exit:
+	assert("edward-1004", !ret);
+	return ret;
+}
+
+/* Squeeze and allocate the right neighbor.  This is called after @left and
+   its current children have been squeezed and allocated already.  This
+   procedure's job is to squeeze and items from @right to @left.
+
+   If at the leaf level, use the shift_everything_left memcpy-optimized
+   version of shifting (squeeze_right_leaf).
+
+   If at the twig level, extents are allocated as they are shifted from @right
+   to @left (squalloc_right_twig).
+
+   At any other level, shift one internal item and return to the caller
+   (squalloc_parent_first) so that the shifted-subtree can be processed in
+   parent-first order.
+
+   When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is
+   returned.  When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
+   returned.  If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
+   is returned.
+*/
+
+static int squeeze_right_neighbor(flush_pos_t * pos, znode * left,
+				  znode * right)
+{
+	int ret;
+
+	/* FIXME it is possible to see empty hasn't-heard-banshee node in a
+	 * tree owing to error (for example, ENOSPC) in write */
+	/* assert("jmacd-9321", !node_is_empty(left)); */
+	assert("jmacd-9322", !node_is_empty(right));
+	assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
+
+	switch (znode_get_level(left)) {
+	case TWIG_LEVEL:
+		/* Shift with extent allocating until either an internal item
+		   is encountered or everything is shifted or no free space
+		   left in @left */
+		ret = squeeze_right_twig(left, right, pos);
+		break;
+
+	default:
+		/* All other levels can use shift_everything until we implement per-item
+		   flush plugins. */
+		ret = squeeze_right_non_twig(left, right);
+		break;
+	}
+
+	assert("jmacd-2011", (ret < 0 ||
+			      ret == SQUEEZE_SOURCE_EMPTY
+			      || ret == SQUEEZE_TARGET_FULL
+			      || ret == SUBTREE_MOVED));
+	return ret;
+}
+
+static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos,
+						znode * right)
+{
+	int ret;
+
+	ret = squeeze_right_twig(pos->lock.node, right, pos);
+	if (ret < 0)
+		return ret;
+	if (ret > 0) {
+		coord_init_after_last_item(&pos->coord, pos->lock.node);
+		return ret;
+	}
+
+	coord_init_last_unit(&pos->coord, pos->lock.node);
+	return 0;
+}
+
+/* forward declaration */
+static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
+
+/* do a fast check for "same parents" condition before calling
+ * squalloc_upper_levels() */
+static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos,
+							  znode * left,
+							  znode * right)
+{
+	if (znode_same_parents(left, right))
+		return 0;
+
+	return squalloc_upper_levels(pos, left, right);
+}
+
+/* Check whether the parent of given @right node needs to be processes
+   ((re)allocated) prior to processing of the child.  If @left and @right do not
+   share at least the parent of the @right is after the @left but before the
+   @right in parent-first order, we have to (re)allocate it before the @right
+   gets (re)allocated. */
+static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right)
+{
+	int ret;
+
+	lock_handle left_parent_lock;
+	lock_handle right_parent_lock;
+
+	load_count left_parent_load;
+	load_count right_parent_load;
+
+	init_lh(&left_parent_lock);
+	init_lh(&right_parent_lock);
+
+	init_load_count(&left_parent_load);
+	init_load_count(&right_parent_load);
+
+	ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
+	if (ret)
+		goto out;
+
+	ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
+	if (ret)
+		goto out;
+
+	/* Check for same parents */
+	if (left_parent_lock.node == right_parent_lock.node)
+		goto out;
+
+	if (znode_check_flushprepped(right_parent_lock.node)) {
+		/* Keep parent-first order.  In the order, the right parent node stands
+		   before the @right node.  If it is already allocated, we set the
+		   preceder (next block search start point) to its block number, @right
+		   node should be allocated after it.
+
+		   However, preceder is set only if the right parent is on twig level.
+		   The explanation is the following: new branch nodes are allocated over
+		   already allocated children while the tree grows, it is difficult to
+		   keep tree ordered, we assume that only leaves and twings are correctly
+		   allocated.  So, only twigs are used as a preceder for allocating of the
+		   rest of the slum. */
+		if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
+			pos->preceder.blk =
+			    *znode_get_block(right_parent_lock.node);
+			check_preceder(pos->preceder.blk);
+		}
+		goto out;
+	}
+
+	ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
+	if (ret)
+		goto out;
+
+	ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
+	if (ret)
+		goto out;
+
+	ret =
+	    squeeze_right_neighbor(pos, left_parent_lock.node,
+				   right_parent_lock.node);
+	/* We stop if error. We stop if some items/units were shifted (ret == 0)
+	 * and thus @right changed its parent. It means we have not process
+	 * right_parent node prior to processing of @right. Positive return
+	 * values say that shifting items was not happen because of "empty
+	 * source" or "target full" conditions. */
+	if (ret <= 0)
+		goto out;
+
+	/* parent(@left) and parent(@right) may have different parents also. We
+	 * do a recursive call for checking that. */
+	ret =
+	    check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
+						    right_parent_lock.node);
+	if (ret)
+		goto out;
+
+	/* allocate znode when going down */
+	ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
+
+      out:
+	done_load_count(&left_parent_load);
+	done_load_count(&right_parent_load);
+
+	done_lh(&left_parent_lock);
+	done_lh(&right_parent_lock);
+
+	return ret;
+}
+
+/* Check the leftmost child "flushprepped" status, also returns true if child
+ * node was not found in cache.  */
+static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord)
+{
+	int ret;
+	int prepped;
+
+	jnode *child;
+
+	ret = get_leftmost_child_of_unit(coord, &child);
+
+	if (ret)
+		return ret;
+
+	if (child) {
+		prepped = jnode_check_flushprepped(child);
+		jput(child);
+	} else {
+		/* We consider not existing child as a node which slum
+		   processing should not continue to.  Not cached node is clean,
+		   so it is flushprepped. */
+		prepped = 1;
+	}
+
+	return prepped;
+}
+
+/* (re)allocate znode with automated getting parent node */
+static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos)
+{
+	int ret;
+	lock_handle parent_lock;
+	load_count parent_load;
+	coord_t pcoord;
+
+	assert("zam-851", znode_is_write_locked(node));
+
+	init_lh(&parent_lock);
+	init_load_count(&parent_load);
+
+	ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
+	if (ret)
+		goto out;
+
+	ret = incr_load_count_znode(&parent_load, parent_lock.node);
+	if (ret)
+		goto out;
+
+	ret = find_child_ptr(parent_lock.node, node, &pcoord);
+	if (ret)
+		goto out;
+
+	ret = allocate_znode(node, &pcoord, pos);
+
+      out:
+	done_load_count(&parent_load);
+	done_lh(&parent_lock);
+	return ret;
+}
+
+/* Process nodes on leaf level until unformatted node or rightmost node in the
+ * slum reached.  */
+static int handle_pos_on_formatted(flush_pos_t * pos)
+{
+	int ret;
+	lock_handle right_lock;
+	load_count right_load;
+
+	init_lh(&right_lock);
+	init_load_count(&right_load);
+
+	if (should_convert_node(pos, pos->lock.node)) {
+		ret = convert_node(pos, pos->lock.node);
+		if (ret)
+			return ret;
+	}
+
+	while (1) {
+		int expected;
+		expected = should_convert_next_node(pos);
+		ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
+				       ZNODE_WRITE_LOCK, !expected, expected);
+		if (ret) {
+			if (expected)
+				warning("edward-1495",
+				"Expected neighbor not found (ret = %d). Fsck?",
+					ret);
+			break;
+		}
+
+		/* we don't prep(allocate) nodes for flushing twice.  This can be suboptimal, or it
+		 * can be optimal.  For now we choose to live with the risk that it will
+		 * be suboptimal because it would be quite complex to code it to be
+		 * smarter. */
+		if (znode_check_flushprepped(right_lock.node)
+		    && !znode_convertible(right_lock.node)) {
+			assert("edward-1005", !should_convert_next_node(pos));
+			pos_stop(pos);
+			break;
+		}
+
+		ret = incr_load_count_znode(&right_load, right_lock.node);
+		if (ret)
+			break;
+		if (should_convert_node(pos, right_lock.node)) {
+			ret = convert_node(pos, right_lock.node);
+			if (ret)
+				break;
+			if (node_is_empty(right_lock.node)) {
+				/* node became empty after converting, repeat */
+				done_load_count(&right_load);
+				done_lh(&right_lock);
+				continue;
+			}
+		}
+
+		/* squeeze _before_ going upward. */
+		ret =
+		    squeeze_right_neighbor(pos, pos->lock.node,
+					   right_lock.node);
+		if (ret < 0)
+			break;
+
+		if (znode_check_flushprepped(right_lock.node)) {
+			if (should_convert_next_node(pos)) {
+				/* in spite of flushprepped status of the node,
+				   its right slum neighbor should be converted */
+				assert("edward-953", convert_data(pos));
+				assert("edward-954", item_convert_data(pos));
+
+				if (node_is_empty(right_lock.node)) {
+					done_load_count(&right_load);
+					done_lh(&right_lock);
+				} else
+					move_flush_pos(pos, &right_lock,
+						       &right_load, NULL);
+				continue;
+			}
+			pos_stop(pos);
+			break;
+		}
+
+		if (node_is_empty(right_lock.node)) {
+			/* repeat if right node was squeezed completely */
+			done_load_count(&right_load);
+			done_lh(&right_lock);
+			continue;
+		}
+
+		/* parent(right_lock.node) has to be processed before
+		 * (right_lock.node) due to "parent-first" allocation order. */
+		ret =
+		    check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
+							    right_lock.node);
+		if (ret)
+			break;
+		/* (re)allocate _after_ going upward */
+		ret = lock_parent_and_allocate_znode(right_lock.node, pos);
+		if (ret)
+			break;
+		if (should_terminate_squalloc(pos)) {
+			set_item_convert_count(pos, 0);
+			break;
+		}
+
+		/* advance the flush position to the right neighbor */
+		move_flush_pos(pos, &right_lock, &right_load, NULL);
+
+		ret = rapid_flush(pos);
+		if (ret)
+			break;
+	}
+	check_convert_info(pos);
+	done_load_count(&right_load);
+	done_lh(&right_lock);
+
+	/* This function indicates via pos whether to stop or go to twig or continue on current
+	 * level. */
+	return ret;
+
+}
+
+/* Process nodes on leaf level until unformatted node or rightmost node in the
+ * slum reached.  */
+static int handle_pos_on_leaf(flush_pos_t * pos)
+{
+	int ret;
+
+	assert("zam-845", pos->state == POS_ON_LEAF);
+
+	ret = handle_pos_on_formatted(pos);
+
+	if (ret == -E_NO_NEIGHBOR) {
+		/* cannot get right neighbor, go process extents. */
+		pos->state = POS_TO_TWIG;
+		return 0;
+	}
+
+	return ret;
+}
+
+/* Process slum on level > 1 */
+static int handle_pos_on_internal(flush_pos_t * pos)
+{
+	assert("zam-850", pos->state == POS_ON_INTERNAL);
+	return handle_pos_on_formatted(pos);
+}
+
+/* check whether squalloc should stop before processing given extent */
+static int squalloc_extent_should_stop(flush_pos_t * pos)
+{
+	assert("zam-869", item_is_extent(&pos->coord));
+
+	/* pos->child is a jnode handle_pos_on_extent() should start with in
+	 * stead of the first child of the first extent unit. */
+	if (pos->child) {
+		int prepped;
+
+		assert("vs-1383", jnode_is_unformatted(pos->child));
+		prepped = jnode_check_flushprepped(pos->child);
+		pos->pos_in_unit =
+		    jnode_get_index(pos->child) -
+		    extent_unit_index(&pos->coord);
+		assert("vs-1470",
+		       pos->pos_in_unit < extent_unit_width(&pos->coord));
+		assert("nikita-3434",
+		       ergo(extent_is_unallocated(&pos->coord),
+			    pos->pos_in_unit == 0));
+		jput(pos->child);
+		pos->child = NULL;
+
+		return prepped;
+	}
+
+	pos->pos_in_unit = 0;
+	if (extent_is_unallocated(&pos->coord))
+		return 0;
+
+	return leftmost_child_of_unit_check_flushprepped(&pos->coord);
+}
+
+/* Handle the case when regular reiser4 tree (znodes connected one to its
+ * neighbors by sibling pointers) is interrupted on leaf level by one or more
+ * unformatted nodes.  By having a lock on twig level and use extent code
+ * routines to process unformatted nodes we swim around an irregular part of
+ * reiser4 tree. */
+static int handle_pos_on_twig(flush_pos_t * pos)
+{
+	int ret;
+
+	assert("zam-844", pos->state == POS_ON_EPOINT);
+	assert("zam-843", item_is_extent(&pos->coord));
+
+	/* We decide should we continue slum processing with current extent
+	   unit: if leftmost child of current extent unit is flushprepped
+	   (i.e. clean or already processed by flush) we stop squalloc().  There
+	   is a fast check for unallocated extents which we assume contain all
+	   not flushprepped nodes. */
+	/* FIXME: Here we implement simple check, we are only looking on the
+	   leftmost child. */
+	ret = squalloc_extent_should_stop(pos);
+	if (ret != 0) {
+		pos_stop(pos);
+		return ret;
+	}
+
+	while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
+	       && item_is_extent(&pos->coord)) {
+		ret = reiser4_alloc_extent(pos);
+		if (ret) {
+			break;
+		}
+		coord_next_unit(&pos->coord);
+	}
+
+	if (coord_is_after_rightmost(&pos->coord)) {
+		pos->state = POS_END_OF_TWIG;
+		return 0;
+	}
+	if (item_is_internal(&pos->coord)) {
+		pos->state = POS_TO_LEAF;
+		return 0;
+	}
+
+	assert("zam-860", item_is_extent(&pos->coord));
+
+	/* "slum" is over */
+	pos->state = POS_INVALID;
+	return 0;
+}
+
+/* When we about to return flush position from twig to leaf level we can process
+ * the right twig node or move position to the leaf.  This processes right twig
+ * if it is possible and jump to leaf level if not. */
+static int handle_pos_end_of_twig(flush_pos_t * pos)
+{
+	int ret;
+	lock_handle right_lock;
+	load_count right_load;
+	coord_t at_right;
+	jnode *child = NULL;
+
+	assert("zam-848", pos->state == POS_END_OF_TWIG);
+	assert("zam-849", coord_is_after_rightmost(&pos->coord));
+
+	init_lh(&right_lock);
+	init_load_count(&right_load);
+
+	/* We get a lock on the right twig node even it is not dirty because
+	 * slum continues or discontinues on leaf level not on next twig. This
+	 * lock on the right twig is needed for getting its leftmost child. */
+	ret =
+	    reiser4_get_right_neighbor(&right_lock, pos->lock.node,
+				       ZNODE_WRITE_LOCK, GN_SAME_ATOM);
+	if (ret)
+		goto out;
+
+	ret = incr_load_count_znode(&right_load, right_lock.node);
+	if (ret)
+		goto out;
+
+	/* right twig could be not dirty */
+	if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
+		/* If right twig node is dirty we always attempt to squeeze it
+		 * content to the left... */
+	      became_dirty:
+		ret =
+		    squeeze_right_twig_and_advance_coord(pos, right_lock.node);
+		if (ret <= 0) {
+			/* pos->coord is on internal item, go to leaf level, or
+			 * we have an error which will be caught in squalloc() */
+			pos->state = POS_TO_LEAF;
+			goto out;
+		}
+
+		/* If right twig was squeezed completely we wave to re-lock
+		 * right twig. now it is done through the top-level squalloc
+		 * routine. */
+		if (node_is_empty(right_lock.node))
+			goto out;
+
+		/* ... and prep it if it is not yet prepped */
+		if (!znode_check_flushprepped(right_lock.node)) {
+			/* As usual, process parent before ... */
+			ret =
+			    check_parents_and_squalloc_upper_levels(pos,
+								    pos->lock.
+								    node,
+								    right_lock.
+								    node);
+			if (ret)
+				goto out;
+
+			/* ... processing the child */
+			ret =
+			    lock_parent_and_allocate_znode(right_lock.node,
+							   pos);
+			if (ret)
+				goto out;
+		}
+	} else {
+		coord_init_first_unit(&at_right, right_lock.node);
+
+		/* check first child of next twig, should we continue there ? */
+		ret = get_leftmost_child_of_unit(&at_right, &child);
+		if (ret || child == NULL || jnode_check_flushprepped(child)) {
+			pos_stop(pos);
+			goto out;
+		}
+
+		/* check clean twig for possible relocation */
+		if (!znode_check_flushprepped(right_lock.node)) {
+			ret =
+			    reverse_relocate_check_dirty_parent(child,
+								&at_right, pos);
+			if (ret)
+				goto out;
+			if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
+				goto became_dirty;
+		}
+	}
+
+	assert("zam-875", znode_check_flushprepped(right_lock.node));
+
+	/* Update the preceder by a block number of just processed right twig
+	 * node. The code above could miss the preceder updating because
+	 * allocate_znode() could not be called for this node. */
+	pos->preceder.blk = *znode_get_block(right_lock.node);
+	check_preceder(pos->preceder.blk);
+
+	coord_init_first_unit(&at_right, right_lock.node);
+	assert("zam-868", coord_is_existing_unit(&at_right));
+
+	pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
+	move_flush_pos(pos, &right_lock, &right_load, &at_right);
+
+      out:
+	done_load_count(&right_load);
+	done_lh(&right_lock);
+
+	if (child)
+		jput(child);
+
+	return ret;
+}
+
+/* Move the pos->lock to leaf node pointed by pos->coord, check should we
+ * continue there. */
+static int handle_pos_to_leaf(flush_pos_t * pos)
+{
+	int ret;
+	lock_handle child_lock;
+	load_count child_load;
+	jnode *child;
+
+	assert("zam-846", pos->state == POS_TO_LEAF);
+	assert("zam-847", item_is_internal(&pos->coord));
+
+	init_lh(&child_lock);
+	init_load_count(&child_load);
+
+	ret = get_leftmost_child_of_unit(&pos->coord, &child);
+	if (ret)
+		return ret;
+	if (child == NULL) {
+		pos_stop(pos);
+		return 0;
+	}
+
+	if (jnode_check_flushprepped(child)) {
+		pos->state = POS_INVALID;
+		goto out;
+	}
+
+	ret =
+	    longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
+				ZNODE_LOCK_LOPRI);
+	if (ret)
+		goto out;
+
+	ret = incr_load_count_znode(&child_load, JZNODE(child));
+	if (ret)
+		goto out;
+
+	ret = allocate_znode(JZNODE(child), &pos->coord, pos);
+	if (ret)
+		goto out;
+
+	/* move flush position to leaf level */
+	pos->state = POS_ON_LEAF;
+	move_flush_pos(pos, &child_lock, &child_load, NULL);
+
+	if (node_is_empty(JZNODE(child))) {
+		ret = delete_empty_node(JZNODE(child));
+		pos->state = POS_INVALID;
+	}
+      out:
+	done_load_count(&child_load);
+	done_lh(&child_lock);
+	jput(child);
+
+	return ret;
+}
+
+/* move pos from leaf to twig, and move lock from leaf to twig. */
+/* Move pos->lock to upper (twig) level */
+static int handle_pos_to_twig(flush_pos_t * pos)
+{
+	int ret;
+
+	lock_handle parent_lock;
+	load_count parent_load;
+	coord_t pcoord;
+
+	assert("zam-852", pos->state == POS_TO_TWIG);
+
+	init_lh(&parent_lock);
+	init_load_count(&parent_load);
+
+	ret =
+	    reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
+	if (ret)
+		goto out;
+
+	ret = incr_load_count_znode(&parent_load, parent_lock.node);
+	if (ret)
+		goto out;
+
+	ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
+	if (ret)
+		goto out;
+
+	assert("zam-870", item_is_internal(&pcoord));
+	coord_next_item(&pcoord);
+
+	if (coord_is_after_rightmost(&pcoord))
+		pos->state = POS_END_OF_TWIG;
+	else if (item_is_extent(&pcoord))
+		pos->state = POS_ON_EPOINT;
+	else {
+		/* Here we understand that getting -E_NO_NEIGHBOR in
+		 * handle_pos_on_leaf() was because of just a reaching edge of
+		 * slum */
+		pos_stop(pos);
+		goto out;
+	}
+
+	move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
+
+      out:
+	done_load_count(&parent_load);
+	done_lh(&parent_lock);
+
+	return ret;
+}
+
+typedef int (*pos_state_handle_t) (flush_pos_t *);
+static pos_state_handle_t flush_pos_handlers[] = {
+	/* process formatted nodes on leaf level, keep lock on a leaf node */
+	[POS_ON_LEAF] = handle_pos_on_leaf,
+	/* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently
+	 * being processed */
+	[POS_ON_EPOINT] = handle_pos_on_twig,
+	/* move a lock from leaf node to its parent for further processing of unformatted nodes */
+	[POS_TO_TWIG] = handle_pos_to_twig,
+	/* move a lock from twig to leaf level when a processing of unformatted nodes finishes,
+	 * pos->coord points to the leaf node we jump to */
+	[POS_TO_LEAF] = handle_pos_to_leaf,
+	/* after processing last extent in the twig node, attempting to shift items from the twigs
+	 * right neighbor and process them while shifting */
+	[POS_END_OF_TWIG] = handle_pos_end_of_twig,
+	/* process formatted nodes on internal level, keep lock on an internal node */
+	[POS_ON_INTERNAL] = handle_pos_on_internal
+};
+
+/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze,
+ * encrypt) nodes and their ancestors in "parent-first" order */
+static int squalloc(flush_pos_t * pos)
+{
+	int ret = 0;
+
+	/* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for
+	 * greater CPU efficiency? Measure and see.... -Hans */
+	while (pos_valid(pos)) {
+		ret = flush_pos_handlers[pos->state] (pos);
+		if (ret < 0)
+			break;
+
+		ret = rapid_flush(pos);
+		if (ret)
+			break;
+	}
+
+	/* any positive value or -E_NO_NEIGHBOR are legal return codes for handle_pos*
+	   routines, -E_NO_NEIGHBOR means that slum edge was reached */
+	if (ret > 0 || ret == -E_NO_NEIGHBOR)
+		ret = 0;
+
+	return ret;
+}
+
+static void update_ldkey(znode * node)
+{
+	reiser4_key ldkey;
+
+	assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
+	if (node_is_empty(node))
+		return;
+
+	znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
+}
+
+/* this is to be called after calling of shift node's method to shift data from @right to
+   @left. It sets left delimiting keys of @left and @right to keys of first items of @left
+   and @right correspondingly and sets right delimiting key of @left to first key of @right */
+static void update_znode_dkeys(znode * left, znode * right)
+{
+	assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
+	assert("vs-1629", (znode_is_write_locked(left) &&
+			   znode_is_write_locked(right)));
+
+	/* we need to update left delimiting of left if it was empty before shift */
+	update_ldkey(left);
+	update_ldkey(right);
+	if (node_is_empty(right))
+		znode_set_rd_key(left, znode_get_rd_key(right));
+	else
+		znode_set_rd_key(left, znode_get_ld_key(right));
+}
+
+/* try to shift everything from @right to @left. If everything was shifted -
+   @right is removed from the tree.  Result is the number of bytes shifted. */
+static int
+shift_everything_left(znode * right, znode * left, carry_level * todo)
+{
+	coord_t from;
+	node_plugin *nplug;
+	carry_plugin_info info;
+
+	coord_init_after_last_item(&from, right);
+
+	nplug = node_plugin_by_node(right);
+	info.doing = NULL;
+	info.todo = todo;
+	return nplug->shift(&from, left, SHIFT_LEFT,
+			    1 /* delete @right if it becomes empty */ ,
+			    1
+			    /* move coord @from to node @left if everything will be shifted */
+			    ,
+			    &info);
+}
+
+/* Shift as much as possible from @right to @left using the memcpy-optimized
+   shift_everything_left.  @left and @right are formatted neighboring nodes on
+   leaf level. */
+static int squeeze_right_non_twig(znode * left, znode * right)
+{
+	int ret;
+	carry_pool *pool;
+	carry_level *todo;
+
+	assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
+
+	if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
+	    !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
+		return SQUEEZE_TARGET_FULL;
+
+	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
+	if (IS_ERR(pool))
+		return PTR_ERR(pool);
+	todo = (carry_level *) (pool + 1);
+	init_carry_level(todo, pool);
+
+	ret = shift_everything_left(right, left, todo);
+	if (ret > 0) {
+		/* something was shifted */
+		reiser4_tree *tree;
+		__u64 grabbed;
+
+		znode_make_dirty(left);
+		znode_make_dirty(right);
+
+		/* update delimiting keys of nodes which participated in
+		   shift. FIXME: it would be better to have this in shift
+		   node's operation. But it can not be done there. Nobody
+		   remembers why, though */
+		tree = znode_get_tree(left);
+		write_lock_dk(tree);
+		update_znode_dkeys(left, right);
+		write_unlock_dk(tree);
+
+		/* Carry is called to update delimiting key and, maybe, to remove empty
+		   node. */
+		grabbed = get_current_context()->grabbed_blocks;
+		ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
+		assert("nikita-3003", ret == 0);	/* reserved space is exhausted. Ask Hans. */
+		ret = reiser4_carry(todo, NULL /* previous level */ );
+		grabbed2free_mark(grabbed);
+	} else {
+		/* Shifting impossible, we return appropriate result code */
+		ret =
+		    node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
+		    SQUEEZE_TARGET_FULL;
+	}
+
+	done_carry_pool(pool);
+
+	return ret;
+}
+
+#if REISER4_DEBUG
+static int sibling_link_is_ok(const znode *left, const znode *right)
+{
+	int result;
+
+	read_lock_tree(znode_get_tree(left));
+	result = (left->right == right && left == right->left);
+	read_unlock_tree(znode_get_tree(left));
+	return result;
+}
+#endif
+
+/* Shift first unit of first item if it is an internal one.  Return
+   SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
+   SUBTREE_MOVED. */
+static int shift_one_internal_unit(znode * left, znode * right)
+{
+	int ret;
+	carry_pool *pool;
+	carry_level *todo;
+	coord_t *coord;
+	carry_plugin_info *info;
+	int size, moved;
+
+	assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
+	assert("nikita-2435", znode_is_write_locked(left));
+	assert("nikita-2436", znode_is_write_locked(right));
+	assert("nikita-2434", sibling_link_is_ok(left, right));
+
+	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
+			       sizeof(*coord) + sizeof(*info)
+#if REISER4_DEBUG
+			       + sizeof(*coord) + 2 * sizeof(reiser4_key)
+#endif
+	    );
+	if (IS_ERR(pool))
+		return PTR_ERR(pool);
+	todo = (carry_level *) (pool + 1);
+	init_carry_level(todo, pool);
+
+	coord = (coord_t *) (todo + 3);
+	coord_init_first_unit(coord, right);
+	info = (carry_plugin_info *) (coord + 1);
+
+#if REISER4_DEBUG
+	if (!node_is_empty(left)) {
+		coord_t *last;
+		reiser4_key *right_key;
+		reiser4_key *left_key;
+
+		last = (coord_t *) (info + 1);
+		right_key = (reiser4_key *) (last + 1);
+		left_key = right_key + 1;
+		coord_init_last_unit(last, left);
+
+		assert("nikita-2463",
+		       keyle(item_key_by_coord(last, left_key),
+			     item_key_by_coord(coord, right_key)));
+	}
+#endif
+
+	assert("jmacd-2007", item_is_internal(coord));
+
+	size = item_length_by_coord(coord);
+	info->todo = todo;
+	info->doing = NULL;
+
+	ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
+					       1
+					       /* delete @right if it becomes empty */
+					       ,
+					       0
+					       /* do not move coord @coord to node @left */
+					       ,
+					       info);
+
+	/* If shift returns positive, then we shifted the item. */
+	assert("vs-423", ret <= 0 || size == ret);
+	moved = (ret > 0);
+
+	if (moved) {
+		/* something was moved */
+		reiser4_tree *tree;
+		int grabbed;
+
+		znode_make_dirty(left);
+		znode_make_dirty(right);
+		tree = znode_get_tree(left);
+		write_lock_dk(tree);
+		update_znode_dkeys(left, right);
+		write_unlock_dk(tree);
+
+		/* reserve space for delimiting keys after shifting */
+		grabbed = get_current_context()->grabbed_blocks;
+		ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
+		assert("nikita-3003", ret == 0);	/* reserved space is exhausted. Ask Hans. */
+
+		ret = reiser4_carry(todo, NULL /* previous level */ );
+		grabbed2free_mark(grabbed);
+	}
+
+	done_carry_pool(pool);
+
+	if (ret != 0) {
+		/* Shift or carry operation failed. */
+		assert("jmacd-7325", ret < 0);
+		return ret;
+	}
+
+	return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
+}
+
+/* Make the final relocate/wander decision during forward parent-first squalloc for a
+   znode.  For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). */
+static int
+allocate_znode_loaded(znode * node,
+		      const coord_t * parent_coord, flush_pos_t * pos)
+{
+	int ret;
+	reiser4_super_info_data *sbinfo = get_current_super_private();
+	/* FIXME(D): We have the node write-locked and should have checked for !
+	   allocated() somewhere before reaching this point, but there can be a race, so
+	   this assertion is bogus. */
+	assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
+	assert("jmacd-7988", znode_is_write_locked(node));
+	assert("jmacd-7989", coord_is_invalid(parent_coord)
+	       || znode_is_write_locked(parent_coord->node));
+
+	if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
+	    znode_is_root(node) ||
+	    /* We have enough nodes to relocate no matter what. */
+	    (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
+		/* No need to decide with new nodes, they are treated the same as
+		   relocate. If the root node is dirty, relocate. */
+		if (pos->preceder.blk == 0) {
+			/* preceder is unknown and we have decided to relocate node --
+			   using of default value for search start is better than search
+			   from block #0. */
+			get_blocknr_hint_default(&pos->preceder.blk);
+			check_preceder(pos->preceder.blk);
+		}
+
+		goto best_reloc;
+
+	} else if (pos->preceder.blk == 0) {
+		/* If we don't know the preceder, leave it where it is. */
+		jnode_make_wander(ZJNODE(node));
+	} else {
+		/* Make a decision based on block distance. */
+		reiser4_block_nr dist;
+		reiser4_block_nr nblk = *znode_get_block(node);
+
+		assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
+		assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk));
+		assert("jmacd-6174", pos->preceder.blk != 0);
+
+		if (pos->preceder.blk == nblk - 1) {
+			/* Ideal. */
+			jnode_make_wander(ZJNODE(node));
+		} else {
+
+			dist =
+			    (nblk <
+			     pos->preceder.blk) ? (pos->preceder.blk -
+						   nblk) : (nblk -
+							    pos->preceder.blk);
+
+			/* See if we can find a closer block (forward direction only). */
+			pos->preceder.max_dist =
+			    min((reiser4_block_nr) sbinfo->flush.
+				relocate_distance, dist);
+			pos->preceder.level = znode_get_level(node);
+
+			ret = allocate_znode_update(node, parent_coord, pos);
+
+			pos->preceder.max_dist = 0;
+
+			if (ret && (ret != -ENOSPC))
+				return ret;
+
+			if (ret == 0) {
+				/* Got a better allocation. */
+				znode_make_reloc(node, pos->fq);
+			} else if (dist < sbinfo->flush.relocate_distance) {
+				/* The present allocation is good enough. */
+				jnode_make_wander(ZJNODE(node));
+			} else {
+				/* Otherwise, try to relocate to the best position. */
+			      best_reloc:
+				ret =
+				    allocate_znode_update(node, parent_coord,
+							  pos);
+				if (ret != 0)
+					return ret;
+
+				/* set JNODE_RELOC bit _after_ node gets allocated */
+				znode_make_reloc(node, pos->fq);
+			}
+		}
+	}
+
+	/* This is the new preceder. */
+	pos->preceder.blk = *znode_get_block(node);
+	check_preceder(pos->preceder.blk);
+	pos->alloc_cnt += 1;
+
+	assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));
+
+	return 0;
+}
+
+static int
+allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
+{
+	/*
+	 * perform znode allocation with znode pinned in memory to avoid races
+	 * with asynchronous emergency flush (which plays with
+	 * JNODE_FLUSH_RESERVED bit).
+	 */
+	return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
+}
+
+/* A subroutine of allocate_znode, this is called first to see if there is a close
+   position to relocate to.  It may return ENOSPC if there is no close position.  If there
+   is no close position it may not relocate.  This takes care of updating the parent node
+   with the relocated block address. */
+static int
+allocate_znode_update(znode * node, const coord_t * parent_coord,
+		      flush_pos_t * pos)
+{
+	int ret;
+	reiser4_block_nr blk;
+	lock_handle uber_lock;
+	int flush_reserved_used = 0;
+	int grabbed;
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	init_lh(&uber_lock);
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	grabbed = ctx->grabbed_blocks;
+
+	/* discard e-flush allocation */
+	ret = zload(node);
+	if (ret)
+		return ret;
+
+	if (ZF_ISSET(node, JNODE_CREATED)) {
+		assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node)));
+		pos->preceder.block_stage = BLOCK_UNALLOCATED;
+	} else {
+		pos->preceder.block_stage = BLOCK_GRABBED;
+
+		/* The disk space for relocating the @node is already reserved in "flush reserved"
+		 * counter if @node is leaf, otherwise we grab space using BA_RESERVED (means grab
+		 * space from whole disk not from only 95%). */
+		if (znode_get_level(node) == LEAF_LEVEL) {
+			/*
+			 * earlier (during do_jnode_make_dirty()) we decided
+			 * that @node can possibly go into overwrite set and
+			 * reserved block for its wandering location.
+			 */
+			txn_atom *atom = get_current_atom_locked();
+			assert("nikita-3449",
+			       ZF_ISSET(node, JNODE_FLUSH_RESERVED));
+			flush_reserved2grabbed(atom, (__u64) 1);
+			spin_unlock_atom(atom);
+			/*
+			 * we are trying to move node into relocate
+			 * set. Allocation of relocated position "uses"
+			 * reserved block.
+			 */
+			ZF_CLR(node, JNODE_FLUSH_RESERVED);
+			flush_reserved_used = 1;
+		} else {
+			ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
+			if (ret != 0)
+				goto exit;
+		}
+	}
+
+	/* We may do not use 5% of reserved disk space here and flush will not pack tightly. */
+	ret = reiser4_alloc_block(&pos->preceder, &blk,
+				  BA_FORMATTED | BA_PERMANENT);
+	if (ret)
+		goto exit;
+
+	if (!ZF_ISSET(node, JNODE_CREATED) &&
+	    (ret =
+	     reiser4_dealloc_block(znode_get_block(node), 0,
+				   BA_DEFER | BA_FORMATTED)))
+		goto exit;
+
+	if (likely(!znode_is_root(node))) {
+		item_plugin *iplug;
+
+		iplug = item_plugin_by_coord(parent_coord);
+		assert("nikita-2954", iplug->f.update != NULL);
+		iplug->f.update(parent_coord, &blk);
+
+		znode_make_dirty(parent_coord->node);
+
+	} else {
+		reiser4_tree *tree = znode_get_tree(node);
+		znode *uber;
+
+		/* We take a longterm lock on the fake node in order to change
+		   the root block number.  This may cause atom fusion. */
+		ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
+				     &uber_lock);
+		/* The fake node cannot be deleted, and we must have priority
+		   here, and may not be confused with ENOSPC. */
+		assert("jmacd-74412",
+		       ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
+
+		if (ret)
+			goto exit;
+
+		uber = uber_lock.node;
+
+		write_lock_tree(tree);
+		tree->root_block = blk;
+		write_unlock_tree(tree);
+
+		znode_make_dirty(uber);
+	}
+
+	ret = znode_rehash(node, &blk);
+      exit:
+	if (ret) {
+		/* Get flush reserved block back if something fails, because
+		 * callers assume that on error block wasn't relocated and its
+		 * flush reserved block wasn't used. */
+		if (flush_reserved_used) {
+			/*
+			 * ok, we failed to move node into relocate
+			 * set. Restore status quo.
+			 */
+			grabbed2flush_reserved((__u64) 1);
+			ZF_SET(node, JNODE_FLUSH_RESERVED);
+		}
+	}
+	zrelse(node);
+	done_lh(&uber_lock);
+	grabbed2free_mark(grabbed);
+	return ret;
+}
+
+/* JNODE INTERFACE */
+
+/* Lock a node (if formatted) and then get its parent locked, set the child's
+   coordinate in the parent.  If the child is the root node, the above_root
+   znode is returned but the coord is not set.  This function may cause atom
+   fusion, but it is only used for read locks (at this point) and therefore
+   fusion only occurs when the parent is already dirty. */
+/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
+   pointer in jnodes. */
+static int
+jnode_lock_parent_coord(jnode * node,
+			coord_t * coord,
+			lock_handle * parent_lh,
+			load_count * parent_zh,
+			znode_lock_mode parent_mode, int try)
+{
+	int ret;
+
+	assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
+	assert("edward-54", jnode_is_unformatted(node)
+	       || znode_is_any_locked(JZNODE(node)));
+
+	if (!jnode_is_znode(node)) {
+		reiser4_key key;
+		tree_level stop_level = TWIG_LEVEL;
+		lookup_bias bias = FIND_EXACT;
+
+		assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
+
+		/* The case when node is not znode, but can have parent coord
+		   (unformatted node, node which represents cluster page,
+		   etc..).  Generate a key for the appropriate entry, search
+		   in the tree using coord_by_key, which handles locking for
+		   us. */
+
+		/*
+		 * nothing is locked at this moment, so, nothing prevents
+		 * concurrent truncate from removing jnode from inode. To
+		 * prevent this spin-lock jnode. jnode can be truncated just
+		 * after call to the jnode_build_key(), but this is ok,
+		 * because coord_by_key() will just fail to find appropriate
+		 * extent.
+		 */
+		spin_lock_jnode(node);
+		if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
+			jnode_build_key(node, &key);
+			ret = 0;
+		} else
+			ret = RETERR(-ENOENT);
+		spin_unlock_jnode(node);
+
+		if (ret != 0)
+			return ret;
+
+		if (jnode_is_cluster_page(node))
+			stop_level = LEAF_LEVEL;
+
+		assert("jmacd-1812", coord != NULL);
+
+		ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
+				   parent_mode, bias, stop_level, stop_level,
+				   CBK_UNIQUE, NULL /*ra_info */ );
+		switch (ret) {
+		case CBK_COORD_NOTFOUND:
+			assert("edward-1038",
+			       ergo(jnode_is_cluster_page(node),
+				    JF_ISSET(node, JNODE_HEARD_BANSHEE)));
+			if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
+				warning("nikita-3177", "Parent not found");
+			return ret;
+		case CBK_COORD_FOUND:
+			if (coord->between != AT_UNIT) {
+				/* FIXME: comment needed */
+				done_lh(parent_lh);
+				if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
+					warning("nikita-3178",
+						"Found but not happy: %i",
+						coord->between);
+				}
+				return RETERR(-ENOENT);
+			}
+			ret = incr_load_count_znode(parent_zh, parent_lh->node);
+			if (ret != 0)
+				return ret;
+			/* if (jnode_is_cluster_page(node)) {
+			   races with write() are possible
+			   check_child_cluster (parent_lh->node);
+			   }
+			 */
+			break;
+		default:
+			return ret;
+		}
+
+	} else {
+		int flags;
+		znode *z;
+
+		z = JZNODE(node);
+		/* Formatted node case: */
+		assert("jmacd-2061", !znode_is_root(z));
+
+		flags = GN_ALLOW_NOT_CONNECTED;
+		if (try)
+			flags |= GN_TRY_LOCK;
+
+		ret =
+		    reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
+		if (ret != 0)
+			/* -E_REPEAT is ok here, it is handled by the caller. */
+			return ret;
+
+		/* Make the child's position "hint" up-to-date.  (Unless above
+		   root, which caller must check.) */
+		if (coord != NULL) {
+
+			ret = incr_load_count_znode(parent_zh, parent_lh->node);
+			if (ret != 0) {
+				warning("jmacd-976812386",
+					"incr_load_count_znode failed: %d",
+					ret);
+				return ret;
+			}
+
+			ret = find_child_ptr(parent_lh->node, z, coord);
+			if (ret != 0) {
+				warning("jmacd-976812",
+					"find_child_ptr failed: %d", ret);
+				return ret;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom.
+   If there is no next neighbor or the neighbor is not in memory or if there is a
+   neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned.
+   In some cases the slum may include nodes which are not dirty, if so @check_dirty should be 0 */
+static int neighbor_in_slum(znode * node,	/* starting point */
+			    lock_handle * lock,	/* lock on starting point */
+			    sideof side,	/* left or right direction we seek the next node in */
+			    znode_lock_mode mode,	/* kind of lock we want */
+			    int check_dirty, /* true if the neighbor should be dirty */
+			    int use_upper_levels /* get neighbor by going though
+						    upper levels */)
+{
+	int ret;
+	int flags;
+
+	assert("jmacd-6334", znode_is_connected(node));
+
+	flags =  GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0);
+	if (use_upper_levels)
+		flags |= GN_CAN_USE_UPPER_LEVELS;
+
+	ret = reiser4_get_neighbor(lock, node, mode, flags);
+	if (ret) {
+		/* May return -ENOENT or -E_NO_NEIGHBOR. */
+		/* FIXME(C): check EINVAL, E_DEADLOCK */
+		if (ret == -ENOENT) {
+			ret = RETERR(-E_NO_NEIGHBOR);
+		}
+		return ret;
+	}
+	if (!check_dirty)
+		return 0;
+	/* Check dirty bit of locked znode, no races here */
+	if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
+		return 0;
+
+	done_lh(lock);
+	return RETERR(-E_NO_NEIGHBOR);
+}
+
+/* Return true if two znodes have the same parent.  This is called with both nodes
+   write-locked (for squeezing) so no tree lock is needed. */
+static int znode_same_parents(znode * a, znode * b)
+{
+	int result;
+
+	assert("jmacd-7011", znode_is_write_locked(a));
+	assert("jmacd-7012", znode_is_write_locked(b));
+
+	/* We lock the whole tree for this check.... I really don't like whole tree
+	 * locks... -Hans */
+	read_lock_tree(znode_get_tree(a));
+	result = (znode_parent(a) == znode_parent(b));
+	read_unlock_tree(znode_get_tree(a));
+	return result;
+}
+
+/* FLUSH SCAN */
+
+/* Initialize the flush_scan data structure. */
+static void scan_init(flush_scan * scan)
+{
+	memset(scan, 0, sizeof(*scan));
+	init_lh(&scan->node_lock);
+	init_lh(&scan->parent_lock);
+	init_load_count(&scan->parent_load);
+	init_load_count(&scan->node_load);
+	coord_init_invalid(&scan->parent_coord, NULL);
+}
+
+/* Release any resources held by the flush scan, e.g., release locks, free memory, etc. */
+static void scan_done(flush_scan * scan)
+{
+	done_load_count(&scan->node_load);
+	if (scan->node != NULL) {
+		jput(scan->node);
+		scan->node = NULL;
+	}
+	done_load_count(&scan->parent_load);
+	done_lh(&scan->parent_lock);
+	done_lh(&scan->node_lock);
+}
+
+/* Returns true if flush scanning is finished. */
+int reiser4_scan_finished(flush_scan * scan)
+{
+	return scan->stop || (scan->direction == RIGHT_SIDE &&
+			      scan->count >= scan->max_count);
+}
+
+/* Return true if the scan should continue to the @tonode.  True if the node meets the
+   same_slum_check condition.  If not, deref the "left" node and stop the scan. */
+int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
+{
+	int go = same_slum_check(scan->node, tonode, 1, 0);
+
+	if (!go) {
+		scan->stop = 1;
+		jput(tonode);
+	}
+
+	return go;
+}
+
+/* Set the current scan->node, refcount it, increment count by the @add_count (number to
+   count, e.g., skipped unallocated nodes), deref previous current, and copy the current
+   parent coordinate. */
+int
+scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
+		 const coord_t * parent)
+{
+	/* Release the old references, take the new reference. */
+	done_load_count(&scan->node_load);
+
+	if (scan->node != NULL) {
+		jput(scan->node);
+	}
+	scan->node = node;
+	scan->count += add_count;
+
+	/* This next stmt is somewhat inefficient.  The reiser4_scan_extent() code could
+	   delay this update step until it finishes and update the parent_coord only once.
+	   It did that before, but there was a bug and this was the easiest way to make it
+	   correct. */
+	if (parent != NULL) {
+		coord_dup(&scan->parent_coord, parent);
+	}
+
+	/* Failure may happen at the incr_load_count call, but the caller can assume the reference
+	   is safely taken. */
+	return incr_load_count_jnode(&scan->node_load, node);
+}
+
+/* Return true if scanning in the leftward direction. */
+int reiser4_scanning_left(flush_scan * scan)
+{
+	return scan->direction == LEFT_SIDE;
+}
+
+/* Performs leftward scanning starting from either kind of node.  Counts the starting
+   node.  The right-scan object is passed in for the left-scan in order to copy the parent
+   of an unformatted starting position.  This way we avoid searching for the unformatted
+   node's parent when scanning in each direction.  If we search for the parent once it is
+   set in both scan objects.  The limit parameter tells flush-scan when to stop.
+
+   Rapid scanning is used only during scan_left, where we are interested in finding the
+   'leftpoint' where we begin flushing.  We are interested in stopping at the left child
+   of a twig that does not have a dirty left neighbor.  THIS IS A SPECIAL CASE.  The
+   problem is finding a way to flush only those nodes without unallocated children, and it
+   is difficult to solve in the bottom-up flushing algorithm we are currently using.  The
+   problem can be solved by scanning left at every level as we go upward, but this would
+   basically bring us back to using a top-down allocation strategy, which we already tried
+   (see BK history from May 2002), and has a different set of problems.  The top-down
+   strategy makes avoiding unallocated children easier, but makes it difficult to
+   propertly flush dirty children with clean parents that would otherwise stop the
+   top-down flush, only later to dirty the parent once the children are flushed.  So we
+   solve the problem in the bottom-up algorithm with a special case for twigs and leaves
+   only.
+
+   The first step in solving the problem is this rapid leftward scan.  After we determine
+   that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD we
+   are no longer interested in the exact count, we are only interested in finding a the
+   best place to start the flush.  We could choose one of two possibilities:
+
+   1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
+   This requires checking one leaf per rapid-scan twig
+
+   2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
+   to the left.  This requires checking possibly all of the in-memory children of each
+   twig during the rapid scan.
+
+   For now we implement the first policy.
+*/
+static int
+scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
+{
+	int ret = 0;
+
+	scan->max_count = limit;
+	scan->direction = LEFT_SIDE;
+
+	ret = scan_set_current(scan, jref(node), 1, NULL);
+	if (ret != 0) {
+		return ret;
+	}
+
+	ret = scan_common(scan, right);
+	if (ret != 0) {
+		return ret;
+	}
+
+	/* Before rapid scanning, we need a lock on scan->node so that we can get its
+	   parent, only if formatted. */
+	if (jnode_is_znode(scan->node)) {
+		ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
+					  ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
+	}
+
+	/* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */
+	return ret;
+}
+
+/* Performs rightward scanning... Does not count the starting node.  The limit parameter
+   is described in scan_left.  If the starting node is unformatted then the
+   parent_coord was already set during scan_left.  The rapid_after parameter is not used
+   during right-scanning.
+
+   scan_right is only called if the scan_left operation does not count at least
+   FLUSH_RELOCATE_THRESHOLD nodes for flushing.  Otherwise, the limit parameter is set to
+   the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning
+   scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */
+static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
+{
+	int ret;
+
+	scan->max_count = limit;
+	scan->direction = RIGHT_SIDE;
+
+	ret = scan_set_current(scan, jref(node), 0, NULL);
+	if (ret != 0) {
+		return ret;
+	}
+
+	return scan_common(scan, NULL);
+}
+
+/* Common code to perform left or right scanning. */
+static int scan_common(flush_scan * scan, flush_scan * other)
+{
+	int ret;
+
+	assert("nikita-2376", scan->node != NULL);
+	assert("edward-54", jnode_is_unformatted(scan->node)
+	       || jnode_is_znode(scan->node));
+
+	/* Special case for starting at an unformatted node.  Optimization: we only want
+	   to search for the parent (which requires a tree traversal) once.  Obviously, we
+	   shouldn't have to call it once for the left scan and once for the right scan.
+	   For this reason, if we search for the parent during scan-left we then duplicate
+	   the coord/lock/load into the scan-right object. */
+	if (jnode_is_unformatted(scan->node)) {
+		ret = scan_unformatted(scan, other);
+		if (ret != 0)
+			return ret;
+	}
+	/* This loop expects to start at a formatted position and performs chaining of
+	   formatted regions */
+	while (!reiser4_scan_finished(scan)) {
+
+		ret = scan_formatted(scan);
+		if (ret != 0) {
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int scan_unformatted(flush_scan * scan, flush_scan * other)
+{
+	int ret = 0;
+	int try = 0;
+
+	if (!coord_is_invalid(&scan->parent_coord))
+		goto scan;
+
+	/* set parent coord from */
+	if (!jnode_is_unformatted(scan->node)) {
+		/* formatted position */
+
+		lock_handle lock;
+		assert("edward-301", jnode_is_znode(scan->node));
+		init_lh(&lock);
+
+		/*
+		 * when flush starts from unformatted node, first thing it
+		 * does is tree traversal to find formatted parent of starting
+		 * node. This parent is then kept lock across scans to the
+		 * left and to the right. This means that during scan to the
+		 * left we cannot take left-ward lock, because this is
+		 * dead-lock prone. So, if we are scanning to the left and
+		 * there is already lock held by this thread,
+		 * jnode_lock_parent_coord() should use try-lock.
+		 */
+		try = reiser4_scanning_left(scan)
+		    && !lock_stack_isclean(get_current_lock_stack());
+		/* Need the node locked to get the parent lock, We have to
+		   take write lock since there is at least one call path
+		   where this znode is already write-locked by us. */
+		ret =
+		    longterm_lock_znode(&lock, JZNODE(scan->node),
+					ZNODE_WRITE_LOCK,
+					reiser4_scanning_left(scan) ?
+					ZNODE_LOCK_LOPRI :
+					ZNODE_LOCK_HIPRI);
+		if (ret != 0)
+			/* EINVAL or E_DEADLOCK here mean... try again!  At this point we've
+			   scanned too far and can't back out, just start over. */
+			return ret;
+
+		ret = jnode_lock_parent_coord(scan->node,
+					      &scan->parent_coord,
+					      &scan->parent_lock,
+					      &scan->parent_load,
+					      ZNODE_WRITE_LOCK, try);
+
+		/* FIXME(C): check EINVAL, E_DEADLOCK */
+		done_lh(&lock);
+		if (ret == -E_REPEAT) {
+			scan->stop = 1;
+			return 0;
+		}
+		if (ret)
+			return ret;
+
+	} else {
+		/* unformatted position */
+
+		ret =
+		    jnode_lock_parent_coord(scan->node, &scan->parent_coord,
+					    &scan->parent_lock,
+					    &scan->parent_load,
+					    ZNODE_WRITE_LOCK, try);
+
+		if (IS_CBKERR(ret))
+			return ret;
+
+		if (ret == CBK_COORD_NOTFOUND)
+			/* FIXME(C): check EINVAL, E_DEADLOCK */
+			return ret;
+
+		/* parent was found */
+		assert("jmacd-8661", other != NULL);
+		/* Duplicate the reference into the other flush_scan. */
+		coord_dup(&other->parent_coord, &scan->parent_coord);
+		copy_lh(&other->parent_lock, &scan->parent_lock);
+		copy_load_count(&other->parent_load, &scan->parent_load);
+	}
+      scan:
+	return scan_by_coord(scan);
+}
+
+/* Performs left- or rightward scanning starting from a formatted node. Follow left
+   pointers under tree lock as long as:
+
+   - node->left/right is non-NULL
+   - node->left/right is connected, dirty
+   - node->left/right belongs to the same atom
+   - scan has not reached maximum count
+*/
+static int scan_formatted(flush_scan * scan)
+{
+	int ret;
+	znode *neighbor = NULL;
+
+	assert("jmacd-1401", !reiser4_scan_finished(scan));
+
+	do {
+		znode *node = JZNODE(scan->node);
+
+		/* Node should be connected, but if not stop the scan. */
+		if (!znode_is_connected(node)) {
+			scan->stop = 1;
+			break;
+		}
+
+		/* Lock the tree, check-for and reference the next sibling. */
+		read_lock_tree(znode_get_tree(node));
+
+		/* It may be that a node is inserted or removed between a node and its
+		   left sibling while the tree lock is released, but the flush-scan count
+		   does not need to be precise.  Thus, we release the tree lock as soon as
+		   we get the neighboring node. */
+		neighbor =
+			reiser4_scanning_left(scan) ? node->left : node->right;
+		if (neighbor != NULL) {
+			zref(neighbor);
+		}
+
+		read_unlock_tree(znode_get_tree(node));
+
+		/* If neighbor is NULL at the leaf level, need to check for an unformatted
+		   sibling using the parent--break in any case. */
+		if (neighbor == NULL) {
+			break;
+		}
+
+		/* Check the condition for going left, break if it is not met.  This also
+		   releases (jputs) the neighbor if false. */
+		if (!reiser4_scan_goto(scan, ZJNODE(neighbor))) {
+			break;
+		}
+
+		/* Advance the flush_scan state to the left, repeat. */
+		ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
+		if (ret != 0) {
+			return ret;
+		}
+
+	} while (!reiser4_scan_finished(scan));
+
+	/* If neighbor is NULL then we reached the end of a formatted region, or else the
+	   sibling is out of memory, now check for an extent to the left (as long as
+	   LEAF_LEVEL). */
+	if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
+	    || reiser4_scan_finished(scan)) {
+		scan->stop = 1;
+		return 0;
+	}
+	/* Otherwise, calls scan_by_coord for the right(left)most item of the
+	   left(right) neighbor on the parent level, then possibly continue. */
+
+	coord_init_invalid(&scan->parent_coord, NULL);
+	return scan_unformatted(scan, NULL);
+}
+
+/* NOTE-EDWARD:
+   This scans adjacent items of the same type and calls scan flush plugin for each one.
+   Performs left(right)ward scanning starting from a (possibly) unformatted node.  If we start
+   from unformatted node, then we continue only if the next neighbor is also unformatted.
+   When called from scan_formatted, we skip first iteration (to make sure that
+   right(left)most item of the left(right) neighbor on the parent level is of the same
+   type and set appropriate coord). */
+static int scan_by_coord(flush_scan * scan)
+{
+	int ret = 0;
+	int scan_this_coord;
+	lock_handle next_lock;
+	load_count next_load;
+	coord_t next_coord;
+	jnode *child;
+	item_plugin *iplug;
+
+	init_lh(&next_lock);
+	init_load_count(&next_load);
+	scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
+
+	/* set initial item id */
+	iplug = item_plugin_by_coord(&scan->parent_coord);
+
+	for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
+		if (scan_this_coord) {
+			/* Here we expect that unit is scannable. it would not be so due
+			 * to race with extent->tail conversion.  */
+			if (iplug->f.scan == NULL) {
+				scan->stop = 1;
+				ret = -E_REPEAT;
+				/* skip the check at the end. */
+				goto race;
+			}
+
+			ret = iplug->f.scan(scan);
+			if (ret != 0)
+				goto exit;
+
+			if (reiser4_scan_finished(scan)) {
+				checkchild(scan);
+				break;
+			}
+		} else {
+			/* the same race against truncate as above is possible
+			 * here, it seems */
+
+			/* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
+			   the first coordinate. */
+			assert("jmacd-1231",
+			       item_is_internal(&scan->parent_coord));
+		}
+
+		if (iplug->f.utmost_child == NULL
+		    || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
+			/* stop this coord and continue on parrent level */
+			ret =
+			    scan_set_current(scan,
+					     ZJNODE(zref
+						    (scan->parent_coord.node)),
+					     1, NULL);
+			if (ret != 0)
+				goto exit;
+			break;
+		}
+
+		/* Either way, the invariant is that scan->parent_coord is set to the
+		   parent of scan->node. Now get the next unit. */
+		coord_dup(&next_coord, &scan->parent_coord);
+		coord_sideof_unit(&next_coord, scan->direction);
+
+		/* If off-the-end of the twig, try the next twig. */
+		if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
+			/* We take the write lock because we may start flushing from this
+			 * coordinate. */
+			ret = neighbor_in_slum(next_coord.node,
+					       &next_lock,
+					       scan->direction,
+					       ZNODE_WRITE_LOCK,
+					       1 /* check dirty */,
+					       0 /* don't go though upper
+						    levels */);
+			if (ret == -E_NO_NEIGHBOR) {
+				scan->stop = 1;
+				ret = 0;
+				break;
+			}
+
+			if (ret != 0) {
+				goto exit;
+			}
+
+			ret = incr_load_count_znode(&next_load, next_lock.node);
+			if (ret != 0) {
+				goto exit;
+			}
+
+			coord_init_sideof_unit(&next_coord, next_lock.node,
+					       sideof_reverse(scan->direction));
+		}
+
+		iplug = item_plugin_by_coord(&next_coord);
+
+		/* Get the next child. */
+		ret =
+		    iplug->f.utmost_child(&next_coord,
+					  sideof_reverse(scan->direction),
+					  &child);
+		if (ret != 0)
+			goto exit;
+		/* If the next child is not in memory, or, item_utmost_child
+		   failed (due to race with unlink, most probably), stop
+		   here. */
+		if (child == NULL || IS_ERR(child)) {
+			scan->stop = 1;
+			checkchild(scan);
+			break;
+		}
+
+		assert("nikita-2374", jnode_is_unformatted(child)
+		       || jnode_is_znode(child));
+
+		/* See if it is dirty, part of the same atom. */
+		if (!reiser4_scan_goto(scan, child)) {
+			checkchild(scan);
+			break;
+		}
+
+		/* If so, make this child current. */
+		ret = scan_set_current(scan, child, 1, &next_coord);
+		if (ret != 0)
+			goto exit;
+
+		/* Now continue.  If formatted we release the parent lock and return, then
+		   proceed. */
+		if (jnode_is_znode(child))
+			break;
+
+		/* Otherwise, repeat the above loop with next_coord. */
+		if (next_load.node != NULL) {
+			done_lh(&scan->parent_lock);
+			move_lh(&scan->parent_lock, &next_lock);
+			move_load_count(&scan->parent_load, &next_load);
+		}
+	}
+
+	assert("jmacd-6233",
+	       reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
+      exit:
+	checkchild(scan);
+      race:			/* skip the above check  */
+	if (jnode_is_znode(scan->node)) {
+		done_lh(&scan->parent_lock);
+		done_load_count(&scan->parent_load);
+	}
+
+	done_load_count(&next_load);
+	done_lh(&next_lock);
+	return ret;
+}
+
+/* FLUSH POS HELPERS */
+
+/* Initialize the fields of a flush_position. */
+static void pos_init(flush_pos_t * pos)
+{
+	memset(pos, 0, sizeof *pos);
+
+	pos->state = POS_INVALID;
+	coord_init_invalid(&pos->coord, NULL);
+	init_lh(&pos->lock);
+	init_load_count(&pos->load);
+
+	reiser4_blocknr_hint_init(&pos->preceder);
+}
+
+/* The flush loop inside squalloc periodically checks pos_valid to
+   determine when "enough flushing" has been performed.  This will return true until one
+   of the following conditions is met:
+
+   1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush"
+   parameter, meaning we have flushed as many blocks as the kernel requested.  When
+   flushing to commit, this parameter is NULL.
+
+   2. pos_stop() is called because squalloc discovers that the "next" node in the
+   flush order is either non-existant, not dirty, or not in the same atom.
+*/
+
+static int pos_valid(flush_pos_t * pos)
+{
+	return pos->state != POS_INVALID;
+}
+
+/* Release any resources of a flush_position.  Called when jnode_flush finishes. */
+static void pos_done(flush_pos_t * pos)
+{
+	pos_stop(pos);
+	reiser4_blocknr_hint_done(&pos->preceder);
+	if (convert_data(pos))
+		free_convert_data(pos);
+}
+
+/* Reset the point and parent.  Called during flush subroutines to terminate the
+   squalloc loop. */
+static int pos_stop(flush_pos_t * pos)
+{
+	pos->state = POS_INVALID;
+	done_lh(&pos->lock);
+	done_load_count(&pos->load);
+	coord_init_invalid(&pos->coord, NULL);
+
+	if (pos->child) {
+		jput(pos->child);
+		pos->child = NULL;
+	}
+
+	return 0;
+}
+
+/* Return the flush_position's block allocator hint. */
+reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos)
+{
+	return &pos->preceder;
+}
+
+flush_queue_t * reiser4_pos_fq(flush_pos_t * pos)
+{
+	return pos->fq;
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 90
+   LocalWords:  preceder
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/flush.h linux-2.6.22/fs/reiser4/flush.h
--- linux-2.6.22.orig/fs/reiser4/flush.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/flush.h	2007-07-29 00:25:34.864693371 +0400
@@ -0,0 +1,295 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* DECLARATIONS: */
+
+#if !defined(__REISER4_FLUSH_H__)
+#define __REISER4_FLUSH_H__
+
+#include "plugin/cluster.h"
+
+/* The flush_scan data structure maintains the state of an in-progress flush-scan on a
+   single level of the tree.  A flush-scan is used for counting the number of adjacent
+   nodes to flush, which is used to determine whether we should relocate, and it is also
+   used to find a starting point for flush.  A flush-scan object can scan in both right
+   and left directions via the scan_left() and scan_right() interfaces.  The
+   right- and left-variations are similar but perform different functions.  When scanning
+   left we (optionally perform rapid scanning and then) longterm-lock the endpoint node.
+   When scanning right we are simply counting the number of adjacent, dirty nodes. */
+struct flush_scan {
+
+	/* The current number of nodes scanned on this level. */
+	unsigned count;
+
+	/* There may be a maximum number of nodes for a scan on any single level.  When
+	   going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h) */
+	unsigned max_count;
+
+	/* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */
+	sideof direction;
+
+	/* Initially @stop is set to false then set true once some condition stops the
+	   search (e.g., we found a clean node before reaching max_count or we found a
+	   node belonging to another atom). */
+	int stop;
+
+	/* The current scan position.  If @node is non-NULL then its reference count has
+	   been incremented to reflect this reference. */
+	jnode *node;
+
+	/* A handle for zload/zrelse of current scan position node. */
+	load_count node_load;
+
+	/* During left-scan, if the final position (a.k.a. endpoint node) is formatted the
+	   node is locked using this lock handle.  The endpoint needs to be locked for
+	   transfer to the flush_position object after scanning finishes. */
+	lock_handle node_lock;
+
+	/* When the position is unformatted, its parent, coordinate, and parent
+	   zload/zrelse handle. */
+	lock_handle parent_lock;
+	coord_t parent_coord;
+	load_count parent_load;
+
+	/* The block allocator preceder hint.  Sometimes flush_scan determines what the
+	   preceder is and if so it sets it here, after which it is copied into the
+	   flush_position.  Otherwise, the preceder is computed later. */
+	reiser4_block_nr preceder_blk;
+};
+
+struct convert_item_info {
+	dc_item_stat d_cur;	/* disk cluster state of the current item */
+	dc_item_stat d_next;	/* disk cluster state of the next slum item */
+	struct inode *inode;
+	flow_t flow;
+};
+
+struct convert_info {
+	int count;		/* for squalloc terminating */
+	item_plugin *iplug;	/* current item plugin */
+	struct convert_item_info *itm;	/* current item info */
+	struct cluster_handle clust;	/* transform cluster */
+};
+
+typedef enum flush_position_state {
+	POS_INVALID,		/* Invalid or stopped pos, do not continue slum
+				 * processing */
+	POS_ON_LEAF,		/* pos points to already prepped, locked formatted node at
+				 * leaf level */
+	POS_ON_EPOINT,		/* pos keeps a lock on twig level, "coord" field is used
+				 * to traverse unformatted nodes */
+	POS_TO_LEAF,		/* pos is being moved to leaf level */
+	POS_TO_TWIG,		/* pos is being moved to twig level */
+	POS_END_OF_TWIG,	/* special case of POS_ON_TWIG, when coord is after
+				 * rightmost unit of the current twig */
+	POS_ON_INTERNAL		/* same as POS_ON_LEAF, but points to internal node */
+} flushpos_state_t;
+
+/* An encapsulation of the current flush point and all the parameters that are passed
+   through the entire squeeze-and-allocate stage of the flush routine.  A single
+   flush_position object is constructed after left- and right-scanning finishes. */
+struct flush_position {
+	flushpos_state_t state;
+
+	coord_t coord;		/* coord to traverse unformatted nodes */
+	lock_handle lock;	/* current lock we hold */
+	load_count load;	/* load status for current locked formatted node  */
+
+	jnode *child;		/* for passing a reference to unformatted child
+				 * across pos state changes */
+
+	reiser4_blocknr_hint preceder;	/* The flush 'hint' state. */
+	int leaf_relocate;	/* True if enough leaf-level nodes were
+				 * found to suggest a relocate policy. */
+	int alloc_cnt;		/* The number of nodes allocated during squeeze and allococate. */
+	int prep_or_free_cnt;	/* The number of nodes prepared for write (allocate) or squeezed and freed. */
+	flush_queue_t *fq;
+	long *nr_written;	/* number of nodes submitted to disk */
+	int flags;		/* a copy of jnode_flush flags argument */
+
+	znode *prev_twig;	/* previous parent pointer value, used to catch
+				 * processing of new twig node */
+	struct convert_info *sq;	/* convert info */
+
+	unsigned long pos_in_unit;	/* for extents only. Position
+					   within an extent unit of first
+					   jnode of slum */
+	long nr_to_write;	/* number of unformatted nodes to handle on flush */
+};
+
+static inline int item_convert_count(flush_pos_t * pos)
+{
+	return pos->sq->count;
+}
+static inline void inc_item_convert_count(flush_pos_t * pos)
+{
+	pos->sq->count++;
+}
+static inline void set_item_convert_count(flush_pos_t * pos, int count)
+{
+	pos->sq->count = count;
+}
+static inline item_plugin *item_convert_plug(flush_pos_t * pos)
+{
+	return pos->sq->iplug;
+}
+
+static inline struct convert_info *convert_data(flush_pos_t * pos)
+{
+	return pos->sq;
+}
+
+static inline struct convert_item_info *item_convert_data(flush_pos_t * pos)
+{
+	assert("edward-955", convert_data(pos));
+	return pos->sq->itm;
+}
+
+static inline struct tfm_cluster * tfm_cluster_sq(flush_pos_t * pos)
+{
+	return &pos->sq->clust.tc;
+}
+
+static inline struct tfm_stream * tfm_stream_sq(flush_pos_t * pos,
+						tfm_stream_id id)
+{
+	assert("edward-854", pos->sq != NULL);
+	return get_tfm_stream(tfm_cluster_sq(pos), id);
+}
+
+static inline int chaining_data_present(flush_pos_t * pos)
+{
+	return convert_data(pos) && item_convert_data(pos);
+}
+
+/* Returns true if next node contains next item of the disk cluster
+   so item convert data should be moved to the right slum neighbor.
+*/
+static inline int should_chain_next_node(flush_pos_t * pos)
+{
+	int result = 0;
+
+	assert("edward-1007", chaining_data_present(pos));
+
+	switch (item_convert_data(pos)->d_next) {
+	case DC_CHAINED_ITEM:
+		result = 1;
+		break;
+	case DC_AFTER_CLUSTER:
+		break;
+	default:
+		impossible("edward-1009", "bad state of next slum item");
+	}
+	return result;
+}
+
+/* update item state in a disk cluster to assign conversion mode */
+static inline void
+move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ )
+{
+
+	assert("edward-1010", chaining_data_present(pos));
+
+	if (this_node == 0) {
+		/* next item is on the right neighbor */
+		assert("edward-1011",
+		       item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
+		       item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
+		assert("edward-1012",
+		       item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
+
+		item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
+		item_convert_data(pos)->d_next = DC_INVALID_STATE;
+	} else {
+		/* next item is on the same node */
+		assert("edward-1013",
+		       item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
+		       item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
+		assert("edward-1227",
+		       item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
+		       item_convert_data(pos)->d_next == DC_INVALID_STATE);
+
+		item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
+		item_convert_data(pos)->d_next = DC_INVALID_STATE;
+	}
+}
+
+static inline int should_convert_node(flush_pos_t * pos, znode * node)
+{
+	return znode_convertible(node);
+}
+
+/* true if there is attached convert item info */
+static inline int should_convert_next_node(flush_pos_t * pos)
+{
+	return convert_data(pos) && item_convert_data(pos);
+}
+
+#define SQUALLOC_THRESHOLD 256
+
+static inline int should_terminate_squalloc(flush_pos_t * pos)
+{
+	return convert_data(pos) &&
+	    !item_convert_data(pos) &&
+	    item_convert_count(pos) >= SQUALLOC_THRESHOLD;
+}
+
+#if 1
+#define check_convert_info(pos)						\
+do {							        	\
+	if (unlikely(should_convert_next_node(pos))){			\
+		warning("edward-1006", "unprocessed chained data");	\
+		printk("d_cur = %d, d_next = %d, flow.len = %llu\n",	\
+		       item_convert_data(pos)->d_cur,			\
+		       item_convert_data(pos)->d_next,			\
+		       item_convert_data(pos)->flow.length);		\
+		printk("inode %llu, size = %llu, cluster %lu\n",	\
+		       (unsigned long long)get_inode_oid		\
+		       (item_convert_data(pos)->inode),			\
+		       i_size_read(item_convert_data(pos)->inode),	\
+		       convert_data(pos)->clust.index);			\
+	}								\
+} while (0)
+#else
+#define check_convert_info(pos)
+#endif /* REISER4_DEBUG */
+
+void free_convert_data(flush_pos_t * pos);
+/* used in extent.c */
+int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
+		     const coord_t * parent);
+int reiser4_scan_finished(flush_scan * scan);
+int reiser4_scanning_left(flush_scan * scan);
+int reiser4_scan_goto(flush_scan * scan, jnode * tonode);
+txn_atom *atom_locked_by_fq(flush_queue_t * fq);
+int reiser4_alloc_extent(flush_pos_t *flush_pos);
+squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
+			       reiser4_key *stop_key);
+extern int reiser4_init_fqs(void);
+extern void reiser4_done_fqs(void);
+
+#if REISER4_DEBUG
+
+extern void reiser4_check_fq(const txn_atom *atom);
+extern atomic_t flush_cnt;
+
+#define check_preceder(blk) \
+assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb()));
+extern void check_pos(flush_pos_t * pos);
+#else
+#define check_preceder(b) noop
+#define check_pos(pos) noop
+#endif
+
+/* __REISER4_FLUSH_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 90
+   LocalWords:  preceder
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/flush_queue.c linux-2.6.22/fs/reiser4/flush_queue.c
--- linux-2.6.22.orig/fs/reiser4/flush_queue.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/flush_queue.c	2007-07-29 00:25:34.864693371 +0400
@@ -0,0 +1,680 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "debug.h"
+#include "super.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "page_cache.h"
+#include "wander.h"
+#include "vfs_ops.h"
+#include "writeout.h"
+#include "flush.h"
+
+#include <linux/bio.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+
+/* A flush queue object is an accumulator for keeping jnodes prepared
+   by the jnode_flush() function for writing to disk. Those "queued" jnodes are
+   kept on the flush queue until memory pressure or atom commit asks
+   flush queues to write some or all from their jnodes. */
+
+/*
+   LOCKING:
+
+   fq->guard spin lock protects fq->atom pointer and nothing else.  fq->prepped
+   list protected by atom spin lock.  fq->prepped list uses the following
+   locking:
+
+   two ways to protect fq->prepped list for read-only list traversal:
+
+   1. atom spin-lock atom.
+   2. fq is IN_USE, atom->nr_running_queues increased.
+
+   and one for list modification:
+
+   1. atom is spin-locked and one condition is true: fq is IN_USE or
+      atom->nr_running_queues == 0.
+
+   The deadlock-safe order for flush queues and atoms is: first lock atom, then
+   lock flush queue, then lock jnode.
+*/
+
+#define fq_in_use(fq)          ((fq)->state & FQ_IN_USE)
+#define fq_ready(fq)           (!fq_in_use(fq))
+
+#define mark_fq_in_use(fq)     do { (fq)->state |= FQ_IN_USE;    } while (0)
+#define mark_fq_ready(fq)      do { (fq)->state &= ~FQ_IN_USE;   } while (0)
+
+/* get lock on atom from locked flush queue object */
+static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq)
+{
+	/* This code is similar to jnode_get_atom(), look at it for the
+	 * explanation. */
+	txn_atom *atom;
+
+	assert_spin_locked(&(fq->guard));
+
+	while (1) {
+		atom = fq->atom;
+		if (atom == NULL)
+			break;
+
+		if (spin_trylock_atom(atom))
+			break;
+
+		atomic_inc(&atom->refcount);
+		spin_unlock(&(fq->guard));
+		spin_lock_atom(atom);
+		spin_lock(&(fq->guard));
+
+		if (fq->atom == atom) {
+			atomic_dec(&atom->refcount);
+			break;
+		}
+
+		spin_unlock(&(fq->guard));
+		atom_dec_and_unlock(atom);
+		spin_lock(&(fq->guard));
+	}
+
+	return atom;
+}
+
+txn_atom *atom_locked_by_fq(flush_queue_t * fq)
+{
+	txn_atom *atom;
+
+	spin_lock(&(fq->guard));
+	atom = atom_locked_by_fq_nolock(fq);
+	spin_unlock(&(fq->guard));
+	return atom;
+}
+
+static void init_fq(flush_queue_t * fq)
+{
+	memset(fq, 0, sizeof *fq);
+
+	atomic_set(&fq->nr_submitted, 0);
+
+	INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
+
+	init_waitqueue_head(&fq->wait);
+	spin_lock_init(&fq->guard);
+}
+
+/* slab for flush queues */
+static struct kmem_cache *fq_slab;
+
+/**
+ * reiser4_init_fqs - create flush queue cache
+ *
+ * Initializes slab cache of flush queues. It is part of reiser4 module
+ * initialization.
+ */
+int reiser4_init_fqs(void)
+{
+	fq_slab = kmem_cache_create("fq",
+				    sizeof(flush_queue_t),
+				    0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (fq_slab == NULL)
+		return RETERR(-ENOMEM);
+	return 0;
+}
+
+/**
+ * reiser4_done_fqs - delete flush queue cache
+ *
+ * This is called on reiser4 module unloading or system shutdown.
+ */
+void reiser4_done_fqs(void)
+{
+	destroy_reiser4_cache(&fq_slab);
+}
+
+/* create new flush queue object */
+static flush_queue_t *create_fq(gfp_t gfp)
+{
+	flush_queue_t *fq;
+
+	fq = kmem_cache_alloc(fq_slab, gfp);
+	if (fq)
+		init_fq(fq);
+
+	return fq;
+}
+
+/* adjust atom's and flush queue's counters of queued nodes */
+static void count_enqueued_node(flush_queue_t * fq)
+{
+	ON_DEBUG(fq->atom->num_queued++);
+}
+
+static void count_dequeued_node(flush_queue_t * fq)
+{
+	assert("zam-993", fq->atom->num_queued > 0);
+	ON_DEBUG(fq->atom->num_queued--);
+}
+
+/* attach flush queue object to the atom */
+static void attach_fq(txn_atom *atom, flush_queue_t *fq)
+{
+	assert_spin_locked(&(atom->alock));
+	list_add(&fq->alink, &atom->flush_queues);
+	fq->atom = atom;
+	ON_DEBUG(atom->nr_flush_queues++);
+}
+
+static void detach_fq(flush_queue_t * fq)
+{
+	assert_spin_locked(&(fq->atom->alock));
+
+	spin_lock(&(fq->guard));
+	list_del_init(&fq->alink);
+	assert("vs-1456", fq->atom->nr_flush_queues > 0);
+	ON_DEBUG(fq->atom->nr_flush_queues--);
+	fq->atom = NULL;
+	spin_unlock(&(fq->guard));
+}
+
+/* destroy flush queue object */
+static void done_fq(flush_queue_t * fq)
+{
+	assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
+	assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
+
+	kmem_cache_free(fq_slab, fq);
+}
+
+/* */
+static void mark_jnode_queued(flush_queue_t * fq, jnode * node)
+{
+	JF_SET(node, JNODE_FLUSH_QUEUED);
+	count_enqueued_node(fq);
+}
+
+/* Putting jnode into the flush queue. Both atom and jnode should be
+   spin-locked. */
+void queue_jnode(flush_queue_t * fq, jnode * node)
+{
+	assert_spin_locked(&(node->guard));
+	assert("zam-713", node->atom != NULL);
+	assert_spin_locked(&(node->atom->alock));
+	assert("zam-716", fq->atom != NULL);
+	assert("zam-717", fq->atom == node->atom);
+	assert("zam-907", fq_in_use(fq));
+
+	assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
+	assert("zam-826", JF_ISSET(node, JNODE_RELOC));
+	assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
+	assert("vs-1481", NODE_LIST(node) != FQ_LIST);
+
+	mark_jnode_queued(fq, node);
+	list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
+
+	ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
+			     FQ_LIST, 1));
+}
+
+/* repeatable process for waiting io completion on a flush queue object */
+static int wait_io(flush_queue_t * fq, int *nr_io_errors)
+{
+	assert("zam-738", fq->atom != NULL);
+	assert_spin_locked(&(fq->atom->alock));
+	assert("zam-736", fq_in_use(fq));
+	assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
+
+	if (atomic_read(&fq->nr_submitted) != 0) {
+		struct super_block *super;
+
+		spin_unlock_atom(fq->atom);
+
+		assert("nikita-3013", reiser4_schedulable());
+
+		super = reiser4_get_current_sb();
+
+		/* FIXME: this is instead of blk_run_queues() */
+		blk_run_address_space(reiser4_get_super_fake(super)->i_mapping);
+
+		if (!(super->s_flags & MS_RDONLY))
+			wait_event(fq->wait, atomic_read(&fq->nr_submitted) == 0);
+
+		/* Ask the caller to re-acquire the locks and call this
+		   function again. Note: this technique is commonly used in
+		   the txnmgr code. */
+		return -E_REPEAT;
+	}
+
+	*nr_io_errors += atomic_read(&fq->nr_errors);
+	return 0;
+}
+
+/* wait on I/O completion, re-submit dirty nodes to write */
+static int finish_fq(flush_queue_t * fq, int *nr_io_errors)
+{
+	int ret;
+	txn_atom *atom = fq->atom;
+
+	assert("zam-801", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+	assert("zam-762", fq_in_use(fq));
+
+	ret = wait_io(fq, nr_io_errors);
+	if (ret)
+		return ret;
+
+	detach_fq(fq);
+	done_fq(fq);
+
+	reiser4_atom_send_event(atom);
+
+	return 0;
+}
+
+/* wait for all i/o for given atom to be completed, actually do one iteration
+   on that and return -E_REPEAT if there more iterations needed */
+static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
+{
+	flush_queue_t *fq;
+
+	assert_spin_locked(&(atom->alock));
+
+	if (list_empty_careful(&atom->flush_queues))
+		return 0;
+
+	list_for_each_entry(fq, &atom->flush_queues, alink) {
+		if (fq_ready(fq)) {
+			int ret;
+
+			mark_fq_in_use(fq);
+			assert("vs-1247", fq->owner == NULL);
+			ON_DEBUG(fq->owner = current);
+			ret = finish_fq(fq, nr_io_errors);
+
+			if (*nr_io_errors)
+				reiser4_handle_error();
+
+			if (ret) {
+				reiser4_fq_put(fq);
+				return ret;
+			}
+
+			spin_unlock_atom(atom);
+
+			return -E_REPEAT;
+		}
+	}
+
+	/* All flush queues are in use; atom remains locked */
+	return -EBUSY;
+}
+
+/* wait all i/o for current atom */
+int current_atom_finish_all_fq(void)
+{
+	txn_atom *atom;
+	int nr_io_errors = 0;
+	int ret = 0;
+
+	do {
+		while (1) {
+			atom = get_current_atom_locked();
+			ret = finish_all_fq(atom, &nr_io_errors);
+			if (ret != -EBUSY)
+				break;
+			reiser4_atom_wait_event(atom);
+		}
+	} while (ret == -E_REPEAT);
+
+	/* we do not need locked atom after this function finishes, SUCCESS or
+	   -EBUSY are two return codes when atom remains locked after
+	   finish_all_fq */
+	if (!ret)
+		spin_unlock_atom(atom);
+
+	assert_spin_not_locked(&(atom->alock));
+
+	if (ret)
+		return ret;
+
+	if (nr_io_errors)
+		return RETERR(-EIO);
+
+	return 0;
+}
+
+/* change node->atom field for all jnode from given list */
+static void
+scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
+{
+	jnode *cur;
+
+	list_for_each_entry(cur, list, capture_link) {
+		spin_lock_jnode(cur);
+		cur->atom = atom;
+		spin_unlock_jnode(cur);
+	}
+}
+
+/* support for atom fusion operation */
+void reiser4_fuse_fq(txn_atom *to, txn_atom *from)
+{
+	flush_queue_t *fq;
+
+	assert_spin_locked(&(to->alock));
+	assert_spin_locked(&(from->alock));
+
+	list_for_each_entry(fq, &from->flush_queues, alink) {
+		scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
+		spin_lock(&(fq->guard));
+		fq->atom = to;
+		spin_unlock(&(fq->guard));
+	}
+
+	list_splice_init(&from->flush_queues, to->flush_queues.prev);
+
+#if REISER4_DEBUG
+	to->num_queued += from->num_queued;
+	to->nr_flush_queues += from->nr_flush_queues;
+	from->nr_flush_queues = 0;
+#endif
+}
+
+#if REISER4_DEBUG
+int atom_fq_parts_are_clean(txn_atom * atom)
+{
+	assert("zam-915", atom != NULL);
+	return list_empty_careful(&atom->flush_queues);
+}
+#endif
+/* Bio i/o completion routine for reiser4 write operations. */
+static int
+end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
+	       int err)
+{
+	int i;
+	int nr_errors = 0;
+	flush_queue_t *fq;
+
+	assert("zam-958", bio->bi_rw & WRITE);
+
+	/* i/o op. is not fully completed */
+	if (bio->bi_size != 0)
+		return 1;
+
+	if (err == -EOPNOTSUPP)
+		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+
+	/* we expect that bio->private is set to NULL or fq object which is used
+	 * for synchronization and error counting. */
+	fq = bio->bi_private;
+	/* Check all elements of io_vec for correct write completion. */
+	for (i = 0; i < bio->bi_vcnt; i += 1) {
+		struct page *pg = bio->bi_io_vec[i].bv_page;
+
+		if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+			SetPageError(pg);
+			nr_errors++;
+		}
+
+		{
+			/* jnode WRITEBACK ("write is in progress bit") is
+			 * atomically cleared here. */
+			jnode *node;
+
+			assert("zam-736", pg != NULL);
+			assert("zam-736", PagePrivate(pg));
+			node = jprivate(pg);
+
+			JF_CLR(node, JNODE_WRITEBACK);
+		}
+
+		end_page_writeback(pg);
+		page_cache_release(pg);
+	}
+
+	if (fq) {
+		/* count i/o error in fq object */
+		atomic_add(nr_errors, &fq->nr_errors);
+
+		/* If all write requests registered in this "fq" are done we up
+		 * the waiter. */
+		if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
+			wake_up(&fq->wait);
+	}
+
+	bio_put(bio);
+	return 0;
+}
+
+/* Count I/O requests which will be submitted by @bio in given flush queues
+   @fq */
+void add_fq_to_bio(flush_queue_t * fq, struct bio *bio)
+{
+	bio->bi_private = fq;
+	bio->bi_end_io = end_io_handler;
+
+	if (fq)
+		atomic_add(bio->bi_vcnt, &fq->nr_submitted);
+}
+
+/* Move all queued nodes out from @fq->prepped list. */
+static void release_prepped_list(flush_queue_t * fq)
+{
+	txn_atom *atom;
+
+	assert("zam-904", fq_in_use(fq));
+	atom = atom_locked_by_fq(fq);
+
+	while (!list_empty(ATOM_FQ_LIST(fq))) {
+		jnode *cur;
+
+		cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
+		list_del_init(&cur->capture_link);
+
+		count_dequeued_node(fq);
+		spin_lock_jnode(cur);
+		assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
+		assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
+		assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
+		JF_CLR(cur, JNODE_FLUSH_QUEUED);
+
+		if (JF_ISSET(cur, JNODE_DIRTY)) {
+			list_add_tail(&cur->capture_link,
+				      ATOM_DIRTY_LIST(atom, jnode_get_level(cur)));
+			ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
+					     DIRTY_LIST, 1));
+		} else {
+			list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom));
+			ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
+					     CLEAN_LIST, 1));
+		}
+
+		spin_unlock_jnode(cur);
+	}
+
+	if (--atom->nr_running_queues == 0)
+		reiser4_atom_send_event(atom);
+
+	spin_unlock_atom(atom);
+}
+
+/* Submit write requests for nodes on the already filled flush queue @fq.
+
+   @fq: flush queue object which contains jnodes we can (and will) write.
+   @return: number of submitted blocks (>=0) if success, otherwise -- an error
+            code (<0). */
+int reiser4_write_fq(flush_queue_t * fq, long *nr_submitted, int flags)
+{
+	int ret;
+	txn_atom *atom;
+
+	while (1) {
+		atom = atom_locked_by_fq(fq);
+		assert("zam-924", atom);
+		/* do not write fq in parallel. */
+		if (atom->nr_running_queues == 0
+		    || !(flags & WRITEOUT_SINGLE_STREAM))
+			break;
+		reiser4_atom_wait_event(atom);
+	}
+
+	atom->nr_running_queues++;
+	spin_unlock_atom(atom);
+
+	ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
+	release_prepped_list(fq);
+
+	return ret;
+}
+
+/* Getting flush queue object for exclusive use by one thread. May require
+   several iterations which is indicated by -E_REPEAT return code.
+
+   This function does not contain code for obtaining an atom lock because an
+   atom lock is obtained by different ways in different parts of reiser4,
+   usually it is current atom, but we need a possibility for getting fq for the
+   atom of given jnode. */
+static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
+{
+	flush_queue_t *fq;
+
+	assert_spin_locked(&(atom->alock));
+
+	fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
+	while (&atom->flush_queues != &fq->alink) {
+		spin_lock(&(fq->guard));
+
+		if (fq_ready(fq)) {
+			mark_fq_in_use(fq);
+			assert("vs-1246", fq->owner == NULL);
+			ON_DEBUG(fq->owner = current);
+			spin_unlock(&(fq->guard));
+
+			if (*new_fq)
+				done_fq(*new_fq);
+
+			*new_fq = fq;
+
+			return 0;
+		}
+
+		spin_unlock(&(fq->guard));
+
+		fq = list_entry(fq->alink.next, flush_queue_t, alink);
+	}
+
+	/* Use previously allocated fq object */
+	if (*new_fq) {
+		mark_fq_in_use(*new_fq);
+		assert("vs-1248", (*new_fq)->owner == 0);
+		ON_DEBUG((*new_fq)->owner = current);
+		attach_fq(atom, *new_fq);
+
+		return 0;
+	}
+
+	spin_unlock_atom(atom);
+
+	*new_fq = create_fq(gfp);
+
+	if (*new_fq == NULL)
+		return RETERR(-ENOMEM);
+
+	return RETERR(-E_REPEAT);
+}
+
+int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq)
+{
+	return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get());
+}
+
+/* A wrapper around reiser4_fq_by_atom for getting a flush queue
+   object for current atom, if success fq->atom remains locked. */
+flush_queue_t *get_fq_for_current_atom(void)
+{
+	flush_queue_t *fq = NULL;
+	txn_atom *atom;
+	int ret;
+
+	do {
+		atom = get_current_atom_locked();
+		ret = reiser4_fq_by_atom(atom, &fq);
+	} while (ret == -E_REPEAT);
+
+	if (ret)
+		return ERR_PTR(ret);
+	return fq;
+}
+
+/* Releasing flush queue object after exclusive use */
+void reiser4_fq_put_nolock(flush_queue_t *fq)
+{
+	assert("zam-747", fq->atom != NULL);
+	assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
+	mark_fq_ready(fq);
+	assert("vs-1245", fq->owner == current);
+	ON_DEBUG(fq->owner = NULL);
+}
+
+void reiser4_fq_put(flush_queue_t * fq)
+{
+	txn_atom *atom;
+
+	spin_lock(&(fq->guard));
+	atom = atom_locked_by_fq_nolock(fq);
+
+	assert("zam-746", atom != NULL);
+
+	reiser4_fq_put_nolock(fq);
+	reiser4_atom_send_event(atom);
+
+	spin_unlock(&(fq->guard));
+	spin_unlock_atom(atom);
+}
+
+/* A part of atom object initialization related to the embedded flush queue
+   list head */
+
+void init_atom_fq_parts(txn_atom *atom)
+{
+	INIT_LIST_HEAD(&atom->flush_queues);
+}
+
+#if REISER4_DEBUG
+
+void reiser4_check_fq(const txn_atom *atom)
+{
+	/* check number of nodes on all atom's flush queues */
+	flush_queue_t *fq;
+	int count;
+	struct list_head *pos;
+
+	count = 0;
+	list_for_each_entry(fq, &atom->flush_queues, alink) {
+		spin_lock(&(fq->guard));
+		/* calculate number of jnodes on fq' list of prepped jnodes */
+		list_for_each(pos, ATOM_FQ_LIST(fq))
+			count++;
+		spin_unlock(&(fq->guard));
+	}
+	if (count != atom->fq)
+		warning("", "fq counter %d, real %d\n", atom->fq, count);
+
+}
+
+#endif
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN linux-2.6.22.orig/fs/reiser4/forward.h linux-2.6.22/fs/reiser4/forward.h
--- linux-2.6.22.orig/fs/reiser4/forward.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/forward.h	2007-07-29 00:25:34.864693371 +0400
@@ -0,0 +1,252 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Forward declarations. Thank you Kernighan. */
+
+#if !defined( __REISER4_FORWARD_H__ )
+#define __REISER4_FORWARD_H__
+
+#include <asm/errno.h>
+#include <linux/types.h>
+
+typedef struct zlock zlock;
+typedef struct lock_stack lock_stack;
+typedef struct lock_handle lock_handle;
+typedef struct znode znode;
+typedef struct flow flow_t;
+typedef struct coord coord_t;
+typedef struct tree_access_pointer tap_t;
+typedef struct reiser4_object_create_data reiser4_object_create_data;
+typedef union reiser4_plugin reiser4_plugin;
+typedef __u16 reiser4_plugin_id;
+typedef __u64 reiser4_plugin_groups;
+typedef struct item_plugin item_plugin;
+typedef struct jnode_plugin jnode_plugin;
+typedef struct reiser4_item_data reiser4_item_data;
+typedef union reiser4_key reiser4_key;
+typedef struct reiser4_tree reiser4_tree;
+typedef struct carry_cut_data carry_cut_data;
+typedef struct carry_kill_data carry_kill_data;
+typedef struct carry_tree_op carry_tree_op;
+typedef struct carry_tree_node carry_tree_node;
+typedef struct carry_plugin_info carry_plugin_info;
+typedef struct reiser4_journal reiser4_journal;
+typedef struct txn_atom txn_atom;
+typedef struct txn_handle txn_handle;
+typedef struct txn_mgr txn_mgr;
+typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
+typedef struct reiser4_context reiser4_context;
+typedef struct carry_level carry_level;
+typedef struct blocknr_set_entry blocknr_set_entry;
+/* super_block->s_fs_info points to this */
+typedef struct reiser4_super_info_data reiser4_super_info_data;
+/* next two objects are fields of reiser4_super_info_data */
+typedef struct reiser4_oid_allocator reiser4_oid_allocator;
+typedef struct reiser4_space_allocator reiser4_space_allocator;
+
+typedef struct flush_scan flush_scan;
+typedef struct flush_position flush_pos_t;
+
+typedef unsigned short pos_in_node_t;
+#define MAX_POS_IN_NODE 65535
+
+typedef struct jnode jnode;
+typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
+
+typedef struct uf_coord uf_coord_t;
+typedef struct hint hint_t;
+
+typedef struct ktxnmgrd_context ktxnmgrd_context;
+
+struct inode;
+struct page;
+struct file;
+struct dentry;
+struct super_block;
+
+/* return values of coord_by_key(). cbk == coord_by_key */
+typedef enum {
+	CBK_COORD_FOUND = 0,
+	CBK_COORD_NOTFOUND = -ENOENT,
+} lookup_result;
+
+/* results of lookup with directory file */
+typedef enum {
+	FILE_NAME_FOUND = 0,
+	FILE_NAME_NOTFOUND = -ENOENT,
+	FILE_IO_ERROR = -EIO,	/* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
+	FILE_OOM = -ENOMEM	/* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
+} file_lookup_result;
+
+/* behaviors of lookup. If coord we are looking for is actually in a tree,
+    both coincide. */
+typedef enum {
+	/* search exactly for the coord with key given */
+	FIND_EXACT,
+	/* search for coord with the maximal key not greater than one
+	   given */
+	FIND_MAX_NOT_MORE_THAN	/*LEFT_SLANT_BIAS */
+} lookup_bias;
+
+typedef enum {
+	/* number of leaf level of the tree
+	   The fake root has (tree_level=0). */
+	LEAF_LEVEL = 1,
+
+	/* number of level one above leaf level of the tree.
+
+	   It is supposed that internal tree used by reiser4 to store file
+	   system data and meta data will have height 2 initially (when
+	   created by mkfs).
+	 */
+	TWIG_LEVEL = 2,
+} tree_level;
+
+/* The "real" maximum ztree height is the 0-origin size of any per-level
+   array, since the zero'th level is not used. */
+#define REAL_MAX_ZTREE_HEIGHT     (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
+
+/* enumeration of possible mutual position of item and coord.  This enum is
+    return type of ->is_in_item() item plugin method which see. */
+typedef enum {
+	/* coord is on the left of an item */
+	IP_ON_THE_LEFT,
+	/* coord is inside item */
+	IP_INSIDE,
+	/* coord is inside item, but to the right of the rightmost unit of
+	   this item */
+	IP_RIGHT_EDGE,
+	/* coord is on the right of an item */
+	IP_ON_THE_RIGHT
+} interposition;
+
+/* type of lock to acquire on znode before returning it to caller */
+typedef enum {
+	ZNODE_NO_LOCK = 0,
+	ZNODE_READ_LOCK = 1,
+	ZNODE_WRITE_LOCK = 2,
+} znode_lock_mode;
+
+/* type of lock request */
+typedef enum {
+	ZNODE_LOCK_LOPRI = 0,
+	ZNODE_LOCK_HIPRI = (1 << 0),
+
+	/* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep
+	   waiting for the lock to become available.  If the lock is unavailable, reiser4_znode_lock will immediately
+	   return the value -E_REPEAT. */
+	ZNODE_LOCK_NONBLOCK = (1 << 1),
+	/* An option for longterm_lock_znode which prevents atom fusion */
+	ZNODE_LOCK_DONT_FUSE = (1 << 2)
+} znode_lock_request;
+
+typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
+
+/* used to specify direction of shift. These must be -1 and 1 */
+typedef enum {
+	SHIFT_LEFT = 1,
+	SHIFT_RIGHT = -1
+} shift_direction;
+
+typedef enum {
+	LEFT_SIDE,
+	RIGHT_SIDE
+} sideof;
+
+#define round_up( value, order )						\
+	( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) &	\
+			     ~( ( order ) - 1 ) ) )
+
+/* values returned by squalloc_right_neighbor and its auxiliary functions */
+typedef enum {
+	/* unit of internal item is moved */
+	SUBTREE_MOVED = 0,
+	/* nothing else can be squeezed into left neighbor */
+	SQUEEZE_TARGET_FULL = 1,
+	/* all content of node is squeezed into its left neighbor */
+	SQUEEZE_SOURCE_EMPTY = 2,
+	/* one more item is copied (this is only returned by
+	   allocate_and_copy_extent to squalloc_twig)) */
+	SQUEEZE_CONTINUE = 3
+} squeeze_result;
+
+/* Do not change items ids. If you do - there will be format change */
+typedef enum {
+	STATIC_STAT_DATA_ID = 0x0,
+	SIMPLE_DIR_ENTRY_ID = 0x1,
+	COMPOUND_DIR_ID = 0x2,
+	NODE_POINTER_ID = 0x3,
+	EXTENT_POINTER_ID = 0x5,
+	FORMATTING_ID = 0x6,
+	CTAIL_ID = 0x7,
+	BLACK_BOX_ID = 0x8,
+	LAST_ITEM_ID = 0x9
+} item_id;
+
+/* Flags passed to jnode_flush() to allow it to distinguish default settings based on
+   whether commit() was called or VM memory pressure was applied. */
+typedef enum {
+	/* submit flush queue to disk at jnode_flush completion */
+	JNODE_FLUSH_WRITE_BLOCKS = 1,
+
+	/* flush is called for commit */
+	JNODE_FLUSH_COMMIT = 2,
+	/* not implemented */
+	JNODE_FLUSH_MEMORY_FORMATTED = 4,
+
+	/* not implemented */
+	JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
+} jnode_flush_flags;
+
+/* Flags to insert/paste carry operations. Currently they only used in
+   flushing code, but in future, they can be used to optimize for repetitive
+   accesses.  */
+typedef enum {
+	/* carry is not allowed to shift data to the left when trying to find
+	   free space  */
+	COPI_DONT_SHIFT_LEFT = (1 << 0),
+	/* carry is not allowed to shift data to the right when trying to find
+	   free space  */
+	COPI_DONT_SHIFT_RIGHT = (1 << 1),
+	/* carry is not allowed to allocate new node(s) when trying to find
+	   free space */
+	COPI_DONT_ALLOCATE = (1 << 2),
+	/* try to load left neighbor if its not in a cache */
+	COPI_LOAD_LEFT = (1 << 3),
+	/* try to load right neighbor if its not in a cache */
+	COPI_LOAD_RIGHT = (1 << 4),
+	/* shift insertion point to the left neighbor */
+	COPI_GO_LEFT = (1 << 5),
+	/* shift insertion point to the right neighbor */
+	COPI_GO_RIGHT = (1 << 6),
+	/* try to step back into original node if insertion into new node
+	   fails after shifting data there. */
+	COPI_STEP_BACK = (1 << 7)
+} cop_insert_flag;
+
+typedef enum {
+	SAFE_UNLINK,		/* safe-link for unlink */
+	SAFE_TRUNCATE		/* safe-link for truncate */
+} reiser4_safe_link_t;
+
+/* this is to show on which list of atom jnode is */
+typedef enum {
+	NOT_CAPTURED,
+	DIRTY_LIST,
+	CLEAN_LIST,
+	FQ_LIST,
+	WB_LIST,
+	OVRWR_LIST
+} atom_list;
+
+/* __REISER4_FORWARD_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -urN linux-2.6.22.orig/fs/reiser4/fsdata.c linux-2.6.22/fs/reiser4/fsdata.c
--- linux-2.6.22.orig/fs/reiser4/fsdata.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/fsdata.c	2007-07-29 00:25:34.868694406 +0400
@@ -0,0 +1,808 @@
+/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+#include "fsdata.h"
+#include "inode.h"
+
+
+/* cache or dir_cursors */
+static struct kmem_cache *d_cursor_cache;
+static struct shrinker *d_cursor_shrinker;
+
+/* list of unused cursors */
+static LIST_HEAD(cursor_cache);
+
+/* number of cursors in list of ununsed cursors */
+static unsigned long d_cursor_unused = 0;
+
+/* spinlock protecting manipulations with dir_cursor's hash table and lists */
+DEFINE_SPINLOCK(d_lock);
+
+static reiser4_file_fsdata *create_fsdata(struct file *file);
+static int file_is_stateless(struct file *file);
+static void free_fsdata(reiser4_file_fsdata *fsdata);
+static void kill_cursor(dir_cursor *);
+
+/**
+ * d_cursor_shrink - shrink callback for cache of dir_cursor-s
+ * @nr: number of objects to free
+ * @mask: GFP mask
+ *
+ * Shrinks d_cursor_cache. Scan LRU list of unused cursors, freeing requested
+ * number. Return number of still freeable cursors.
+ */
+static int d_cursor_shrink(int nr, gfp_t mask)
+{
+	if (nr != 0) {
+		dir_cursor *scan;
+		int killed;
+
+		killed = 0;
+		spin_lock(&d_lock);
+		while (!list_empty(&cursor_cache)) {
+			scan = list_entry(cursor_cache.next, dir_cursor, alist);
+			assert("nikita-3567", scan->ref == 0);
+			kill_cursor(scan);
+			++killed;
+			--nr;
+			if (nr == 0)
+				break;
+		}
+		spin_unlock(&d_lock);
+	}
+	return d_cursor_unused;
+}
+
+/**
+ * reiser4_init_d_cursor - create d_cursor cache
+ *
+ * Initializes slab cache of d_cursors. It is part of reiser4 module
+ * initialization.
+ */
+int reiser4_init_d_cursor(void)
+{
+	d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
+					   SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (d_cursor_cache == NULL)
+		return RETERR(-ENOMEM);
+
+	/*
+	 * actually, d_cursors are "priceless", because there is no way to
+	 * recover information stored in them. On the other hand, we don't
+	 * want to consume all kernel memory by them. As a compromise, just
+	 * assign higher "seeks" value to d_cursor cache, so that it will be
+	 * shrunk only if system is really tight on memory.
+	 */
+	d_cursor_shrinker = set_shrinker(DEFAULT_SEEKS << 3,
+					 d_cursor_shrink);
+	if (d_cursor_shrinker == NULL) {
+		destroy_reiser4_cache(&d_cursor_cache);
+		d_cursor_cache = NULL;
+		return RETERR(-ENOMEM);
+	}
+	return 0;
+}
+
+/**
+ * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker
+ *
+ * This is called on reiser4 module unloading or system shutdown.
+ */
+void reiser4_done_d_cursor(void)
+{
+	BUG_ON(d_cursor_shrinker == NULL);
+	remove_shrinker(d_cursor_shrinker);
+	d_cursor_shrinker = NULL;
+
+	destroy_reiser4_cache(&d_cursor_cache);
+}
+
+#define D_CURSOR_TABLE_SIZE (256)
+
+static inline unsigned long
+d_cursor_hash(d_cursor_hash_table *table, const struct d_cursor_key *key)
+{
+	assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
+	return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
+}
+
+static inline int d_cursor_eq(const struct d_cursor_key *k1,
+			      const struct d_cursor_key *k2)
+{
+	return k1->cid == k2->cid && k1->oid == k2->oid;
+}
+
+/*
+ * define functions to manipulate reiser4 super block's hash table of
+ * dir_cursors
+ */
+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
+#define KFREE(ptr, size) kfree(ptr)
+TYPE_SAFE_HASH_DEFINE(d_cursor,
+		      dir_cursor,
+		      struct d_cursor_key,
+		      key, hash, d_cursor_hash, d_cursor_eq);
+#undef KFREE
+#undef KMALLOC
+
+/**
+ * reiser4_init_super_d_info - initialize per-super-block d_cursor resources
+ * @super: super block to initialize
+ *
+ * Initializes per-super-block d_cursor's hash table and radix tree. It is part
+ * of mount.
+ */
+int reiser4_init_super_d_info(struct super_block *super)
+{
+	struct d_cursor_info *p;
+
+	p = &get_super_private(super)->d_info;
+
+	INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get());
+	return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
+}
+
+/**
+ * reiser4_done_super_d_info - release per-super-block d_cursor resources
+ * @super: super block being umounted
+ *
+ * It is called on umount. Kills all directory cursors attached to suoer block.
+ */
+void reiser4_done_super_d_info(struct super_block *super)
+{
+	struct d_cursor_info *d_info;
+	dir_cursor *cursor, *next;
+
+	d_info = &get_super_private(super)->d_info;
+	for_all_in_htable(&d_info->table, d_cursor, cursor, next)
+		kill_cursor(cursor);
+
+	BUG_ON(d_info->tree.rnode != NULL);
+	d_cursor_hash_done(&d_info->table);
+}
+
+/**
+ * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
+ * @cursor: cursor to free
+ *
+ * Removes reiser4_file_fsdata attached to @cursor from readdir list of
+ * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from from
+ * indices, hash table, list of unused cursors and frees it.
+ */
+static void kill_cursor(dir_cursor *cursor)
+{
+	unsigned long index;
+
+	assert("nikita-3566", cursor->ref == 0);
+	assert("nikita-3572", cursor->fsdata != NULL);
+
+	index = (unsigned long)cursor->key.oid;
+	list_del_init(&cursor->fsdata->dir.linkage);
+	free_fsdata(cursor->fsdata);
+	cursor->fsdata = NULL;
+
+	if (list_empty_careful(&cursor->list))
+		/* this is last cursor for a file. Kill radix-tree entry */
+		radix_tree_delete(&cursor->info->tree, index);
+	else {
+		void **slot;
+
+		/*
+		 * there are other cursors for the same oid.
+		 */
+
+		/*
+		 * if radix tree point to the cursor being removed, re-target
+		 * radix tree slot to the next cursor in the (non-empty as was
+		 * checked above) element of the circular list of all cursors
+		 * for this oid.
+		 */
+		slot = radix_tree_lookup_slot(&cursor->info->tree, index);
+		assert("nikita-3571", *slot != NULL);
+		if (*slot == cursor)
+			*slot = list_entry(cursor->list.next, dir_cursor, list);
+		/* remove cursor from circular list */
+		list_del_init(&cursor->list);
+	}
+	/* remove cursor from the list of unused cursors */
+	list_del_init(&cursor->alist);
+	/* remove cursor from the hash table */
+	d_cursor_hash_remove(&cursor->info->table, cursor);
+	/* and free it */
+	kmem_cache_free(d_cursor_cache, cursor);
+	--d_cursor_unused;
+}
+
+/* possible actions that can be performed on all cursors for the given file */
+enum cursor_action {
+	/*
+	 * load all detached state: this is called when stat-data is loaded
+	 * from the disk to recover information about all pending readdirs
+	 */
+	CURSOR_LOAD,
+	/*
+	 * detach all state from inode, leaving it in the cache. This is called
+	 * when inode is removed form the memory by memory pressure
+	 */
+	CURSOR_DISPOSE,
+	/*
+	 * detach cursors from the inode, and free them. This is called when
+	 * inode is destroyed
+	 */
+	CURSOR_KILL
+};
+
+/*
+ * return d_cursor data for the file system @inode is in.
+ */
+static inline struct d_cursor_info *d_info(struct inode *inode)
+{
+	return &get_super_private(inode->i_sb)->d_info;
+}
+
+/*
+ * lookup d_cursor in the per-super-block radix tree.
+ */
+static inline dir_cursor *lookup(struct d_cursor_info * info,
+				 unsigned long index)
+{
+	return (dir_cursor *) radix_tree_lookup(&info->tree, index);
+}
+
+/*
+ * attach @cursor to the radix tree. There may be multiple cursors for the
+ * same oid, they are chained into circular list.
+ */
+static void bind_cursor(dir_cursor * cursor, unsigned long index)
+{
+	dir_cursor *head;
+
+	head = lookup(cursor->info, index);
+	if (head == NULL) {
+		/* this is the first cursor for this index */
+		INIT_LIST_HEAD(&cursor->list);
+		radix_tree_insert(&cursor->info->tree, index, cursor);
+	} else {
+		/* some cursor already exists. Chain ours */
+		list_add(&cursor->list, &head->list);
+	}
+}
+
+/*
+ * detach fsdata (if detachable) from file descriptor, and put cursor on the
+ * "unused" list. Called when file descriptor is not longer in active use.
+ */
+static void clean_fsdata(struct file *file)
+{
+	dir_cursor *cursor;
+	reiser4_file_fsdata *fsdata;
+
+	assert("nikita-3570", file_is_stateless(file));
+
+	fsdata = (reiser4_file_fsdata *) file->private_data;
+	if (fsdata != NULL) {
+		cursor = fsdata->cursor;
+		if (cursor != NULL) {
+			spin_lock(&d_lock);
+			--cursor->ref;
+			if (cursor->ref == 0) {
+				list_add_tail(&cursor->alist, &cursor_cache);
+				++d_cursor_unused;
+			}
+			spin_unlock(&d_lock);
+			file->private_data = NULL;
+		}
+	}
+}
+
+/*
+ * global counter used to generate "client ids". These ids are encoded into
+ * high bits of fpos.
+ */
+static __u32 cid_counter = 0;
+#define CID_SHIFT (20)
+#define CID_MASK  (0xfffffull)
+
+static void free_file_fsdata_nolock(struct file *);
+
+/**
+ * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
+ * @cursor:
+ * @file:
+ * @inode:
+ *
+ * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to
+ * reiser4 super block's hash table and radix tree.
+ add detachable readdir
+ * state to the @f
+ */
+static int insert_cursor(dir_cursor *cursor, struct file *file,
+			 struct inode *inode)
+{
+	int result;
+	reiser4_file_fsdata *fsdata;
+
+	memset(cursor, 0, sizeof *cursor);
+
+	/* this is either first call to readdir, or rewind. Anyway, create new
+	 * cursor. */
+	fsdata = create_fsdata(NULL);
+	if (fsdata != NULL) {
+		result = radix_tree_preload(reiser4_ctx_gfp_mask_get());
+		if (result == 0) {
+			struct d_cursor_info *info;
+			oid_t oid;
+
+			info = d_info(inode);
+			oid = get_inode_oid(inode);
+			/* cid occupies higher 12 bits of f->f_pos. Don't
+			 * allow it to become negative: this confuses
+			 * nfsd_readdir() */
+			cursor->key.cid = (++cid_counter) & 0x7ff;
+			cursor->key.oid = oid;
+			cursor->fsdata = fsdata;
+			cursor->info = info;
+			cursor->ref = 1;
+
+			spin_lock_inode(inode);
+			/* install cursor as @f's private_data, discarding old
+			 * one if necessary */
+#if REISER4_DEBUG
+			if (file->private_data)
+				warning("", "file has fsdata already");
+#endif
+			clean_fsdata(file);
+			free_file_fsdata_nolock(file);
+			file->private_data = fsdata;
+			fsdata->cursor = cursor;
+			spin_unlock_inode(inode);
+			spin_lock(&d_lock);
+			/* insert cursor into hash table */
+			d_cursor_hash_insert(&info->table, cursor);
+			/* and chain it into radix-tree */
+			bind_cursor(cursor, (unsigned long)oid);
+			spin_unlock(&d_lock);
+			radix_tree_preload_end();
+			file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
+		}
+	} else
+		result = RETERR(-ENOMEM);
+	return result;
+}
+
+/**
+ * process_cursors - do action on each cursor attached to inode
+ * @inode:
+ * @act: action to do
+ *
+ * Finds all cursors of @inode in reiser4's super block radix tree of cursors
+ * and performs action specified by @act on each of cursors.
+ */
+static void process_cursors(struct inode *inode, enum cursor_action act)
+{
+	oid_t oid;
+	dir_cursor *start;
+	struct list_head *head;
+	reiser4_context *ctx;
+	struct d_cursor_info *info;
+
+	/* this can be called by
+	 *
+	 * kswapd->...->prune_icache->..reiser4_destroy_inode
+	 *
+	 * without reiser4_context
+	 */
+	ctx = reiser4_init_context(inode->i_sb);
+	if (IS_ERR(ctx)) {
+		warning("vs-23", "failed to init context");
+		return;
+	}
+
+	assert("nikita-3558", inode != NULL);
+
+	info = d_info(inode);
+	oid = get_inode_oid(inode);
+	spin_lock_inode(inode);
+	head = get_readdir_list(inode);
+	spin_lock(&d_lock);
+	/* find any cursor for this oid: reference to it is hanging of radix
+	 * tree */
+	start = lookup(info, (unsigned long)oid);
+	if (start != NULL) {
+		dir_cursor *scan;
+		reiser4_file_fsdata *fsdata;
+
+		/* process circular list of cursors for this oid */
+		scan = start;
+		do {
+			dir_cursor *next;
+
+			next = list_entry(scan->list.next, dir_cursor, list);
+			fsdata = scan->fsdata;
+			assert("nikita-3557", fsdata != NULL);
+			if (scan->key.oid == oid) {
+				switch (act) {
+				case CURSOR_DISPOSE:
+					list_del_init(&fsdata->dir.linkage);
+					break;
+				case CURSOR_LOAD:
+					list_add(&fsdata->dir.linkage, head);
+					break;
+				case CURSOR_KILL:
+					kill_cursor(scan);
+					break;
+				}
+			}
+			if (scan == next)
+				/* last cursor was just killed */
+				break;
+			scan = next;
+		} while (scan != start);
+	}
+	spin_unlock(&d_lock);
+	/* check that we killed 'em all */
+	assert("nikita-3568",
+	       ergo(act == CURSOR_KILL,
+		    list_empty_careful(get_readdir_list(inode))));
+	assert("nikita-3569",
+	       ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
+	spin_unlock_inode(inode);
+	reiser4_exit_context(ctx);
+}
+
+/**
+ * reiser4_dispose_cursors - removes cursors from inode's list
+ * @inode: inode to dispose cursors of
+ *
+ * For each of cursors corresponding to @inode - removes reiser4_file_fsdata
+ * attached to cursor from inode's readdir list. This is called when inode is
+ * removed from the memory by memory pressure.
+ */
+void reiser4_dispose_cursors(struct inode *inode)
+{
+	process_cursors(inode, CURSOR_DISPOSE);
+}
+
+/**
+ * reiser4_load_cursors - attach cursors to inode
+ * @inode: inode to load cursors to
+ *
+ * For each of cursors corresponding to @inode - attaches reiser4_file_fsdata
+ * attached to cursor to inode's readdir list. This is done when inode is
+ * loaded into memory.
+ */
+void reiser4_load_cursors(struct inode *inode)
+{
+	process_cursors(inode, CURSOR_LOAD);
+}
+
+/**
+ * reiser4_kill_cursors - kill all inode cursors
+ * @inode: inode to kill cursors of
+ *
+ * Frees all cursors for this inode. This is called when inode is destroyed.
+ */
+void reiser4_kill_cursors(struct inode *inode)
+{
+	process_cursors(inode, CURSOR_KILL);
+}
+
+/**
+ * file_is_stateless -
+ * @file:
+ *
+ * true, if file descriptor @f is created by NFS server by "demand" to serve
+ * one file system operation. This means that there may be "detached state"
+ * for underlying inode.
+ */
+static int file_is_stateless(struct file *file)
+{
+	return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
+}
+
+/**
+ * reiser4_get_dir_fpos -
+ * @dir:
+ *
+ * Calculates ->fpos from user-supplied cookie. Normally it is dir->f_pos, but
+ * in the case of stateless directory operation (readdir-over-nfs), client id
+ * was encoded in the high bits of cookie and should me masked off.
+ */
+loff_t reiser4_get_dir_fpos(struct file *dir)
+{
+	if (file_is_stateless(dir))
+		return dir->f_pos & CID_MASK;
+	else
+		return dir->f_pos;
+}
+
+/**
+ * reiser4_attach_fsdata - try to attach fsdata
+ * @file:
+ * @inode:
+ *
+ * Finds or creates cursor for readdir-over-nfs.
+ */
+int reiser4_attach_fsdata(struct file *file, struct inode *inode)
+{
+	loff_t pos;
+	int result;
+	dir_cursor *cursor;
+
+	/*
+	 * we are serialized by inode->i_mutex
+	 */
+	if (!file_is_stateless(file))
+		return 0;
+
+	pos = file->f_pos;
+	result = 0;
+	if (pos == 0) {
+		/*
+		 * first call to readdir (or rewind to the beginning of
+		 * directory)
+		 */
+		cursor = kmem_cache_alloc(d_cursor_cache,
+					  reiser4_ctx_gfp_mask_get());
+		if (cursor != NULL)
+			result = insert_cursor(cursor, file, inode);
+		else
+			result = RETERR(-ENOMEM);
+	} else {
+		/* try to find existing cursor */
+		struct d_cursor_key key;
+
+		key.cid = pos >> CID_SHIFT;
+		key.oid = get_inode_oid(inode);
+		spin_lock(&d_lock);
+		cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
+		if (cursor != NULL) {
+			/* cursor was found */
+			if (cursor->ref == 0) {
+				/* move it from unused list */
+				list_del_init(&cursor->alist);
+				--d_cursor_unused;
+			}
+			++cursor->ref;
+		}
+		spin_unlock(&d_lock);
+		if (cursor != NULL) {
+			spin_lock_inode(inode);
+			assert("nikita-3556", cursor->fsdata->back == NULL);
+			clean_fsdata(file);
+			free_file_fsdata_nolock(file);
+			file->private_data = cursor->fsdata;
+			spin_unlock_inode(inode);
+		}
+	}
+	return result;
+}
+
+/**
+ * reiser4_detach_fsdata - ???
+ * @file:
+ *
+ * detach fsdata, if necessary
+ */
+void reiser4_detach_fsdata(struct file *file)
+{
+	struct inode *inode;
+
+	if (!file_is_stateless(file))
+		return;
+
+	inode = file->f_dentry->d_inode;
+	spin_lock_inode(inode);
+	clean_fsdata(file);
+	spin_unlock_inode(inode);
+}
+
+/* slab for reiser4_dentry_fsdata */
+static struct kmem_cache *dentry_fsdata_cache;
+
+/**
+ * reiser4_init_dentry_fsdata - create cache of dentry_fsdata
+ *
+ * Initializes slab cache of structures attached to denty->d_fsdata. It is
+ * part of reiser4 module initialization.
+ */
+int reiser4_init_dentry_fsdata(void)
+{
+	dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
+					   sizeof(struct reiser4_dentry_fsdata),
+					   0,
+					   SLAB_HWCACHE_ALIGN |
+					   SLAB_RECLAIM_ACCOUNT, NULL,
+					   NULL);
+	if (dentry_fsdata_cache == NULL)
+		return RETERR(-ENOMEM);
+	return 0;
+}
+
+/**
+ * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata
+ *
+ * This is called on reiser4 module unloading or system shutdown.
+ */
+void reiser4_done_dentry_fsdata(void)
+{
+	destroy_reiser4_cache(&dentry_fsdata_cache);
+}
+
+/**
+ * reiser4_get_dentry_fsdata - get fs-specific dentry data
+ * @dentry: queried dentry
+ *
+ * Allocates if necessary and returns per-dentry data that we attach to each
+ * dentry.
+ */
+struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
+{
+	assert("nikita-1365", dentry != NULL);
+
+	if (dentry->d_fsdata == NULL) {
+		dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
+						    reiser4_ctx_gfp_mask_get());
+		if (dentry->d_fsdata == NULL)
+			return ERR_PTR(RETERR(-ENOMEM));
+		memset(dentry->d_fsdata, 0,
+		       sizeof(struct reiser4_dentry_fsdata));
+	}
+	return dentry->d_fsdata;
+}
+
+/**
+ * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
+ * @dentry: dentry to free fsdata of
+ *
+ * Detaches and frees fs-specific dentry data
+ */
+void reiser4_free_dentry_fsdata(struct dentry *dentry)
+{
+	if (dentry->d_fsdata != NULL) {
+		kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
+		dentry->d_fsdata = NULL;
+	}
+}
+
+/* slab for reiser4_file_fsdata */
+static struct kmem_cache *file_fsdata_cache;
+
+/**
+ * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata
+ *
+ * Initializes slab cache of structures attached to file->private_data. It is
+ * part of reiser4 module initialization.
+ */
+int reiser4_init_file_fsdata(void)
+{
+	file_fsdata_cache = kmem_cache_create("file_fsdata",
+					      sizeof(reiser4_file_fsdata),
+					      0,
+					      SLAB_HWCACHE_ALIGN |
+					      SLAB_RECLAIM_ACCOUNT, NULL, NULL);
+	if (file_fsdata_cache == NULL)
+		return RETERR(-ENOMEM);
+	return 0;
+}
+
+/**
+ * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata
+ *
+ * This is called on reiser4 module unloading or system shutdown.
+ */
+void reiser4_done_file_fsdata(void)
+{
+	destroy_reiser4_cache(&file_fsdata_cache);
+}
+
+/**
+ * create_fsdata - allocate and initialize reiser4_file_fsdata
+ * @file: what to create file_fsdata for, may be NULL
+ *
+ * Allocates and initializes reiser4_file_fsdata structure.
+ */
+static reiser4_file_fsdata *create_fsdata(struct file *file)
+{
+	reiser4_file_fsdata *fsdata;
+
+	fsdata = kmem_cache_alloc(file_fsdata_cache,
+				  reiser4_ctx_gfp_mask_get());
+	if (fsdata != NULL) {
+		memset(fsdata, 0, sizeof *fsdata);
+		fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
+		fsdata->back = file;
+		INIT_LIST_HEAD(&fsdata->dir.linkage);
+	}
+	return fsdata;
+}
+
+/**
+ * free_fsdata - free reiser4_file_fsdata
+ * @fsdata: object to free
+ *
+ * Dual to create_fsdata(). Free reiser4_file_fsdata.
+ */
+static void free_fsdata(reiser4_file_fsdata *fsdata)
+{
+	BUG_ON(fsdata == NULL);
+	kmem_cache_free(file_fsdata_cache, fsdata);
+}
+
+/**
+ * reiser4_get_file_fsdata - get fs-specific file data
+ * @file: queried file
+ *
+ * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
+ * to @file.
+ */
+reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
+{
+	assert("nikita-1603", file != NULL);
+
+	if (file->private_data == NULL) {
+		reiser4_file_fsdata *fsdata;
+		struct inode *inode;
+
+		fsdata = create_fsdata(file);
+		if (fsdata == NULL)
+			return ERR_PTR(RETERR(-ENOMEM));
+
+		inode = file->f_dentry->d_inode;
+		spin_lock_inode(inode);
+		if (file->private_data == NULL) {
+			file->private_data = fsdata;
+			fsdata = NULL;
+		}
+		spin_unlock_inode(inode);
+		if (fsdata != NULL)
+			/* other thread initialized ->fsdata */
+			kmem_cache_free(file_fsdata_cache, fsdata);
+	}
+	assert("nikita-2665", file->private_data != NULL);
+	return file->private_data;
+}
+
+/**
+ * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
+ * @file:
+ *
+ * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from
+ * readdir list, frees if it is not linked to d_cursor object.
+ */
+static void free_file_fsdata_nolock(struct file *file)
+{
+	reiser4_file_fsdata *fsdata;
+
+	assert("", spin_inode_is_locked(file->f_dentry->d_inode));
+	fsdata = file->private_data;
+	if (fsdata != NULL) {
+		list_del_init(&fsdata->dir.linkage);
+		if (fsdata->cursor == NULL)
+			free_fsdata(fsdata);
+	}
+	file->private_data = NULL;
+}
+
+/**
+ * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
+ * @file:
+ *
+ * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
+ */
+void reiser4_free_file_fsdata(struct file *file)
+{
+	spin_lock_inode(file->f_dentry->d_inode);
+	free_file_fsdata_nolock(file);
+	spin_unlock_inode(file->f_dentry->d_inode);
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN linux-2.6.22.orig/fs/reiser4/fsdata.h linux-2.6.22/fs/reiser4/fsdata.h
--- linux-2.6.22.orig/fs/reiser4/fsdata.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/fsdata.h	2007-07-29 00:25:34.868694406 +0400
@@ -0,0 +1,205 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+#if !defined( __REISER4_FSDATA_H__ )
+#define __REISER4_FSDATA_H__
+
+#include "debug.h"
+#include "kassign.h"
+#include "seal.h"
+#include "type_safe_hash.h"
+#include "plugin/file/file.h"
+#include "readahead.h"
+
+/*
+ * comment about reiser4_dentry_fsdata
+ *
+ *
+ */
+
+/*
+ * locking: fields of per file descriptor readdir_pos and ->f_pos are
+ * protected by ->i_mutex on inode. Under this lock following invariant
+ * holds:
+ *
+ *     file descriptor is "looking" at the entry_no-th directory entry from
+ *     the beginning of directory. This entry has key dir_entry_key and is
+ *     pos-th entry with duplicate-key sequence.
+ *
+ */
+
+/* logical position within directory */
+struct dir_pos {
+	/* key of directory entry (actually, part of a key sufficient to
+	   identify directory entry)  */
+	de_id dir_entry_key;
+	/* ordinal number of directory entry among all entries with the same
+	   key. (Starting from 0.) */
+	unsigned pos;
+};
+
+struct readdir_pos {
+	/* f_pos corresponding to this readdir position */
+	__u64 fpos;
+	/* logical position within directory */
+	struct dir_pos position;
+	/* logical number of directory entry within
+	   directory  */
+	__u64 entry_no;
+};
+
+/*
+ * this is used to speed up lookups for directory entry: on initial call to
+ * ->lookup() seal and coord of directory entry (if found, that is) are stored
+ * in struct dentry and reused later to avoid tree traversals.
+ */
+struct de_location {
+	/* seal covering directory entry */
+	seal_t entry_seal;
+	/* coord of directory entry */
+	coord_t entry_coord;
+	/* ordinal number of directory entry among all entries with the same
+	   key. (Starting from 0.) */
+	int pos;
+};
+
+/**
+ * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
+ *
+ * This is allocated dynamically and released in d_op->d_release()
+ *
+ * Currently it only contains cached location (hint) of directory entry, but
+ * it is expected that other information will be accumulated here.
+ */
+struct reiser4_dentry_fsdata {
+	/*
+	 * here will go fields filled by ->lookup() to speedup next
+	 * create/unlink, like blocknr of znode with stat-data, or key of
+	 * stat-data.
+	 */
+	struct de_location dec;
+	int stateless;		/* created through reiser4_decode_fh, needs special
+				 * treatment in readdir. */
+};
+
+extern int reiser4_init_dentry_fsdata(void);
+extern void reiser4_done_dentry_fsdata(void);
+extern struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
+extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
+
+/**
+ * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
+ *
+ * This is allocated dynamically and released in inode->i_fop->release
+ */
+typedef struct reiser4_file_fsdata {
+	/*
+	 * pointer back to the struct file which this reiser4_file_fsdata is
+	 * part of
+	 */
+	struct file *back;
+	/* detached cursor for stateless readdir. */
+	struct dir_cursor *cursor;
+	/*
+	 * We need both directory and regular file parts here, because there
+	 * are file system objects that are files and directories.
+	 */
+	struct {
+		/*
+		 * position in directory. It is updated each time directory is
+		 * modified
+		 */
+		struct readdir_pos readdir;
+		/* head of this list is reiser4_inode->lists.readdir_list */
+		struct list_head linkage;
+	} dir;
+	/* hints to speed up operations with regular files: read and write. */
+	struct {
+		hint_t hint;
+	} reg;
+	struct reiser4_file_ra_state ra1;
+
+} reiser4_file_fsdata;
+
+extern int reiser4_init_file_fsdata(void);
+extern void reiser4_done_file_fsdata(void);
+extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
+extern void reiser4_free_file_fsdata(struct file *);
+
+/*
+ * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
+ * used to address problem reiser4 has with readdir accesses via NFS. See
+ * plugin/file_ops_readdir.c for more details.
+ */
+struct d_cursor_key{
+	__u16 cid;
+	__u64 oid;
+};
+
+/*
+ * define structures d_cursor_hash_table d_cursor_hash_link which are used to
+ * maintain hash table of dir_cursor-s in reiser4's super block
+ */
+typedef struct dir_cursor dir_cursor;
+TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
+
+struct dir_cursor {
+	int ref;
+	reiser4_file_fsdata *fsdata;
+
+	/* link to reiser4 super block hash table of cursors */
+	d_cursor_hash_link hash;
+
+	/*
+	 * this is to link cursors to reiser4 super block's radix tree of
+	 * cursors if there are more than one cursor of the same objectid
+	 */
+	struct list_head list;
+	struct d_cursor_key key;
+	struct d_cursor_info *info;
+	/* list of unused cursors */
+	struct list_head alist;
+};
+
+extern int reiser4_init_d_cursor(void);
+extern void reiser4_done_d_cursor(void);
+
+extern int reiser4_init_super_d_info(struct super_block *);
+extern void reiser4_done_super_d_info(struct super_block *);
+
+extern loff_t reiser4_get_dir_fpos(struct file *);
+extern int reiser4_attach_fsdata(struct file *, struct inode *);
+extern void reiser4_detach_fsdata(struct file *);
+
+/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
+   more details */
+void reiser4_dispose_cursors(struct inode *inode);
+void reiser4_load_cursors(struct inode *inode);
+void reiser4_kill_cursors(struct inode *inode);
+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
+			     int offset, int adj);
+
+/*
+ * this structure is embedded to reise4_super_info_data. It maintains d_cursors
+ * (detached readdir state). See plugin/file_ops_readdir.c for more details.
+ */
+struct d_cursor_info {
+	d_cursor_hash_table table;
+	struct radix_tree_root tree;
+};
+
+/* spinlock protecting readdir cursors */
+extern spinlock_t d_lock;
+
+/* __REISER4_FSDATA_H__ */
+#endif
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 120
+ * End:
+ */
diff -urN linux-2.6.22.orig/fs/reiser4/init_super.c linux-2.6.22/fs/reiser4/init_super.c
--- linux-2.6.22.orig/fs/reiser4/init_super.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.22/fs/reiser4/init_super.c	2007-07-29 00:25:34.868694406 +0400
@@ -0,0 +1,752 @@
+/* Copyright by Hans Reiser, 2003 */
+
+#include "super.h"
+#include "inode.h"
+#include "plugin/plugin_set.h"
+
+#include <linux/swap.h>
+
+/**
+ * init_fs_info - allocate reiser4 specific super block
+ * @super: super block of filesystem
+ *
+ * Allocates and initialize reiser4_super_info_data, attaches it to
+ * super->s_fs_info, initializes structures maintaining d_cursor-s.
+ */
+int reiser4_init_fs_info(struct super_block *super)
+{
+	reiser4_super_info_data *sbinfo;
+
+	sbinfo = kmalloc(sizeof(reiser4_super_info_data),
+			 reiser4_ctx_gfp_mask_get());
+	if (!sbinfo)
+		return RETERR(-ENOMEM);
+
+	super->s_fs_info = sbinfo;
+	super->s_op = NULL;
+	memset(sbinfo, 0, sizeof(*sbinfo));
+
+	ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
+	ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
+
+	mutex_init(&sbinfo->delete_mutex);
+	spin_lock_init(&(sbinfo->guard));
+
+	/*  initialize per-super-block d_cursor resources */
+	reiser4_init_super_d_info(super);
+
+	return 0;
+}
+
+/**
+ * reiser4_done_fs_info - free reiser4 specific super block
+ * @super: super block of filesystem
+ *
+ * Performs some sanity checks, releases structures maintaining d_cursor-s,
+ * frees reiser4_super_info_data.
+ */
+void reiser4_done_fs_info(struct super_block *super)
+{
+	assert("zam-990", super->s_fs_info != NULL);
+
+	/* release per-super-block d_cursor resources */
+	reiser4_done_super_d_info(super);
+
+	/* make sure that there are not jnodes already */
+	assert("", list_empty(&get_super_private(super)->all_jnodes));
+	assert("", get_current_context()->trans->atom == NULL);
+	reiser4_check_block_counters(super);
+	kfree(super->s_fs_info);
+	super->s_fs_info = NULL;
+}
+
+/* type of option parseable by parse_option() */
+typedef enum {
+	/* value of option is arbitrary string */
+	OPT_STRING,
+
+	/*
+	 * option specifies bit in a bitmask. When option is set - bit in
+	 * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
+	 * dont_load_bitmap, atomic_write.
+	 */
+	OPT_BIT,
+
+	/*
+	 * value of option should conform to sprintf() format. Examples are
+	 * tmgr.atom_max_size=N, tmgr.atom_max_age=N
+	 */
+	OPT_FORMAT,
+
+	/*
+	 * option can take one of predefined values. Example is onerror=panic or
+	 * onerror=remount-ro
+	 */
+	OPT_ONEOF,
+} opt_type_t;
+
+#if 0
+struct opt_bitmask_bit {
+	const char *bit_name;
+	int bit_nr;
+};
+#endif
+
+/* description of option parseable by parse_option() */
+struct opt_desc {
+	/* option name.
+
+	   parsed portion of string has a form "name=value".
+	 */
+	const char *name;
+	/* type of option */
+	opt_type_t type;
+	union {
+		/* where to store value of string option (type == OPT_STRING) */
+		char **string;
+		/* description of bits for bit option (type == OPT_BIT) */
+		struct {
+			int nr;
+			void *addr;
+		} bit;
+		/* description of format and targets for format option (type
+		   == OPT_FORMAT) */
+		struct {
+			const char *format;
+			int nr_args;
+			void *arg1;
+			void *arg2;
+			void *arg3;
+			void *arg4;
+		} f;
+		struct {
+			int *result;
+			const char *list[10];
+		} oneof;
+		struct {
+			void *addr;
+			int nr_bits;
+			//struct opt_bitmask_bit *bits;
+		} bitmask;
+	} u;
+};
+
+/**
+ * parse_option - parse one option
+ * @opt_strin: starting point of parsing
+ * @opt: option description
+ *
+ * foo=bar,
+ * ^   ^  ^
+ * |   |  +-- replaced to '\0'
+ * |   +-- val_start
+ * +-- opt_string
+ * Figures out option type and handles option correspondingly.
+ */
+static int parse_option(char *opt_string, struct opt_desc *opt)
+{
+	char *val_start;
+	int result;
+	const char *err_msg;
+
+	/* NOTE-NIKITA think about using lib/cmdline.c functions here. */
+
+	val_start = strchr(opt_string, '=');
+	if (val_start != NULL) {
+		*val_start = '\0';
+		++val_start;
+	}
+
+	err_msg = NULL;
+	result = 0;
+	switch (opt->type) {
+	case OPT_STRING:
+		if (val_start == NULL) {
+			err_msg = "String arg missing";
+			result = RETERR(-EINVAL);
+		} else
+			*opt->u.string = val_start;
+		break;
+	case OPT_BIT:
+		if (val_start != NULL)
+			err_msg = "Value ignored";
+		else
+			set_bit(opt->u.bit.nr, opt->u.bit.addr);
+		break;
+	case OPT_FORMAT:
+		if (val_start == NULL) {
+			err_msg = "Formatted arg missing";
+			result = RETERR(-EINVAL);
+			break;
+		}
+		if (sscanf(val_start, opt->u.f.format,
+			   opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
+			   opt->u.f.arg4) != opt->u.f.nr_args) {
+			err_msg = "Wrong conversion";
+			result = RETERR(-EINVAL);
+		}
+		break;
+	case OPT_ONEOF:
+		{
+			int i = 0;
+
+			if (val_start == NULL) {
+				err_msg = "Value is missing";
+				result = RETERR(-EINVAL);
+				break;
+			}
+			err_msg = "Wrong option value";
+			result = RETERR(-EINVAL);
+			while (opt->u.oneof.list[i]) {
+				if (!strcmp(opt->u.oneof.list[i], val_start)) {
+					result = 0;
+					err_msg = NULL;
+					*opt->u.oneof.result = i;
+					break;
+				}
+				i++;
+			}
+			break;
+		}
+	default:
+		wrong_return_value("nikita-2100", "opt -> type");
+		break;
+	}
+	if (err_msg != NULL) {
+		warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
+			err_msg, opt->name, val_start ? "=" : "",
+			val_start ? : "");
+	}
+	return result;
+}
+
+/**
+ * parse_options - parse reiser4 mount options
+ * @opt_string: starting point
+ * @opts: array of option description
+ * @nr_opts: number of elements in @opts
+ *
+ * Parses comma separated list of reiser4 mount options.
+ */
+static int parse_options(char *opt_string, struct opt_desc *opts, int nr_opts)
+{
+	int result;
+
+	result = 0;
+	while ((result == 0) && opt_string && *opt_string) {
+		int j;
+		char *next;
+
+		next = strchr(opt_string, ',');
+		if (next != NULL) {
+			*next = '\0';
+			++next;
+		}
+		for (j = 0; j < nr_opts; ++j) {
+			if (!strncmp(opt_string, opts[j].name,
+				     strlen(opts[j].name))) {
+				result = parse_option(opt_string, &opts[j]);
+				break;
+			}
+		}
+		if (j == nr_opts) {
+			warning("nikita-2307", "Unrecognized option: \"%s\"",
+				opt_string);
+			/* traditionally, -EINVAL is returned on wrong mount
+			   option */
+			result = RETERR(-EINVAL);
+		}
+		opt_string = next;
+	}
+	return result;
+}
+
+#define NUM_OPT( label, fmt, addr )				\
+		{						\
+			.name = ( label ),			\
+			.type = OPT_FORMAT,			\
+			.u = {					\
+				.f = {				\
+					.format  = ( fmt ),	\
+					.nr_args = 1,		\
+					.arg1 = ( addr ),	\
+					.arg2 = NULL,		\
+					.arg3 = NULL,		\
+					.arg4 = NULL		\
+				}				\
+			}					\
+		}
+
+#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field )
+
+#define BIT_OPT(label, bitnr)					\
+	{							\
+		.name = label,					\
+		.type = OPT_BIT,				\
+		.u = {						\
+			.bit = {				\
+				.nr = bitnr,			\
+				.addr = &sbinfo->fs_flags	\
+			}					\
+		}						\
+	}
+
+#define MAX_NR_OPTIONS (30)
+
+/**
+ * reiser4_init_super_data - initialize reiser4 private super block
+ * @super: super block to initialize
+ * @opt_string: list of reiser4 mount options
+ *
+ * Sets various reiser4 parameters to default values. Parses mount options and
+ * overwrites default settings.
+ */
+int reiser4_init_super_data(struct super_block *super, char *opt_string)
+{
+	int result;
+	struct opt_desc *opts, *p;
+	reiser4_super_info_data *sbinfo = get_super_private(super);
+
+	/* initialize super, export, dentry operations */
+	sbinfo->ops.super = reiser4_super_operations;
+	sbinfo->ops.export = reiser4_export_operations;
+	sbinfo->ops.dentry = reiser4_dentry_operations;
+	super->s_op = &sbinfo->ops.super;
+	super->s_export_op = &sbinfo->ops.export;
+
+	/* initialize transaction manager parameters to default values */
+	sbinfo->tmgr.atom_max_size = totalram_pages / 4;
+	sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
+	sbinfo->tmgr.at