From c88ccea3143975294f5a52097546bcbb75975f52 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 Feb 2009 11:27:46 -0500 Subject: [PATCH 001/143] jbd2: Fix return value of jbd2_journal_start_commit() The function jbd2_journal_start_commit() returns 1 if either a transaction is committing or the function has queued a transaction commit. But it returns 0 if we raced with somebody queueing the transaction commit as well. This resulted in ext4_sync_fs() not functioning correctly (description from Arthur Jones): In the case of a data=ordered umount with pending long symlinks which are delayed due to a long list of other I/O on the backing block device, this causes the buffer associated with the long symlinks to not be moved to the inode dirty list in the second phase of fsync_super. Then, before they can be dirtied again, kjournald exits, seeing the UMOUNT flag and the dirty pages are never written to the backing block device, causing long symlink corruption and exposing new or previously freed block data to userspace. This can be reproduced with a script created by Eric Sandeen : #!/bin/bash umount /mnt/test2 mount /dev/sdb4 /mnt/test2 rm -f /mnt/test2/* dd if=/dev/zero of=/mnt/test2/bigfile bs=1M count=512 touch /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename ln -s /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename /mnt/test2/link umount /mnt/test2 mount /dev/sdb4 /mnt/test2 ls /mnt/test2/ This patch fixes jbd2_journal_start_commit() to always return 1 when there's a transaction committing or queued for commit. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" CC: Eric Sandeen CC: linux-ext4@vger.kernel.org --- fs/jbd2/journal.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index eb343008eded..58144102bf25 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -450,7 +450,7 @@ int __jbd2_log_space_left(journal_t *journal) } /* - * Called under j_state_lock. Returns true if a transaction was started. + * Called under j_state_lock. Returns true if a transaction commit was started. */ int __jbd2_log_start_commit(journal_t *journal, tid_t target) { @@ -518,7 +518,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal) /* * Start a commit of the current running transaction (if any). Returns true - * if a transaction was started, and fills its tid in at *ptid + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid */ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) { @@ -528,15 +529,19 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) if (journal->j_running_transaction) { tid_t tid = journal->j_running_transaction->t_tid; - ret = __jbd2_log_start_commit(journal, tid); - if (ret && ptid) + __jbd2_log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) *ptid = tid; - } else if (journal->j_committing_transaction && ptid) { + ret = 1; + } else if (journal->j_committing_transaction) { /* * If ext3_write_super() recently started a commit, then we * have to wait for completion of that transaction */ - *ptid = journal->j_committing_transaction->t_tid; + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; ret = 1; } spin_unlock(&journal->j_state_lock); From 9eddacf9e9c03578ef2c07c9534423e823d677f8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 Feb 2009 06:46:05 -0500 Subject: [PATCH 002/143] Revert "ext4: wait on all pending commits in ext4_sync_fs()" This undoes commit 14ce0cb411c88681ab8f3a4c9caa7f42e97a3184. Since jbd2_journal_start_commit() is now fixed to return 1 when we started a transaction commit, there's some transaction waiting to be committed or there's a transaction already committing, we don't need to call ext4_force_commit() in ext4_sync_fs(). Furthermore ext4_force_commit() can unnecessarily create sync transaction which is expensive so it's worthwhile to remove it when we can. http://bugzilla.kernel.org/show_bug.cgi?id=12224 Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Cc: Eric Sandeen Cc: linux-ext4@vger.kernel.org --- fs/ext4/super.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e5f06a5f045e..a5732c58f676 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3046,14 +3046,17 @@ static void ext4_write_super(struct super_block *sb) static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; + tid_t target; trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); sb->s_dirt = 0; if (EXT4_SB(sb)->s_journal) { - if (wait) - ret = ext4_force_commit(sb); - else - jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); + if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, + &target)) { + if (wait) + jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, + target); + } } else { ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait); } From 7f5aa215088b817add9c71914b83650bdd49f8a9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 Feb 2009 11:15:34 -0500 Subject: [PATCH 003/143] jbd2: Avoid possible NULL dereference in jbd2_journal_begin_ordered_truncate() If we race with commit code setting i_transaction to NULL, we could possibly dereference it. Proper locking requires the journal pointer (to access journal->j_list_lock), which we don't have. So we have to change the prototype of the function so that filesystem passes us the journal pointer. Also add a more detailed comment about why the function jbd2_journal_begin_ordered_truncate() does what it does and how it should be used. Thanks to Dan Carpenter for pointing to the suspitious code. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Acked-by: Joel Becker CC: linux-ext4@vger.kernel.org CC: ocfs2-devel@oss.oracle.com CC: mfasheh@suse.de CC: Dan Carpenter --- fs/ext4/inode.c | 6 ++++-- fs/jbd2/transaction.c | 42 +++++++++++++++++++++++++++++++----------- fs/ocfs2/journal.h | 6 ++++-- include/linux/jbd2.h | 3 ++- 4 files changed, 41 insertions(+), 16 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 03ba20be1329..658c4a7f2578 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,8 +47,10 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { - return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, - new_size); + return jbd2_journal_begin_ordered_truncate( + EXT4_SB(inode->i_sb)->s_journal, + &EXT4_I(inode)->jinode, + new_size); } static void ext4_invalidatepage(struct page *page, unsigned long offset); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 46b4e347ed7d..28ce21d8598e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2129,26 +2129,46 @@ done: } /* - * This function must be called when inode is journaled in ordered mode - * before truncation happens. It starts writeout of truncated part in - * case it is in the committing transaction so that we stand to ordered - * mode consistency guarantees. + * File truncate and transaction commit interact with each other in a + * non-trivial way. If a transaction writing data block A is + * committing, we cannot discard the data by truncate until we have + * written them. Otherwise if we crashed after the transaction with + * write has committed but before the transaction with truncate has + * committed, we could see stale data in block A. This function is a + * helper to solve this problem. It starts writeout of the truncated + * part in case it is in the committing transaction. + * + * Filesystem code must call this function when inode is journaled in + * ordered mode before truncation happens and after the inode has been + * placed on orphan list with the new inode size. The second condition + * avoids the race that someone writes new data and we start + * committing the transaction after this function has been called but + * before a transaction for truncate is started (and furthermore it + * allows us to optimize the case where the addition to orphan list + * happens in the same transaction as write --- we don't have to write + * any data in such case). */ -int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, +int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *jinode, loff_t new_size) { - journal_t *journal; - transaction_t *commit_trans; + transaction_t *inode_trans, *commit_trans; int ret = 0; - if (!inode->i_transaction && !inode->i_next_transaction) + /* This is a quick check to avoid locking if not necessary */ + if (!jinode->i_transaction) goto out; - journal = inode->i_transaction->t_journal; + /* Locks are here just to force reading of recent values, it is + * enough that the transaction was not committing before we started + * a transaction adding the inode to orphan list */ spin_lock(&journal->j_state_lock); commit_trans = journal->j_committing_transaction; spin_unlock(&journal->j_state_lock); - if (inode->i_transaction == commit_trans) { - ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, + spin_lock(&journal->j_list_lock); + inode_trans = jinode->i_transaction; + spin_unlock(&journal->j_list_lock); + if (inode_trans == commit_trans) { + ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, new_size, LLONG_MAX); if (ret) jbd2_journal_abort(journal, ret); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 3c3532e1307c..172850a9a12a 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -513,8 +513,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) static inline int ocfs2_begin_ordered_truncate(struct inode *inode, loff_t new_size) { - return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode, - new_size); + return jbd2_journal_begin_ordered_truncate( + OCFS2_SB(inode->i_sb)->journal->j_journal, + &OCFS2_I(inode)->ip_jinode, + new_size); } #endif /* OCFS2_JOURNAL_H */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b28b37eb11c6..4d248b3f1323 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1150,7 +1150,8 @@ extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); -extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size); +extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *inode, loff_t new_size); extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); From 7be2baaa0322c59ba888aa5260a8c130666acd41 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 10 Feb 2009 09:53:42 -0500 Subject: [PATCH 004/143] ext4: Fix to read empty directory blocks correctly in 64k The rec_len field in the directory entry is 16 bits, so there was a problem representing rec_len for filesystems with a 64k block size in the case where the directory entry takes the entire 64k block. Unfortunately, there were two schemes that were proposed; one where all zeros meant 65536 and one where all ones (65535) meant 65536. E2fsprogs used 0, whereas the kernel used 65535. Oops. Fortunately this case happens extremely rarely, with the most common case being the lost+found directory, created by mke2fs. So we will be liberal in what we accept, and accept both encodings, but we will continue to encode 65536 as 65535. This will require a change in e2fsprogs, but with fortunately ext4 filesystems normally have the dir_index feature enabled, which precludes having a completely empty directory block. Signed-off-by: Wei Yongjun Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index aafc9eba1c25..b0c87dce66a3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -868,7 +868,7 @@ static inline unsigned ext4_rec_len_from_disk(__le16 dlen) { unsigned len = le16_to_cpu(dlen); - if (len == EXT4_MAX_REC_LEN) + if (len == EXT4_MAX_REC_LEN || len == 0) return 1 << 16; return len; } From ba4439165f0f0d25b2fe065cf0c1ff8130b802eb Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 10 Feb 2009 11:14:34 -0500 Subject: [PATCH 005/143] ext4: Fix lockdep warning We should not call ext4_mb_add_n_trim while holding alloc_semp. ============================================= [ INFO: possible recursive locking detected ] 2.6.29-rc4-git1-dirty #124 --------------------------------------------- ffsb/3116 is trying to acquire lock: (&meta_group_info[i]->alloc_sem){----}, at: [] ext4_mb_load_buddy+0xd2/0x343 but task is already holding lock: (&meta_group_info[i]->alloc_sem){----}, at: [] ext4_mb_load_buddy+0xd2/0x343 http://bugzilla.kernel.org/show_bug.cgi?id=12672 Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index deba54f6cbed..c962d0690505 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4476,23 +4476,26 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); - /* - * We want to add the pa to the right bucket. - * Remove it from the list and while adding - * make sure the list to which we are adding - * doesn't grow big. - */ - if (likely(pa->pa_free)) { - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); - ext4_mb_add_n_trim(ac); - } } - ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->alloc_semp) up_read(ac->alloc_semp); + if (pa) { + /* + * We want to add the pa to the right bucket. + * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. We need to release + * alloc_semp before calling ext4_mb_add_n_trim() + */ + if (pa->pa_linear && likely(pa->pa_free)) { + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + ext4_mb_add_n_trim(ac); + } + ext4_mb_put_pa(ac, ac->ac_sb, pa); + } if (ac->ac_bitmap_page) page_cache_release(ac->ac_bitmap_page); if (ac->ac_buddy_page) From 9f339e7028e2855717af3193c938f9960ad13b38 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Wed, 11 Feb 2009 15:10:27 +0100 Subject: [PATCH 006/143] x86, ptrace, mm: fix double-free on race Ptrace_detach() races with __ptrace_unlink() if the traced task is reaped while detaching. This might cause a double-free of the BTS buffer. Change the ptrace_detach() path to only do the memory accounting in ptrace_bts_detach() and leave the buffer free to ptrace_bts_untrace() which will be called from __ptrace_unlink(). The fix follows a proposal from Oleg Nesterov. Reported-by: Oleg Nesterov Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 16 ++++++++++------ include/linux/mm.h | 1 + mm/mlock.c | 7 ++++++- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0a5df5f82fb9..5a4c23d89892 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -810,12 +810,16 @@ static void ptrace_bts_untrace(struct task_struct *child) static void ptrace_bts_detach(struct task_struct *child) { - if (unlikely(child->bts)) { - ds_release_bts(child->bts); - child->bts = NULL; - - ptrace_bts_free_buffer(child); - } + /* + * Ptrace_detach() races with ptrace_untrace() in case + * the child dies and is reaped by another thread. + * + * We only do the memory accounting at this point and + * leave the buffer deallocation and the bts tracer + * release to ptrace_bts_untrace() which will be called + * later on with tasklist_lock held. + */ + release_locked_buffer(child->bts_buffer, child->bts_size); } #else static inline void ptrace_bts_fork(struct task_struct *tsk) {} diff --git a/include/linux/mm.h b/include/linux/mm.h index e8ddc98b8405..3d7fb44d7d7e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1305,5 +1305,6 @@ void vmemmap_populate_print_last(void); extern void *alloc_locked_buffer(size_t size); extern void free_locked_buffer(void *buffer, size_t size); +extern void release_locked_buffer(void *buffer, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/mlock.c b/mm/mlock.c index 028ec482fdd4..2b57f7e60390 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -657,7 +657,7 @@ void *alloc_locked_buffer(size_t size) return buffer; } -void free_locked_buffer(void *buffer, size_t size) +void release_locked_buffer(void *buffer, size_t size) { unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; @@ -667,6 +667,11 @@ void free_locked_buffer(void *buffer, size_t size) current->mm->locked_vm -= pgsz; up_write(¤t->mm->mmap_sem); +} + +void free_locked_buffer(void *buffer, size_t size) +{ + release_locked_buffer(buffer, size); kfree(buffer); } From 2fff78c784ed97a8e5aa225ef5228f0a6d862d82 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Feb 2009 18:10:10 +0100 Subject: [PATCH 007/143] futex: fix reference leak Catalin noticed that (38d47c1b7075: futex: rely on get_user_pages() for shared futexes) caused an mm_struct leak. Some tracing with the function graph tracer quickly pointed out that futex_wait() has exit paths with unbalanced reference counts. This regression was discovered by kmemleak. Reported-by: Catalin Marinas Signed-off-by: Peter Zijlstra Tested-by: "Pallipadi, Venkatesh" Tested-by: Catalin Marinas Signed-off-by: Ingo Molnar --- kernel/futex.c | 51 ++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index f89d373a9c6d..438701adce23 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1165,6 +1165,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset, int clockrt) { struct task_struct *curr = current; + struct restart_block *restart; DECLARE_WAITQUEUE(wait, curr); struct futex_hash_bucket *hb; struct futex_q q; @@ -1216,11 +1217,13 @@ retry: if (!ret) goto retry; - return ret; + goto out; } ret = -EWOULDBLOCK; - if (uval != val) - goto out_unlock_put_key; + if (unlikely(uval != val)) { + queue_unlock(&q, hb); + goto out_put_key; + } /* Only actually queue if *uaddr contained val. */ queue_me(&q, hb); @@ -1284,38 +1287,38 @@ retry: */ /* If we were woken (and unqueued), we succeeded, whatever. */ + ret = 0; if (!unqueue_me(&q)) - return 0; + goto out_put_key; + ret = -ETIMEDOUT; if (rem) - return -ETIMEDOUT; + goto out_put_key; /* * We expect signal_pending(current), but another thread may * have handled it for us already. */ + ret = -ERESTARTSYS; if (!abs_time) - return -ERESTARTSYS; - else { - struct restart_block *restart; - restart = ¤t_thread_info()->restart_block; - restart->fn = futex_wait_restart; - restart->futex.uaddr = (u32 *)uaddr; - restart->futex.val = val; - restart->futex.time = abs_time->tv64; - restart->futex.bitset = bitset; - restart->futex.flags = 0; + goto out_put_key; - if (fshared) - restart->futex.flags |= FLAGS_SHARED; - if (clockrt) - restart->futex.flags |= FLAGS_CLOCKRT; - return -ERESTART_RESTARTBLOCK; - } + restart = ¤t_thread_info()->restart_block; + restart->fn = futex_wait_restart; + restart->futex.uaddr = (u32 *)uaddr; + restart->futex.val = val; + restart->futex.time = abs_time->tv64; + restart->futex.bitset = bitset; + restart->futex.flags = 0; -out_unlock_put_key: - queue_unlock(&q, hb); + if (fshared) + restart->futex.flags |= FLAGS_SHARED; + if (clockrt) + restart->futex.flags |= FLAGS_CLOCKRT; + + ret = -ERESTART_RESTARTBLOCK; + +out_put_key: put_futex_key(fshared, &q.key); - out: return ret; } From 4f06b0436b2ddbd3b67b10e77098a6862787b3eb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 11 Feb 2009 09:32:19 -0800 Subject: [PATCH 008/143] x86/cpa: make sure cpa is safe to call in lazy mmu mode Impact: fix race leading to crash under KVM and Xen The CPA code may be called while we're in lazy mmu update mode - for example, when using DEBUG_PAGE_ALLOC and doing a slab allocation in an interrupt handler which interrupted a lazy mmu update. In this case, the in-memory pagetable state may be out of date due to pending queued updates. We need to flush any pending updates before inspecting the page table. Similarly, we must explicitly flush any modifications CPA may have made (which comes down to flushing queued operations when flushing the TLB). Signed-off-by: Jeremy Fitzhardinge Acked-by: Marcelo Tosatti Cc: Stable Kernel Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 84ba74820ad6..f664bc1c4093 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -576,6 +576,13 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) else address = *cpa->vaddr; + /* + * If we're called with lazy mmu updates enabled, the + * in-memory pte state may be stale. Flush pending updates to + * bring them up to date. + */ + arch_flush_lazy_mmu_mode(); + repeat: kpte = lookup_address(address, &level); if (!kpte) @@ -854,6 +861,13 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, } else cpa_flush_all(cache); + /* + * If we've been called with lazy mmu updates enabled, then + * make sure that everything gets flushed out before we + * return. + */ + arch_flush_lazy_mmu_mode(); + out: return ret; } From be03d9e8022030c16abf534e33e185bfc3d40eef Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 11 Feb 2009 11:20:23 -0800 Subject: [PATCH 009/143] x86, pat: fix warn_on_once() while mapping 0-1MB range with /dev/mem Jeff Mahoney reported: > With Suse's hwinfo tool, on -tip: > WARNING: at arch/x86/mm/pat.c:637 reserve_pfn_range+0x5b/0x26d() reserve_pfn_range() is not tracking the memory range below 1MB as non-RAM and as such is inconsistent with similar checks in reserve_memtype() and free_memtype() Rename the pagerange_is_ram() to pat_pagerange_is_ram() and add the "track legacy 1MB region as non RAM" condition. And also, fix reserve_pfn_range() to return -EINVAL, when the pfn range is RAM. This is to be consistent with this API design. Reported-and-tested-by: Jeff Mahoney Signed-off-by: Suresh Siddha Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page.h | 1 - arch/x86/mm/ioremap.c | 19 --------- arch/x86/mm/pat.c | 83 ++++++++++++++++++++----------------- 3 files changed, 45 insertions(+), 58 deletions(-) diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index e9873a2e8695..776579119a00 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -57,7 +57,6 @@ typedef struct { pgdval_t pgd; } pgd_t; typedef struct { pgprotval_t pgprot; } pgprot_t; extern int page_is_ram(unsigned long pagenr); -extern int pagerange_is_ram(unsigned long start, unsigned long end); extern int devmem_is_allowed(unsigned long pagenr); extern void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index af750ab973b6..f45d5e29a72e 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -134,25 +134,6 @@ int page_is_ram(unsigned long pagenr) return 0; } -int pagerange_is_ram(unsigned long start, unsigned long end) -{ - int ram_page = 0, not_rampage = 0; - unsigned long page_nr; - - for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); - ++page_nr) { - if (page_is_ram(page_nr)) - ram_page = 1; - else - not_rampage = 1; - - if (ram_page == not_rampage) - return -1; - } - - return ram_page; -} - /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 7b61036427df..aebbf67a79d0 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -211,6 +211,33 @@ chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) static struct memtype *cached_entry; static u64 cached_start; +static int pat_pagerange_is_ram(unsigned long start, unsigned long end) +{ + int ram_page = 0, not_rampage = 0; + unsigned long page_nr; + + for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); + ++page_nr) { + /* + * For legacy reasons, physical address range in the legacy ISA + * region is tracked as non-RAM. This will allow users of + * /dev/mem to map portions of legacy ISA region, even when + * some of those portions are listed(or not even listed) with + * different e820 types(RAM/reserved/..) + */ + if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) && + page_is_ram(page_nr)) + ram_page = 1; + else + not_rampage = 1; + + if (ram_page == not_rampage) + return -1; + } + + return ram_page; +} + /* * For RAM pages, mark the pages as non WB memory type using * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or @@ -336,20 +363,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (new_type) *new_type = actual_type; - /* - * For legacy reasons, some parts of the physical address range in the - * legacy 1MB region is treated as non-RAM (even when listed as RAM in - * the e820 tables). So we will track the memory attributes of this - * legacy 1MB region using the linear memtype_list always. - */ - if (end >= ISA_END_ADDRESS) { - is_range_ram = pagerange_is_ram(start, end); - if (is_range_ram == 1) - return reserve_ram_pages_type(start, end, req_type, - new_type); - else if (is_range_ram < 0) - return -EINVAL; - } + is_range_ram = pat_pagerange_is_ram(start, end); + if (is_range_ram == 1) + return reserve_ram_pages_type(start, end, req_type, + new_type); + else if (is_range_ram < 0) + return -EINVAL; new = kmalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) @@ -446,19 +465,11 @@ int free_memtype(u64 start, u64 end) if (is_ISA_range(start, end - 1)) return 0; - /* - * For legacy reasons, some parts of the physical address range in the - * legacy 1MB region is treated as non-RAM (even when listed as RAM in - * the e820 tables). So we will track the memory attributes of this - * legacy 1MB region using the linear memtype_list always. - */ - if (end >= ISA_END_ADDRESS) { - is_range_ram = pagerange_is_ram(start, end); - if (is_range_ram == 1) - return free_ram_pages_type(start, end); - else if (is_range_ram < 0) - return -EINVAL; - } + is_range_ram = pat_pagerange_is_ram(start, end); + if (is_range_ram == 1) + return free_ram_pages_type(start, end); + else if (is_range_ram < 0) + return -EINVAL; spin_lock(&memtype_lock); list_for_each_entry(entry, &memtype_list, nd) { @@ -626,17 +637,13 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, unsigned long flags; unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); - is_ram = pagerange_is_ram(paddr, paddr + size); + is_ram = pat_pagerange_is_ram(paddr, paddr + size); - if (is_ram != 0) { - /* - * For mapping RAM pages, drivers need to call - * set_memory_[uc|wc|wb] directly, for reserve and free, before - * setting up the PTE. - */ - WARN_ON_ONCE(1); - return 0; - } + /* + * reserve_pfn_range() doesn't support RAM pages. + */ + if (is_ram != 0) + return -EINVAL; ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); if (ret) @@ -693,7 +700,7 @@ static void free_pfn_range(u64 paddr, unsigned long size) { int is_ram; - is_ram = pagerange_is_ram(paddr, paddr + size); + is_ram = pat_pagerange_is_ram(paddr, paddr + size); if (is_ram == 0) free_memtype(paddr, paddr + size); } From a0490fa35dc0022ef95f64802e8edf18c411c790 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Feb 2009 11:35:40 +0100 Subject: [PATCH 010/143] sched: cpu hotplug fix rq_attach_root() does a kfree() with the runqueue lock held. That's not a very wise move, fix it. Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index c1d0ed360088..410eec404133 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6944,20 +6944,26 @@ static void free_rootdomain(struct root_domain *rd) static void rq_attach_root(struct rq *rq, struct root_domain *rd) { + struct root_domain *old_rd = NULL; unsigned long flags; spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - struct root_domain *old_rd = rq->rd; + old_rd = rq->rd; if (cpumask_test_cpu(rq->cpu, old_rd->online)) set_rq_offline(rq); cpumask_clear_cpu(rq->cpu, old_rd->span); - if (atomic_dec_and_test(&old_rd->refcount)) - free_rootdomain(old_rd); + /* + * If we dont want to free the old_rt yet then + * set old_rd to NULL to skip the freeing later + * in this function: + */ + if (!atomic_dec_and_test(&old_rd->refcount)) + old_rd = NULL; } atomic_inc(&rd->refcount); @@ -6968,6 +6974,9 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); + + if (old_rd) + free_rootdomain(old_rd); } static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) From eb099670895f22970cd143875467c2768d6d87e5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 12 Feb 2009 09:27:38 -0500 Subject: [PATCH 011/143] Btrfs: make sure all pending extent operations are complete Theres a slight problem with finish_current_insert, if we set all to 1 and then go through and don't actually skip any of the extents on the pending list, we could exit right after we've added new extents. This is a problem because by inserting the new extents we could have gotten new COW's to happen and such, so we may have some pending updates to do or even more inserts to do after that. So this patch will only exit if we have never skipped any of the extents in the pending list, and we have no extents to insert, this will make sure that all of the pending work is truly done before we return. I've been running with this patch for a few days with all of my other testing and have not seen issues. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 71 +++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7527523c2d2d..376656f65b33 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1323,8 +1323,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, int btrfs_extent_post_op(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - finish_current_insert(trans, root->fs_info->extent_root, 1); - del_pending_extents(trans, root->fs_info->extent_root, 1); + u64 start; + u64 end; + int ret; + + while(1) { + finish_current_insert(trans, root->fs_info->extent_root, 1); + del_pending_extents(trans, root->fs_info->extent_root, 1); + + /* is there more work to do? */ + ret = find_first_extent_bit(&root->fs_info->pending_del, + 0, &start, &end, EXTENT_WRITEBACK); + if (!ret) + continue; + ret = find_first_extent_bit(&root->fs_info->extent_ins, + 0, &start, &end, EXTENT_WRITEBACK); + if (!ret) + continue; + break; + } return 0; } @@ -2211,13 +2228,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, u64 end; u64 priv; u64 search = 0; - u64 skipped = 0; struct btrfs_fs_info *info = extent_root->fs_info; struct btrfs_path *path; struct pending_extent_op *extent_op, *tmp; struct list_head insert_list, update_list; int ret; - int num_inserts = 0, max_inserts; + int num_inserts = 0, max_inserts, restart = 0; path = btrfs_alloc_path(); INIT_LIST_HEAD(&insert_list); @@ -2233,19 +2249,19 @@ again: ret = find_first_extent_bit(&info->extent_ins, search, &start, &end, EXTENT_WRITEBACK); if (ret) { - if (skipped && all && !num_inserts && + if (restart && !num_inserts && list_empty(&update_list)) { - skipped = 0; + restart = 0; search = 0; continue; } - mutex_unlock(&info->extent_ins_mutex); break; } ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); if (!ret) { - skipped = 1; + if (all) + restart = 1; search = end + 1; if (need_resched()) { mutex_unlock(&info->extent_ins_mutex); @@ -2264,7 +2280,7 @@ again: list_add_tail(&extent_op->list, &insert_list); search = end + 1; if (num_inserts == max_inserts) { - mutex_unlock(&info->extent_ins_mutex); + restart = 1; break; } } else if (extent_op->type == PENDING_BACKREF_UPDATE) { @@ -2280,7 +2296,6 @@ again: * somebody marked this thing for deletion then just unlock it and be * done, the free_extents will handle it */ - mutex_lock(&info->extent_ins_mutex); list_for_each_entry_safe(extent_op, tmp, &update_list, list) { clear_extent_bits(&info->extent_ins, extent_op->bytenr, extent_op->bytenr + extent_op->num_bytes - 1, @@ -2302,6 +2317,10 @@ again: if (!list_empty(&update_list)) { ret = update_backrefs(trans, extent_root, path, &update_list); BUG_ON(ret); + + /* we may have COW'ed new blocks, so lets start over */ + if (all) + restart = 1; } /* @@ -2309,9 +2328,9 @@ again: * need to make sure everything is cleaned then reset everything and * go back to the beginning */ - if (!num_inserts && all && skipped) { + if (!num_inserts && restart) { search = 0; - skipped = 0; + restart = 0; INIT_LIST_HEAD(&update_list); INIT_LIST_HEAD(&insert_list); goto again; @@ -2368,27 +2387,19 @@ again: BUG_ON(ret); /* - * if we broke out of the loop in order to insert stuff because we hit - * the maximum number of inserts at a time we can handle, then loop - * back and pick up where we left off + * if restart is set for whatever reason we need to go back and start + * searching through the pending list again. + * + * We just inserted some extents, which could have resulted in new + * blocks being allocated, which would result in new blocks needing + * updates, so if all is set we _must_ restart to get the updated + * blocks. */ - if (num_inserts == max_inserts) { - INIT_LIST_HEAD(&insert_list); - INIT_LIST_HEAD(&update_list); - num_inserts = 0; - goto again; - } - - /* - * again, if we need to make absolutely sure there are no more pending - * extent operations left and we know that we skipped some, go back to - * the beginning and do it all again - */ - if (all && skipped) { + if (restart || all) { INIT_LIST_HEAD(&insert_list); INIT_LIST_HEAD(&update_list); search = 0; - skipped = 0; + restart = 0; num_inserts = 0; goto again; } @@ -2709,6 +2720,8 @@ again: goto again; } + if (!err) + finish_current_insert(trans, extent_root, 0); return err; } From b288052e1779261ae80138074989ef50358c4e58 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Feb 2009 09:37:35 -0500 Subject: [PATCH 012/143] Btrfs: process mount options on mount -o remount, Btrfs wasn't parsing any new mount options during remount, making it difficult to set mount options on a root drive. Signed-off-by: Chris Mason --- fs/btrfs/super.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f3fd7e2cbc38..66b8341e2dba 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -511,6 +511,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) struct btrfs_root *root = btrfs_sb(sb); int ret; + ret = btrfs_parse_options(root, data); + if (ret) + return -EINVAL; + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; From 536ac8ae86e68bb5574d7cc81c7d229a86b82601 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Feb 2009 09:41:38 -0500 Subject: [PATCH 013/143] Btrfs: use larger metadata clusters in ssd mode Larger metadata clusters can significantly improve writeback performance on ssd drives with large erasure blocks. The larger clusters make it more likely a given IO will completely overwrite the ssd block, so it doesn't have to do an internal rwm cycle. On spinning media, lager metadata clusters end up spreading out the metadata more over time, which makes fsck slower, so we don't want this to be the default. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 376656f65b33..c59e12036e20 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2872,7 +2872,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (data & BTRFS_BLOCK_GROUP_METADATA) { last_ptr = &root->fs_info->last_alloc; - empty_cluster = 64 * 1024; + if (!btrfs_test_opt(root, SSD)) + empty_cluster = 64 * 1024; } if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) From e1df36d2f18254d0690a0fbe036cece74ec311b8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Feb 2009 09:45:08 -0500 Subject: [PATCH 014/143] Btrfs: don't clean old snapshots on sync(1) Cleaning old snapshots can make sync(1) somewhat slow, and some users and applications still use it in a global fsync kind of workload. This patch changes btrfs not to clean old snapshots during sync, which is safe from a FS consistency point of view. The major downside is that it makes it difficult to tell when old snapshots have been reaped and the space they were using has been reclaimed. A new ioctl will be added for this purpose instead. Signed-off-by: Chris Mason --- fs/btrfs/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 66b8341e2dba..19a4daf03ccb 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -379,7 +379,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_start_delalloc_inodes(root); btrfs_wait_ordered_extents(root, 0); - btrfs_clean_old_snapshots(root); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); sb->s_dirt = 0; From b335b0034e252e79ec2e9c6697f5d663c4627bec Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Thu, 12 Feb 2009 10:06:04 -0500 Subject: [PATCH 015/143] Btrfs: Avoid using __GFP_HIGHMEM with slab allocator btrfs_releasepage may call kmem_cache_alloc indirectly, and provide same GFP flags it gets to kmem_cache_alloc. So it's possible to use __GFP_HIGHMEM with the slab allocator. Signed-off-by: Yan Zheng --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8f0706210a47..638bcb5e49f6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4263,7 +4263,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) return 0; - return __btrfs_releasepage(page, gfp_flags); + return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); } static void btrfs_invalidatepage(struct page *page, unsigned long offset) From 7951f3cefbd711f4429a0cd014aa83a844c399a0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 12 Feb 2009 10:06:15 -0500 Subject: [PATCH 016/143] Btrfs: balance_level checks !child after access The BUG_ON() is in the wrong spot. Signed-off-by: Jeff Mahoney Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 35443cc4b9a9..6674692f7023 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -917,9 +917,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); + BUG_ON(!child); btrfs_tree_lock(child); btrfs_set_lock_blocking(child); - BUG_ON(!child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); BUG_ON(ret); From e00f7308658622fbd483cb0d9fe41165bf9050d0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 12 Feb 2009 14:11:25 -0500 Subject: [PATCH 017/143] Btrfs: remove btrfs_init_path btrfs_init_path was initially used when the path objects were on the stack. Now all the work is done by btrfs_alloc_path and btrfs_init_path isn't required. This patch removes it, and just uses kmem_cache_zalloc to zero out the object. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 11 ++--------- fs/btrfs/ctree.h | 1 - fs/btrfs/inode-map.c | 1 - fs/btrfs/inode.c | 2 -- 4 files changed, 2 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6674692f7023..c8f4c540cc2c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -38,19 +38,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans, static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -inline void btrfs_init_path(struct btrfs_path *p) -{ - memset(p, 0, sizeof(*p)); -} - struct btrfs_path *btrfs_alloc_path(void) { struct btrfs_path *path; - path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); - if (path) { - btrfs_init_path(path); + path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); + if (path) path->reada = 1; - } return path; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 531db112c8bd..3f7a8058df2b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1834,7 +1834,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); -void btrfs_init_path(struct btrfs_path *p); void btrfs_set_path_blocking(struct btrfs_path *p); void btrfs_clear_path_blocking(struct btrfs_path *p); void btrfs_unlock_up_safe(struct btrfs_path *p, int level); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 2aa79873eb46..cc7334d833c9 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, search_key.type = 0; search_key.offset = 0; - btrfs_init_path(path); start_found = 0; ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); if (ret < 0) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 638bcb5e49f6..3cee77ae03c8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2531,8 +2531,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, key.offset = (u64)-1; key.type = (u8)-1; - btrfs_init_path(path); - search_again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) From a48ddf08ba9bab91efd95e458737afa9d7699623 Mon Sep 17 00:00:00 2001 From: Qinghuang Feng Date: Thu, 12 Feb 2009 14:25:23 -0500 Subject: [PATCH 018/143] Btrfs: remove unused code in split_state() These two lines are not used, remove them. Signed-off-by: Qinghuang Feng Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 37d43b516b79..ebe6b29e6069 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -415,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); free_extent_state(prealloc); return -EEXIST; } From 3f3420df505e47751ef76a652b5cb660e5360d6f Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 12 Feb 2009 10:16:03 -0500 Subject: [PATCH 019/143] Btrfs: fs/btrfs/volumes.c: remove useless kzalloc The call to kzalloc is followed by a kmalloc whose result is stored in the same variable. The semantic match that finds the problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; statement S; expression E; identifier f,l; position p1,p2; expression *ptr != NULL; @@ ( if ((x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...)) == NULL) S | x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...); ... if (x == NULL) S ) <... when != x when != if (...) { <+...x...+> } x->f = E ...> ( return \(0\|<+...x...+>\|ptr\); | return@p2 ...; ) @script:python@ p1 << r.p1; p2 << r.p2; @@ print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line) // Signed-off-by: Julia Lawall Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bcd14ebccae1..c793b6f50d8d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2894,10 +2894,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, free_extent_map(em); } - map = kzalloc(sizeof(*map), GFP_NOFS); - if (!map) - return -ENOMEM; - em = alloc_extent_map(GFP_NOFS); if (!em) return -ENOMEM; From 4008c04a07c73ec3cb1be4c1391d2159a8f75d6d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Feb 2009 14:09:45 -0500 Subject: [PATCH 020/143] Btrfs: make a lockdep class for the extent buffer locks Btrfs is currently using spin_lock_nested with a nested value based on the tree depth of the block. But, this doesn't quite work because the max tree depth is bigger than what spin_lock_nested can deal with, and because locks are sometimes taken before the level field is filled in. The solution here is to use lockdep_set_class_and_name instead, and to set the class before unlocking the pages when the block is read from the disk and just after init of a freshly allocated tree block. btrfs_clear_path_blocking is also changed to take the locks in the proper order, and it also makes sure all the locks currently held are properly set to blocking before it tries to retake the spinlocks. Otherwise, lockdep gets upset about bad lock orderin. The lockdep magic cam from Peter Zijlstra Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 45 +++++++++++++++++++++++++++++++---------- fs/btrfs/ctree.h | 10 +++------ fs/btrfs/disk-io.c | 46 +++++++++++++++++++++++++++++++++++++++++- fs/btrfs/disk-io.h | 10 +++++++++ fs/btrfs/extent-tree.c | 7 +++++-- fs/btrfs/locking.c | 11 ---------- fs/btrfs/volumes.c | 2 ++ 7 files changed, 99 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c8f4c540cc2c..42491d728e99 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -62,14 +62,38 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) /* * reset all the locked nodes in the patch to spinning locks. + * + * held is used to keep lockdep happy, when lockdep is enabled + * we set held to a blocking lock before we go around and + * retake all the spinlocks in the path. You can safely use NULL + * for held */ -noinline void btrfs_clear_path_blocking(struct btrfs_path *p) +noinline void btrfs_clear_path_blocking(struct btrfs_path *p, + struct extent_buffer *held) { int i; - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* lockdep really cares that we take all of these spinlocks + * in the right order. If any of the locks in the path are not + * currently blocking, it is going to complain. So, make really + * really sure by forcing the path to blocking before we clear + * the path blocking. + */ + if (held) + btrfs_set_lock_blocking(held); + btrfs_set_path_blocking(p); +#endif + + for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { if (p->nodes[i] && p->locks[i]) btrfs_clear_lock_blocking(p->nodes[i]); } + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (held) + btrfs_clear_lock_blocking(held); +#endif } /* this also releases the path */ @@ -279,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid, level, &ins); BUG_ON(ret); cow = btrfs_init_new_buffer(trans, root, prealloc_dest, - buf->len); + buf->len, level); } else { cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, @@ -1559,7 +1583,7 @@ cow_done: if (!p->skip_locking) p->locks[level] = 1; - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); /* * we have a lock on b and as long as we aren't changing @@ -1598,7 +1622,7 @@ cow_done: btrfs_set_path_blocking(p); sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); BUG_ON(sret > 0); if (sret) { @@ -1618,7 +1642,7 @@ cow_done: btrfs_set_path_blocking(p); sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); if (sret) { ret = sret; @@ -1681,13 +1705,13 @@ cow_done: if (!p->skip_locking) { int lret; - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); lret = btrfs_try_spin_lock(b); if (!lret) { btrfs_set_path_blocking(p); btrfs_tree_lock(b); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, b); } } } else { @@ -1699,7 +1723,7 @@ cow_done: btrfs_set_path_blocking(p); sret = split_leaf(trans, root, key, p, ins_len, ret == 0); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); BUG_ON(sret > 0); if (sret) { @@ -3919,7 +3943,6 @@ find_next_key: btrfs_release_path(root, path); goto again; } else { - btrfs_clear_path_blocking(path); goto out; } } @@ -3939,7 +3962,7 @@ find_next_key: path->locks[level - 1] = 1; path->nodes[level - 1] = cur; unlock_up(path, level, 1); - btrfs_clear_path_blocking(path); + btrfs_clear_path_blocking(path, NULL); } out: if (ret == 0) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3f7a8058df2b..766b31ae3186 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -43,11 +43,7 @@ struct btrfs_ordered_sum; #define BTRFS_ACL_NOT_CACHED ((void *)-1) -#ifdef CONFIG_LOCKDEP -# define BTRFS_MAX_LEVEL 7 -#else -# define BTRFS_MAX_LEVEL 8 -#endif +#define BTRFS_MAX_LEVEL 8 /* holds pointers to all of the tree roots */ #define BTRFS_ROOT_TREE_OBJECTID 1ULL @@ -1715,7 +1711,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, u64 empty_size); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize); + u64 bytenr, u32 blocksize, + int level); int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 parent, u64 min_bytes, @@ -1835,7 +1832,6 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); void btrfs_set_path_blocking(struct btrfs_path *p); -void btrfs_clear_path_blocking(struct btrfs_path *p); void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5aebddd71193..adda739a0215 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -75,6 +75,40 @@ struct async_submit_bio { struct btrfs_work work; }; +/* These are used to set the lockdep class on the extent buffer locks. + * The class is set by the readpage_end_io_hook after the buffer has + * passed csum validation but before the pages are unlocked. + * + * The lockdep class is also set by btrfs_init_new_buffer on freshly + * allocated blocks. + * + * The class is based on the level in the tree block, which allows lockdep + * to know that lower nodes nest inside the locks of higher nodes. + * + * We also add a check to make sure the highest level of the tree is + * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this + * code needs update as well. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# if BTRFS_MAX_LEVEL != 8 +# error +# endif +static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; +static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { + /* leaf */ + "btrfs-extent-00", + "btrfs-extent-01", + "btrfs-extent-02", + "btrfs-extent-03", + "btrfs-extent-04", + "btrfs-extent-05", + "btrfs-extent-06", + "btrfs-extent-07", + /* highest possible level */ + "btrfs-extent-08", +}; +#endif + /* * extents on the btree inode are pretty simple, there's one extent * that covers the entire device @@ -347,6 +381,15 @@ static int check_tree_block_fsid(struct btrfs_root *root, return ret; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) +{ + lockdep_set_class_and_name(&eb->lock, + &btrfs_eb_class[level], + btrfs_eb_name[level]); +} +#endif + static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) { @@ -392,6 +435,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, } found_level = btrfs_header_level(eb); + btrfs_set_buffer_lockdep_class(eb, found_level); + ret = csum_tree_block(root, eb, 1); if (ret) ret = -EIO; @@ -1777,7 +1822,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); dev_root->track_dirty = 1; - if (ret) goto fail_extent_root; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 494a56eb2986..95029db227be 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -101,4 +101,14 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btree_lock_page_hook(struct page *page); + + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); +#else +static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, + int level) +{ +} +#endif #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c59e12036e20..cd86bffbdc9f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3416,7 +3416,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize) + u64 bytenr, u32 blocksize, + int level) { struct extent_buffer *buf; @@ -3424,6 +3425,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); + btrfs_set_buffer_lockdep_class(buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); @@ -3467,7 +3469,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } - buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); + buf = btrfs_init_new_buffer(trans, root, ins.objectid, + blocksize, level); return buf; } diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 9ebe9385129b..85506c4a3af7 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -25,21 +25,10 @@ #include "extent_io.h" #include "locking.h" -/* - * btrfs_header_level() isn't free, so don't call it when lockdep isn't - * on - */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static inline void spin_nested(struct extent_buffer *eb) -{ - spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); -} -#else static inline void spin_nested(struct extent_buffer *eb) { spin_lock(&eb->lock); } -#endif /* * Setting a lock to blocking will drop the spinlock and set the diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c793b6f50d8d..1316139bf9e8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3102,6 +3102,8 @@ int btrfs_read_sys_array(struct btrfs_root *root) if (!sb) return -ENOMEM; btrfs_set_buffer_uptodate(sb); + btrfs_set_buffer_lockdep_class(sb, 0); + write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); From 2456242530a21cfee82646ebeeda65d3f74faa4c Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Thu, 12 Feb 2009 14:14:53 -0500 Subject: [PATCH 021/143] Btrfs: hold trans_mutex when using btrfs_record_root_in_trans btrfs_record_root_in_trans needs the trans_mutex held to make sure two callers don't race to setup the root in a given transaction. This adds it to all the places that were missing it. Signed-off-by: Yan Zheng --- fs/btrfs/extent-tree.c | 2 ++ fs/btrfs/transaction.c | 2 ++ fs/btrfs/tree-log.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cd86bffbdc9f..0a5d796c9f7e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5658,7 +5658,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root, prev_block = block_start; } + mutex_lock(&extent_root->fs_info->trans_mutex); btrfs_record_root_in_trans(found_root); + mutex_unlock(&extent_root->fs_info->trans_mutex); if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { /* * try to update data extent references while diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 919172de5c9a..4112d53d4f4d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -688,7 +688,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, num_bytes -= btrfs_root_used(&dirty->root->root_item); bytes_used = btrfs_root_used(&root->root_item); if (num_bytes) { + mutex_lock(&root->fs_info->trans_mutex); btrfs_record_root_in_trans(root); + mutex_unlock(&root->fs_info->trans_mutex); btrfs_set_root_used(&root->root_item, bytes_used - num_bytes); } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 20794290256b..9c462fbd60fa 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2832,7 +2832,9 @@ again: BUG_ON(!wc.replay_dest); wc.replay_dest->log_root = log; + mutex_lock(&fs_info->trans_mutex); btrfs_record_root_in_trans(wc.replay_dest); + mutex_unlock(&fs_info->trans_mutex); ret = walk_log_tree(trans, log, &wc); BUG_ON(ret); From d85cf93da66977dbc645352be1b2084a659d8a0b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 12 Feb 2009 10:02:56 -0800 Subject: [PATCH 022/143] x86/paravirt: make arch_flush_lazy_mmu/cpu disable preemption Impact: avoid access to percpu vars in preempible context They are intended to be used whenever there's the possibility that there's some stale state which is going to be overwritten with a queued update, or to force a state change when we may be in lazy mode. Either way, we could end up calling it with preemption enabled, so wrap the functions in their own little preempt-disable section so they can be safely called in any context (though preemption should never be enabled if we're actually in a lazy state). (Move out of line to avoid #include dependencies.) Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 17 ++--------------- arch/x86/kernel/paravirt.c | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ba3e2ff6aedc..a660eceaa273 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -1352,14 +1352,7 @@ static inline void arch_leave_lazy_cpu_mode(void) PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave); } -static inline void arch_flush_lazy_cpu_mode(void) -{ - if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) { - arch_leave_lazy_cpu_mode(); - arch_enter_lazy_cpu_mode(); - } -} - +void arch_flush_lazy_cpu_mode(void); #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) @@ -1372,13 +1365,7 @@ static inline void arch_leave_lazy_mmu_mode(void) PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave); } -static inline void arch_flush_lazy_mmu_mode(void) -{ - if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) { - arch_leave_lazy_mmu_mode(); - arch_enter_lazy_mmu_mode(); - } -} +void arch_flush_lazy_mmu_mode(void); static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, unsigned long phys, pgprot_t flags) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e4c8fb608873..dcba6c567a2a 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -268,6 +268,30 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) return __get_cpu_var(paravirt_lazy_mode); } +void arch_flush_lazy_mmu_mode(void) +{ + preempt_disable(); + + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + arch_enter_lazy_mmu_mode(); + } + + preempt_enable(); +} + +void arch_flush_lazy_cpu_mode(void) +{ + preempt_disable(); + + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { + arch_leave_lazy_cpu_mode(); + arch_enter_lazy_cpu_mode(); + } + + preempt_enable(); +} + struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, From 34b0900d323122113683685b200aae9f9b75e63b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Feb 2009 21:30:48 +0100 Subject: [PATCH 023/143] x86: warn if arch_flush_lazy_mmu_cpu is called in preemptible context Impact: Catch cases where lazy MMU state is active in a preemtible context arch_flush_lazy_mmu_cpu() has been changed to disable preemption so the checks in enter/leave will never trigger. Put the preemtible() check into arch_flush_lazy_mmu_cpu() to catch such cases. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/paravirt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index dcba6c567a2a..c6520a4e85d4 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -273,6 +273,7 @@ void arch_flush_lazy_mmu_mode(void) preempt_disable(); if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { + WARN_ON(preempt_count() == 1); arch_leave_lazy_mmu_mode(); arch_enter_lazy_mmu_mode(); } @@ -285,6 +286,7 @@ void arch_flush_lazy_cpu_mode(void) preempt_disable(); if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { + WARN_ON(preempt_count() == 1); arch_leave_lazy_cpu_mode(); arch_enter_lazy_cpu_mode(); } From 7ad9de6ac83bd825996d2de98c92e0f425c31050 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Feb 2009 21:16:09 +0100 Subject: [PATCH 024/143] x86: CPA avoid repeated lazy mmu flush Impact: Flush the lazy MMU only once Pending mmu updates only need to be flushed once to bring the in-memory pagetable state up to date. Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f664bc1c4093..8ca0d8566fc8 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -575,14 +575,6 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) address = cpa->vaddr[cpa->curpage]; else address = *cpa->vaddr; - - /* - * If we're called with lazy mmu updates enabled, the - * in-memory pte state may be stale. Flush pending updates to - * bring them up to date. - */ - arch_flush_lazy_mmu_mode(); - repeat: kpte = lookup_address(address, &level); if (!kpte) @@ -819,6 +811,13 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, vm_unmap_aliases(); + /* + * If we're called with lazy mmu updates enabled, the + * in-memory pte state may be stale. Flush pending updates to + * bring them up to date. + */ + arch_flush_lazy_mmu_mode(); + cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; From fbc78b07ba53ace155f27491c81a009e541a93ad Mon Sep 17 00:00:00 2001 From: Philippe Gerum Date: Thu, 12 Feb 2009 12:18:46 +0000 Subject: [PATCH 025/143] powerpc/mm: Fix _PAGE_CHG_MASK to protect _PAGE_SPECIAL Fix _PAGE_CHG_MASK so that pte_modify() does not affect the _PAGE_SPECIAL bit. Signed-off-by: Philippe Gerum Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pgtable-4k.h | 2 +- arch/powerpc/include/asm/pgtable-64k.h | 2 +- arch/powerpc/include/asm/pgtable-ppc32.h | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-4k.h b/arch/powerpc/include/asm/pgtable-4k.h index 6b18ba9d2d85..1dbca4e7de67 100644 --- a/arch/powerpc/include/asm/pgtable-4k.h +++ b/arch/powerpc/include/asm/pgtable-4k.h @@ -60,7 +60,7 @@ /* It should be preserving the high 48 bits and then specifically */ /* preserving _PAGE_SECONDARY | _PAGE_GROUP_IX */ #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_HPTEFLAGS) + _PAGE_HPTEFLAGS | _PAGE_SPECIAL) /* Bits to mask out from a PMD to get to the PTE page */ #define PMD_MASKED_BITS 0 diff --git a/arch/powerpc/include/asm/pgtable-64k.h b/arch/powerpc/include/asm/pgtable-64k.h index 07b0d8f09cb6..7389003349a6 100644 --- a/arch/powerpc/include/asm/pgtable-64k.h +++ b/arch/powerpc/include/asm/pgtable-64k.h @@ -114,7 +114,7 @@ static inline struct subpage_prot_table *pgd_subpage_prot(pgd_t *pgd) * pgprot changes */ #define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ - _PAGE_ACCESSED) + _PAGE_ACCESSED | _PAGE_SPECIAL) /* Bits to mask out from a PMD to get to the PTE page */ #define PMD_MASKED_BITS 0x1ff diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h index f69a4d977729..820b5f0a35ce 100644 --- a/arch/powerpc/include/asm/pgtable-ppc32.h +++ b/arch/powerpc/include/asm/pgtable-ppc32.h @@ -429,7 +429,8 @@ extern int icache_44x_need_flush; #define PMD_PAGE_SIZE(pmd) bad_call_to_PMD_PAGE_SIZE() #endif -#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | \ + _PAGE_SPECIAL) #define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ From 06eccea6c34e00c8fedce49229ed35fa84619fb7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 12 Feb 2009 12:36:04 +0000 Subject: [PATCH 026/143] powerpc/mm: Fix numa reserve bootmem page selection Fix the powerpc NUMA reserve bootmem page selection logic. commit 8f64e1f2d1e09267ac926e15090fd505c1c0cbcb (powerpc: Reserve in bootmem lmb reserved regions that cross NUMA nodes) changed the logic for how the powerpc LMB reserved regions were converted to bootmen reserved regions. As the folowing discussion reports, the new logic was not correct. mark_reserved_regions_for_nid() goes through each LMB on the system that specifies a reserved area. It searches for active regions that intersect with that LMB and are on the specified node. It attempts to bootmem-reserve only the area where the active region and the reserved LMB intersect. We can not reserve things on other nodes as they may not have bootmem structures allocated, yet. We base the size of the bootmem reservation on two possible things. Normally, we just make the reservation start and stop exactly at the start and end of the LMB. However, the LMB reservations are not aware of NUMA nodes and on occasion a single LMB may cross into several adjacent active regions. Those may even be on different NUMA nodes and will require separate calls to the bootmem reserve functions. So, the bootmem reservation must be trimmed to fit inside the current active region. That's all fine and dandy, but we trim the reservation in a page-aligned fashion. That's bad because we start the reservation at a non-page-aligned address: physbase. The reservation may only span 2 bytes, but that those bytes may span two pfns and cause a reserve_size of 2*PAGE_SIZE. Take the case where you reserve 0x2 bytes at 0x0fff and where the active region ends at 0x1000. You'll jump into that if() statment, but node_ar.end_pfn=0x1 and start_pfn=0x0. You'll end up with a reserve_size=0x1000, and then call reserve_bootmem_node(node, physbase=0xfff, size=0x1000); 0x1000 may not be on the same node as 0xfff. Oops. In almost all the vm code, end_ is not inclusive. If you have an end_pfn of 0x1234, page 0x1234 is not included in the range. Using PFN_UP instead of the (>> >> PAGE_SHIFT) will make this consistent with the other VM code. We also need to do math for the reserved size with physbase instead of start_pfn. node_ar.end_pfn << PAGE_SHIFT is *precisely* the end of the node. However, (start_pfn << PAGE_SHIFT) is *NOT* precisely the beginning of the reserved area. That is, of course, physbase. If we don't use physbase here, the reserve_size can be made too large. From: Dave Hansen Tested-by: Geoff Levand Tested on PS3. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/numa.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 7393bd76d698..5ac08b8ab654 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -882,7 +883,7 @@ static void mark_reserved_regions_for_nid(int nid) unsigned long physbase = lmb.reserved.region[i].base; unsigned long size = lmb.reserved.region[i].size; unsigned long start_pfn = physbase >> PAGE_SHIFT; - unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT); + unsigned long end_pfn = PFN_UP(physbase + size); struct node_active_region node_ar; unsigned long node_end_pfn = node->node_start_pfn + node->node_spanned_pages; @@ -908,7 +909,7 @@ static void mark_reserved_regions_for_nid(int nid) */ if (end_pfn > node_ar.end_pfn) reserve_size = (node_ar.end_pfn << PAGE_SHIFT) - - (start_pfn << PAGE_SHIFT); + - physbase; /* * Only worry about *this* node, others may not * yet have valid NODE_DATA(). From 0047656e2a97d4452dd7df9e52591a7afe21a263 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Thu, 12 Feb 2009 12:36:16 +0000 Subject: [PATCH 027/143] powerpc/ps3: Move ps3_mm_add_memory to device_initcall Change the PS3 hotplug memory routine ps3_mm_add_memory() from a core_initcall to a device_initcall. core_initcall routines run before the powerpc topology_init() startup routine, which is a subsys_initcall, resulting in failure of ps3_mm_add_memory() when CONFIG_NUMA=y. When ps3_mm_add_memory() fails the system will boot with just the 128 MiB of boot memory Signed-off-by: Geoff Levand Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/ps3/mm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/ps3/mm.c b/arch/powerpc/platforms/ps3/mm.c index 67de6bf3db3d..d281cc0bca71 100644 --- a/arch/powerpc/platforms/ps3/mm.c +++ b/arch/powerpc/platforms/ps3/mm.c @@ -328,7 +328,7 @@ static int __init ps3_mm_add_memory(void) return result; } -core_initcall(ps3_mm_add_memory); +device_initcall(ps3_mm_add_memory); /*============================================================================*/ /* dma routines */ From 26456dcfb8d8e43b1b64b2a14710694cf7a72f05 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Thu, 12 Feb 2009 19:08:58 +0000 Subject: [PATCH 028/143] powerpc/vsx: Fix VSX alignment handler for regs 32-63 Fix the VSX alignment handler for VSX registers > 32. 32-63 are stored in the VMX part of the thread_struct not the FPR part. Signed-off-by: Michael Neuling CC: stable@kernel.org (2.6.27 & .28 please) Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/align.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index 5af4e9b2dbe2..ada06924a423 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -646,11 +646,16 @@ static int emulate_vsx(unsigned char __user *addr, unsigned int reg, unsigned int areg, struct pt_regs *regs, unsigned int flags, unsigned int length) { - char *ptr = (char *) ¤t->thread.TS_FPR(reg); + char *ptr; int ret = 0; flush_vsx_to_thread(current); + if (reg < 32) + ptr = (char *) ¤t->thread.TS_FPR(reg); + else + ptr = (char *) ¤t->thread.vr[reg - 32]; + if (flags & ST) ret = __copy_to_user(addr, ptr, length); else { From b13e24644c138d0ddbc451403c30a96b09bfd556 Mon Sep 17 00:00:00 2001 From: john stultz Date: Thu, 12 Feb 2009 18:48:53 -0800 Subject: [PATCH 029/143] x86, hpet: fix for LS21 + HPET = boot hang Between 2.6.23 and 2.6.24-rc1 a change was made that broke IBM LS21 systems that had the HPET enabled in the BIOS, resulting in boot hangs for x86_64. Specifically commit b8ce33590687888ebb900d09557b8807c4539022, which merges the i386 and x86_64 HPET code. Prior to this commit, when we setup the HPET timers in x86_64, we did the following: hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT, HPET_T0_CFG); However after the i386/x86_64 HPET merge, we do the following: cfg = hpet_readl(HPET_Tn_CFG(timer)); cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT; hpet_writel(cfg, HPET_Tn_CFG(timer)); However on LS21s with HPET enabled in the BIOS, the HPET_T0_CFG register boots with Level triggered interrupts (HPET_TN_LEVEL) enabled. This causes the periodic interrupt to be not so periodic, and that results in the boot time hang I reported earlier in the delay calibration. My fix: Always disable HPET_TN_LEVEL when setting up periodic mode. Signed-off-by: John Stultz Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 64d5ad0b8add..5c8da2c2c185 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -269,6 +269,8 @@ static void hpet_set_mode(enum clock_event_mode mode, now = hpet_readl(HPET_COUNTER); cmp = now + (unsigned long) delta; cfg = hpet_readl(HPET_Tn_CFG(timer)); + /* Make sure we use edge triggered interrupts */ + cfg &= ~HPET_TN_LEVEL; cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT; hpet_writel(cfg, HPET_Tn_CFG(timer)); From 3997ad317fdf9ecdb5702e2b4fd1f8229814ff8c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Feb 2009 15:00:52 +0100 Subject: [PATCH 030/143] timers: more consistently use clock vs timer While reviewing the manpages, I noticed I'd missed some clock vs timer sites. Make sure that all timer functions call cpu_timer_sample_group() and not cpu_clock_sample_group(). This ensures that we enable the process wide timer in time, and therefore pay the O(n) thread group cost from the syscall. Not doing it here, will result in the first jiffy tick after setting the timer doing this, resulting in a very expensive tick (but only once) and a delay in actually starting the timer. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 60 +++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 2313a4cc14ea..e976e505648d 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -680,6 +680,33 @@ static void cpu_timer_fire(struct k_itimer *timer) } } +/* + * Sample a process (thread group) timer for the given group_leader task. + * Must be called with tasklist_lock held for reading. + */ +static int cpu_timer_sample_group(const clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) +{ + struct task_cputime cputime; + + thread_group_cputimer(p, &cputime); + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + cpu->cpu = cputime_add(cputime.utime, cputime.stime); + break; + case CPUCLOCK_VIRT: + cpu->cpu = cputime.utime; + break; + case CPUCLOCK_SCHED: + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + break; + } + return 0; +} + /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. @@ -741,7 +768,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, if (CPUCLOCK_PERTHREAD(timer->it_clock)) { cpu_clock_sample(timer->it_clock, p, &val); } else { - cpu_clock_sample_group(timer->it_clock, p, &val); + cpu_timer_sample_group(timer->it_clock, p, &val); } if (old) { @@ -889,7 +916,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) read_unlock(&tasklist_lock); goto dead; } else { - cpu_clock_sample_group(timer->it_clock, p, &now); + cpu_timer_sample_group(timer->it_clock, p, &now); clear_dead = (unlikely(p->exit_state) && thread_group_empty(p)); } @@ -1244,7 +1271,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) clear_dead_task(timer, now); goto out_unlock; } - cpu_clock_sample_group(timer->it_clock, p, &now); + cpu_timer_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); /* Leave the tasklist_lock locked for the call below. */ } @@ -1408,33 +1435,6 @@ void run_posix_cpu_timers(struct task_struct *tsk) } } -/* - * Sample a process (thread group) timer for the given group_leader task. - * Must be called with tasklist_lock held for reading. - */ -static int cpu_timer_sample_group(const clockid_t which_clock, - struct task_struct *p, - union cpu_time_count *cpu) -{ - struct task_cputime cputime; - - thread_group_cputimer(p, &cputime); - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; - case CPUCLOCK_PROF: - cpu->cpu = cputime_add(cputime.utime, cputime.stime); - break; - case CPUCLOCK_VIRT: - cpu->cpu = cputime.utime; - break; - case CPUCLOCK_SCHED: - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); - break; - } - return 0; -} - /* * Set one of the process-wide special case CPU timers. * The tsk->sighand->siglock must be held by the caller. From 0b49ec37a20bc7eb7178105aadaa8d1ecba825f8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 8 Feb 2009 20:27:47 -0700 Subject: [PATCH 031/143] PCI/MSI: fix msi_mask() shift fix Hidetoshi Seto points out that commit bffac3c593eba1f9da3efd0199e49ea6558a40ce has wrong values in the array. Rather than correct the array, we can just use a bounds check and perform the calculation specified in the comment. As a bonus, this will not run off the end of the array if the device specifies an illegal value in the MSI capability. Signed-off-by: Matthew Wilcox Signed-off-by: Hidetoshi Seto Signed-off-by: Jesse Barnes --- drivers/pci/msi.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 44f15ff70c1d..baba2eb5367d 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -103,14 +103,12 @@ static void msix_set_enable(struct pci_dev *dev, int enable) } } -/* - * Essentially, this is ((1 << (1 << x)) - 1), but without the - * undefinedness of a << 32. - */ static inline __attribute_const__ u32 msi_mask(unsigned x) { - static const u32 mask[] = { 1, 2, 4, 0xf, 0xff, 0xffff, 0xffffffff }; - return mask[x]; + /* Don't shift by >= width of type */ + if (x >= 5) + return 0xffffffff; + return (1 << (1 << x)) - 1; } static void msix_flush_writes(struct irq_desc *desc) From 4cc59c721cba27a4644e29103afac0f91af8da3c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 9 Feb 2009 09:31:20 -0800 Subject: [PATCH 032/143] PCI: fix rom.c kernel-doc warning Fix PCI kernel-doc warning: Warning(linux-2.6.29-rc4-git1/drivers/pci/rom.c:67): No description found for parameter 'pdev' Signed-off-by: Randy Dunlap cc: Jesse Barnes Signed-off-by: Jesse Barnes --- drivers/pci/rom.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/rom.c b/drivers/pci/rom.c index 29cbe47f219f..36864a935d68 100644 --- a/drivers/pci/rom.c +++ b/drivers/pci/rom.c @@ -55,6 +55,7 @@ void pci_disable_rom(struct pci_dev *pdev) /** * pci_get_rom_size - obtain the actual size of the ROM image + * @pdev: target PCI device * @rom: kernel virtual pointer to image of ROM * @size: size of PCI window * return: size of actual ROM image From b33bfdef24565fe54da91adf3cd4eea13488d7fc Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 9 Jan 2009 17:04:26 -0800 Subject: [PATCH 033/143] PCI: fix struct pci_platform_pm_ops kernel-doc Fix struct pci_platform_pm_ops kernel-doc notation. Signed-off-by: Randy Dunlap Signed-off-by: Jesse Barnes --- drivers/pci/pci.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 26ddf78ac300..07c0aa5275e6 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -16,21 +16,21 @@ extern int pci_mmap_fits(struct pci_dev *pdev, int resno, #endif /** - * Firmware PM callbacks + * struct pci_platform_pm_ops - Firmware PM callbacks * - * @is_manageable - returns 'true' if given device is power manageable by the - * platform firmware + * @is_manageable: returns 'true' if given device is power manageable by the + * platform firmware * - * @set_state - invokes the platform firmware to set the device's power state + * @set_state: invokes the platform firmware to set the device's power state * - * @choose_state - returns PCI power state of given device preferred by the - * platform; to be used during system-wide transitions from a - * sleeping state to the working state and vice versa + * @choose_state: returns PCI power state of given device preferred by the + * platform; to be used during system-wide transitions from a + * sleeping state to the working state and vice versa * - * @can_wakeup - returns 'true' if given device is capable of waking up the - * system from a sleeping state + * @can_wakeup: returns 'true' if given device is capable of waking up the + * system from a sleeping state * - * @sleep_wake - enables/disables the system wake up capability of given device + * @sleep_wake: enables/disables the system wake up capability of given device * * If given platform is generally capable of power managing PCI devices, all of * these callbacks are mandatory. From f5ddcac435b6c6133a9c137c345abef53b93cf55 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 9 Jan 2009 17:03:20 -0800 Subject: [PATCH 034/143] PCI: fix missing kernel-doc and typos Fix pci kernel-doc parameter missing notation, correct function name, and fix typo: Warning(linux-2.6.28-git10//drivers/pci/pci.c:1511): No description found for parameter 'exclusive' Signed-off-by: Randy Dunlap Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index e3efe6b19ee7..6d6120007af4 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1540,16 +1540,21 @@ void pci_release_region(struct pci_dev *pdev, int bar) } /** - * pci_request_region - Reserved PCI I/O and memory resource + * __pci_request_region - Reserved PCI I/O and memory resource * @pdev: PCI device whose resources are to be reserved * @bar: BAR to be reserved * @res_name: Name to be associated with resource. + * @exclusive: whether the region access is exclusive or not * * Mark the PCI region associated with PCI device @pdev BR @bar as * being reserved by owner @res_name. Do not access any * address inside the PCI regions unless this call returns * successfully. * + * If @exclusive is set, then the region is marked so that userspace + * is explicitly not allowed to map the resource via /dev/mem or + * sysfs MMIO access. + * * Returns 0 on success, or %EBUSY on error. A warning * message is also printed on failure. */ @@ -1588,12 +1593,12 @@ err_out: } /** - * pci_request_region - Reserved PCI I/O and memory resource + * pci_request_region - Reserve PCI I/O and memory resource * @pdev: PCI device whose resources are to be reserved * @bar: BAR to be reserved - * @res_name: Name to be associated with resource. + * @res_name: Name to be associated with resource * - * Mark the PCI region associated with PCI device @pdev BR @bar as + * Mark the PCI region associated with PCI device @pdev BAR @bar as * being reserved by owner @res_name. Do not access any * address inside the PCI regions unless this call returns * successfully. From a08f6e04d74478d91f34a0484e69e89870adb33d Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Fri, 13 Feb 2009 12:03:17 -0700 Subject: [PATCH 035/143] PCI: Documentation: fix minor PCIe HOWTO thinko Update doc to correctly refer to replacing the pci_register_driver API, and not the non-existent "pci_module_init" API. Signed-off-by: Alex Chiang Signed-off-by: Jesse Barnes --- Documentation/PCI/PCIEBUS-HOWTO.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/PCI/PCIEBUS-HOWTO.txt b/Documentation/PCI/PCIEBUS-HOWTO.txt index 9a07e38631b0..6bd5f372adec 100644 --- a/Documentation/PCI/PCIEBUS-HOWTO.txt +++ b/Documentation/PCI/PCIEBUS-HOWTO.txt @@ -93,7 +93,7 @@ the PCI Express Port Bus driver from loading a service driver. int pcie_port_service_register(struct pcie_port_service_driver *new) -This API replaces the Linux Driver Model's pci_module_init API. A +This API replaces the Linux Driver Model's pci_register_driver API. A service driver should always calls pcie_port_service_register at module init. Note that after service driver being loaded, calls such as pci_enable_device(dev) and pci_set_master(dev) are no longer From 12d60e28bed3f593aac5385acbdbb089eb8ae21e Mon Sep 17 00:00:00 2001 From: Wim Van Sebroeck Date: Wed, 28 Jan 2009 20:51:04 +0000 Subject: [PATCH 036/143] [WATCHDOG] iTCO_wdt: fix SMI_EN regression 2 bugzilla: #12363 commit 7cd5b08be3c489df11b559fef210b81133764ad4 added a second regression: some Dell's and Compaq's lockup on boot. So we revert most of the code. The ICH9 reboot issue remains in place and will need some more fixing... :-( Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 2 +- drivers/watchdog/iTCO_vendor_support.c | 32 ++++++++++++++++++++--- drivers/watchdog/iTCO_wdt.c | 35 +++++++++++--------------- 3 files changed, 43 insertions(+), 26 deletions(-) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 09a3d5522b43..325c10ff6a2c 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -406,7 +406,7 @@ config ITCO_WDT ---help--- Hardware driver for the intel TCO timer based watchdog devices. These drivers are included in the Intel 82801 I/O Controller - Hub family (from ICH0 up to ICH8) and in the Intel 6300ESB + Hub family (from ICH0 up to ICH10) and in the Intel 63xxESB controller hub. The TCO (Total Cost of Ownership) timer is a watchdog timer diff --git a/drivers/watchdog/iTCO_vendor_support.c b/drivers/watchdog/iTCO_vendor_support.c index 2474ebca88f6..d8264ad0be41 100644 --- a/drivers/watchdog/iTCO_vendor_support.c +++ b/drivers/watchdog/iTCO_vendor_support.c @@ -1,7 +1,7 @@ /* * intel TCO vendor specific watchdog driver support * - * (c) Copyright 2006-2008 Wim Van Sebroeck . + * (c) Copyright 2006-2009 Wim Van Sebroeck . * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -19,7 +19,7 @@ /* Module and version information */ #define DRV_NAME "iTCO_vendor_support" -#define DRV_VERSION "1.02" +#define DRV_VERSION "1.03" #define PFX DRV_NAME ": " /* Includes */ @@ -77,6 +77,26 @@ MODULE_PARM_DESC(vendorsupport, "iTCO vendor specific support mode, default=0 (n * 20.6 seconds. */ +static void supermicro_old_pre_start(unsigned long acpibase) +{ + unsigned long val32; + + /* Bit 13: TCO_EN -> 0 = Disables TCO logic generating an SMI# */ + val32 = inl(SMI_EN); + val32 &= 0xffffdfff; /* Turn off SMI clearing watchdog */ + outl(val32, SMI_EN); /* Needed to activate watchdog */ +} + +static void supermicro_old_pre_stop(unsigned long acpibase) +{ + unsigned long val32; + + /* Bit 13: TCO_EN -> 1 = Enables the TCO logic to generate SMI# */ + val32 = inl(SMI_EN); + val32 |= 0x00002000; /* Turn on SMI clearing watchdog */ + outl(val32, SMI_EN); /* Needed to deactivate watchdog */ +} + static void supermicro_old_pre_keepalive(unsigned long acpibase) { /* Reload TCO Timer (done in iTCO_wdt_keepalive) + */ @@ -228,14 +248,18 @@ static void supermicro_new_pre_set_heartbeat(unsigned int heartbeat) void iTCO_vendor_pre_start(unsigned long acpibase, unsigned int heartbeat) { - if (vendorsupport == SUPERMICRO_NEW_BOARD) + if (vendorsupport == SUPERMICRO_OLD_BOARD) + supermicro_old_pre_start(acpibase); + else if (vendorsupport == SUPERMICRO_NEW_BOARD) supermicro_new_pre_start(heartbeat); } EXPORT_SYMBOL(iTCO_vendor_pre_start); void iTCO_vendor_pre_stop(unsigned long acpibase) { - if (vendorsupport == SUPERMICRO_NEW_BOARD) + if (vendorsupport == SUPERMICRO_OLD_BOARD) + supermicro_old_pre_stop(acpibase); + else if (vendorsupport == SUPERMICRO_NEW_BOARD) supermicro_new_pre_stop(); } EXPORT_SYMBOL(iTCO_vendor_pre_stop); diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index 5b395a4ddfdf..352334947ea3 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -1,7 +1,7 @@ /* - * intel TCO Watchdog Driver (Used in i82801 and i6300ESB chipsets) + * intel TCO Watchdog Driver (Used in i82801 and i63xxESB chipsets) * - * (c) Copyright 2006-2008 Wim Van Sebroeck . + * (c) Copyright 2006-2009 Wim Van Sebroeck . * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -63,7 +63,7 @@ /* Module and version information */ #define DRV_NAME "iTCO_wdt" -#define DRV_VERSION "1.04" +#define DRV_VERSION "1.05" #define PFX DRV_NAME ": " /* Includes */ @@ -236,16 +236,16 @@ MODULE_DEVICE_TABLE(pci, iTCO_wdt_pci_tbl); /* Address definitions for the TCO */ /* TCO base address */ -#define TCOBASE iTCO_wdt_private.ACPIBASE + 0x60 +#define TCOBASE iTCO_wdt_private.ACPIBASE + 0x60 /* SMI Control and Enable Register */ -#define SMI_EN iTCO_wdt_private.ACPIBASE + 0x30 +#define SMI_EN iTCO_wdt_private.ACPIBASE + 0x30 #define TCO_RLD TCOBASE + 0x00 /* TCO Timer Reload and Curr. Value */ #define TCOv1_TMR TCOBASE + 0x01 /* TCOv1 Timer Initial Value */ -#define TCO_DAT_IN TCOBASE + 0x02 /* TCO Data In Register */ -#define TCO_DAT_OUT TCOBASE + 0x03 /* TCO Data Out Register */ -#define TCO1_STS TCOBASE + 0x04 /* TCO1 Status Register */ -#define TCO2_STS TCOBASE + 0x06 /* TCO2 Status Register */ +#define TCO_DAT_IN TCOBASE + 0x02 /* TCO Data In Register */ +#define TCO_DAT_OUT TCOBASE + 0x03 /* TCO Data Out Register */ +#define TCO1_STS TCOBASE + 0x04 /* TCO1 Status Register */ +#define TCO2_STS TCOBASE + 0x06 /* TCO2 Status Register */ #define TCO1_CNT TCOBASE + 0x08 /* TCO1 Control Register */ #define TCO2_CNT TCOBASE + 0x0a /* TCO2 Control Register */ #define TCOv2_TMR TCOBASE + 0x12 /* TCOv2 Timer Initial Value */ @@ -338,7 +338,6 @@ static int iTCO_wdt_unset_NO_REBOOT_bit(void) static int iTCO_wdt_start(void) { unsigned int val; - unsigned long val32; spin_lock(&iTCO_wdt_private.io_lock); @@ -351,11 +350,6 @@ static int iTCO_wdt_start(void) return -EIO; } - /* Bit 13: TCO_EN -> 0 = Disables TCO logic generating an SMI# */ - val32 = inl(SMI_EN); - val32 &= 0xffffdfff; /* Turn off SMI clearing watchdog */ - outl(val32, SMI_EN); - /* Force the timer to its reload value by writing to the TCO_RLD register */ if (iTCO_wdt_private.iTCO_version == 2) @@ -378,7 +372,6 @@ static int iTCO_wdt_start(void) static int iTCO_wdt_stop(void) { unsigned int val; - unsigned long val32; spin_lock(&iTCO_wdt_private.io_lock); @@ -390,11 +383,6 @@ static int iTCO_wdt_stop(void) outw(val, TCO1_CNT); val = inw(TCO1_CNT); - /* Bit 13: TCO_EN -> 1 = Enables the TCO logic to generate SMI# */ - val32 = inl(SMI_EN); - val32 |= 0x00002000; - outl(val32, SMI_EN); - /* Set the NO_REBOOT bit to prevent later reboots, just for sure */ iTCO_wdt_set_NO_REBOOT_bit(); @@ -649,6 +637,7 @@ static int __devinit iTCO_wdt_init(struct pci_dev *pdev, int ret; u32 base_address; unsigned long RCBA; + unsigned long val32; /* * Find the ACPI/PM base I/O address which is the base @@ -695,6 +684,10 @@ static int __devinit iTCO_wdt_init(struct pci_dev *pdev, ret = -EIO; goto out; } + /* Bit 13: TCO_EN -> 0 = Disables TCO logic generating an SMI# */ + val32 = inl(SMI_EN); + val32 &= 0xffffdfff; /* Turn off SMI clearing watchdog */ + outl(val32, SMI_EN); /* The TCO I/O registers reside in a 32-byte range pointed to by the TCOBASE value */ From d794bf8e0936dce45104565cd48c571061f4c1e3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 14 Feb 2009 10:31:16 -0500 Subject: [PATCH 037/143] ext4: Initialize preallocation list_head's properly When creating a new ext4_prealloc_space structure, we have to initialize its list_head pointers before we add them to any prealloc lists. Otherwise, with list debug enabled, we will get list corruption warnings. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c962d0690505..4415beeb0b62 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3693,6 +3693,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_free = pa->pa_len; atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_linear = 0; @@ -3755,6 +3757,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_linear = 1; From 2acf2c261b823d9d9ed954f348b97620297a36b5 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 14 Feb 2009 10:42:58 -0500 Subject: [PATCH 038/143] ext4: Implement range_cyclic in ext4_da_writepages instead of write_cache_pages With delayed allocation we lock the page in write_cache_pages() and try to build an in memory extent of contiguous blocks. This is needed so that we can get large contiguous blocks request. If range_cyclic mode is enabled, write_cache_pages() will loop back to the 0 index if no I/O has been done yet, and try to start writing from the beginning of the range. That causes an attempt to take the page lock of lower index page while holding the page lock of higher index page, which can cause a dead lock with another writeback thread. The solution is to implement the range_cyclic behavior in ext4_da_writepages() instead. http://bugzilla.kernel.org/show_bug.cgi?id=12579 Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 658c4a7f2578..cbd2ca99d113 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2439,6 +2439,7 @@ static int ext4_da_writepages(struct address_space *mapping, int no_nrwrite_index_update; int pages_written = 0; long pages_skipped; + int range_cyclic, cycled = 1, io_done = 0; int needed_blocks, ret = 0, nr_to_writebump = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); @@ -2490,9 +2491,15 @@ static int ext4_da_writepages(struct address_space *mapping, if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - if (wbc->range_cyclic) + range_cyclic = wbc->range_cyclic; + if (wbc->range_cyclic) { index = mapping->writeback_index; - else + if (index) + cycled = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = LLONG_MAX; + wbc->range_cyclic = 0; + } else index = wbc->range_start >> PAGE_CACHE_SHIFT; mpd.wbc = wbc; @@ -2506,6 +2513,7 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->no_nrwrite_index_update = 1; pages_skipped = wbc->pages_skipped; +retry: while (!ret && wbc->nr_to_write > 0) { /* @@ -2548,6 +2556,7 @@ static int ext4_da_writepages(struct address_space *mapping, pages_written += mpd.pages_written; wbc->pages_skipped = pages_skipped; ret = 0; + io_done = 1; } else if (wbc->nr_to_write) /* * There is no more writeout needed @@ -2556,6 +2565,13 @@ static int ext4_da_writepages(struct address_space *mapping, */ break; } + if (!io_done && !cycled) { + cycled = 1; + index = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = mapping->writeback_index - 1; + goto retry; + } if (pages_skipped != wbc->pages_skipped) printk(KERN_EMERG "This should not happen leaving %s " "with nr_to_write = %ld ret = %d\n", @@ -2563,6 +2579,7 @@ static int ext4_da_writepages(struct address_space *mapping, /* Update index */ index += pages_written; + wbc->range_cyclic = range_cyclic; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * set the writeback_index so that range_cyclic From 9af88143b277f52fc6ce0d69137f435c73c39c1a Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 13 Feb 2009 23:18:03 +0000 Subject: [PATCH 039/143] iommu: fix Intel IOMMU write-buffer flushing This is the cause of the DMA faults and disk corruption that people have been seeing. Some chipsets neglect to report the RWBF "capability" -- the flag which says that we need to flush the chipset write-buffer when changing the DMA page tables, to ensure that the change is visible to the IOMMU. Override that bit on the affected chipsets, and everything is happy again. Thanks to Chris and Bhavesh and others for helping to debug. Should resolve: https://bugzilla.redhat.com/show_bug.cgi?id=479996 http://bugzilla.kernel.org/show_bug.cgi?id=12578 Signed-off-by: David Woodhouse Cc: Linus Torvalds Tested-and-acked-by: Chris Wright Reviewed-by: Bhavesh Davda Signed-off-by: Ingo Molnar --- drivers/pci/intel-iommu.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index f4b7c79023ff..f3f686581a90 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -61,6 +61,8 @@ /* global iommu list, set NULL for ignored DMAR units */ static struct intel_iommu **g_iommus; +static int rwbf_quirk; + /* * 0: Present * 1-11: Reserved @@ -785,7 +787,7 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu) u32 val; unsigned long flag; - if (!cap_rwbf(iommu->cap)) + if (!rwbf_quirk && !cap_rwbf(iommu->cap)) return; val = iommu->gcmd | DMA_GCMD_WBF; @@ -3137,3 +3139,15 @@ static struct iommu_ops intel_iommu_ops = { .unmap = intel_iommu_unmap_range, .iova_to_phys = intel_iommu_iova_to_phys, }; + +static void __devinit quirk_iommu_rwbf(struct pci_dev *dev) +{ + /* + * Mobile 4 Series Chipset neglects to set RWBF capability, + * but needs it: + */ + printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n"); + rwbf_quirk = 1; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); From e49590b6dd356f8ef10ba3531a29e5086f6f2e3a Mon Sep 17 00:00:00 2001 From: Chris Ball Date: Fri, 13 Feb 2009 20:56:18 -0500 Subject: [PATCH 040/143] x86, olpc: fix model detection without OFW Impact: fix "garbled display, laptop is unusable" bug Commit e51a1ac2dfca9ad869471e88f828281db7e810c0 ("x86, olpc: fix endian bug in openfirmware workaround") breaks model comparison on OLPC; the value 0xc2 needs to be scaled up by olpc_board(). The pre-patch version was wrong, but accidentally worked anyway (big-endian 0xc2 is big enough to satisfy all other board revisions, but little endian 0xc2 is not). Signed-off-by: Chris Ball Cc: Andrew Morton Acked-by: Andres Salomon Cc: Harvey Harrison Signed-off-by: Ingo Molnar --- arch/x86/kernel/olpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 7a13fac63a1f..4006c522adc7 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -203,7 +203,7 @@ static void __init platform_detect(void) static void __init platform_detect(void) { /* stopgap until OFW support is added to the kernel */ - olpc_platform_info.boardrev = 0xc2; + olpc_platform_info.boardrev = olpc_board(0xc2); } #endif From d39123a486524fed9b4e43e08a8757fd90a5859a Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 8 Jan 2009 15:13:31 +0800 Subject: [PATCH 041/143] KVM: ia64: fix fp fault/trap handler The floating-point registers f6-f11 is used by vmm and saved in kvm-pt-regs, so should set the correct bit mask and the pointer in fp_state, otherwise, fpswa may touch vmm's fp registers instead of guests'. In addition, for fp trap handling, since the instruction which leads to fp trap is completely executed, so can't use retry machanism to re-execute it, because it may pollute some registers. Signed-off-by: Yang Zhang Signed-off-by: Avi Kivity --- arch/ia64/kvm/process.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c index 552d07724207..230eae482f32 100644 --- a/arch/ia64/kvm/process.c +++ b/arch/ia64/kvm/process.c @@ -455,13 +455,18 @@ fpswa_ret_t vmm_fp_emulate(int fp_fault, void *bundle, unsigned long *ipsr, if (!vmm_fpswa_interface) return (fpswa_ret_t) {-1, 0, 0, 0}; - /* - * Just let fpswa driver to use hardware fp registers. - * No fp register is valid in memory. - */ memset(&fp_state, 0, sizeof(fp_state_t)); /* + * compute fp_state. only FP registers f6 - f11 are used by the + * vmm, so set those bits in the mask and set the low volatile + * pointer to point to these registers. + */ + fp_state.bitmask_low64 = 0xfc0; /* bit6..bit11 */ + + fp_state.fp_state_low_volatile = (fp_state_low_volatile_t *) ®s->f6; + + /* * unsigned long (*EFI_FPSWA) ( * unsigned long trap_type, * void *Bundle, @@ -545,10 +550,6 @@ void reflect_interruption(u64 ifa, u64 isr, u64 iim, status = vmm_handle_fpu_swa(0, regs, isr); if (!status) return ; - else if (-EAGAIN == status) { - vcpu_decrement_iip(vcpu); - return ; - } break; } From 7a0eb1960e8ddcb68ea631caf16815485af0e228 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 19 Jan 2009 14:57:52 +0200 Subject: [PATCH 042/143] KVM: Avoid using CONFIG_ in userspace visible headers Kconfig symbols are not available in userspace, and are not stripped by headers-install. Avoid their use by adding #defines in to suit each architecture. Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm.h | 4 ++++ arch/x86/include/asm/kvm.h | 7 +++++++ include/linux/kvm.h | 10 +++++----- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index 68aa6da807c1..bfa86b6af7cd 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -25,6 +25,10 @@ #include +/* Select x86 specific features in */ +#define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_DEVICE_ASSIGNMENT + /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index d2e3bf3608af..886c9402ec45 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -9,6 +9,13 @@ #include #include +/* Select x86 specific features in */ +#define __KVM_HAVE_PIT +#define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_DEVICE_ASSIGNMENT +#define __KVM_HAVE_MSI +#define __KVM_HAVE_USER_NMI + /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 5715f1907601..0424326f1679 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -58,10 +58,10 @@ struct kvm_irqchip { __u32 pad; union { char dummy[512]; /* reserving space */ -#ifdef CONFIG_X86 +#ifdef __KVM_HAVE_PIT struct kvm_pic_state pic; #endif -#if defined(CONFIG_X86) || defined(CONFIG_IA64) +#ifdef __KVM_HAVE_IOAPIC struct kvm_ioapic_state ioapic; #endif } chip; @@ -384,16 +384,16 @@ struct kvm_trace_rec { #define KVM_CAP_MP_STATE 14 #define KVM_CAP_COALESCED_MMIO 15 #define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ -#if defined(CONFIG_X86)||defined(CONFIG_IA64) +#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT #define KVM_CAP_DEVICE_ASSIGNMENT 17 #endif #define KVM_CAP_IOMMU 18 -#if defined(CONFIG_X86) +#ifdef __KVM_HAVE_MSI #define KVM_CAP_DEVICE_MSI 20 #endif /* Bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21 -#if defined(CONFIG_X86) +#ifdef __KVM_HAVE_USER_NMI #define KVM_CAP_USER_NMI 22 #endif From 85db06e514422ae429b5f85742d8111b70bd56f3 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 10 Dec 2008 21:23:26 +0100 Subject: [PATCH 043/143] KVM: mmu_notifiers release method The destructor for huge pages uses the backing inode for adjusting hugetlbfs accounting. Hugepage mappings are destroyed by exit_mmap, after mmu_notifier_release, so there are no notifications through unmap_hugepage_range at this point. The hugetlbfs inode can be freed with pages backed by it referenced by the shadow. When the shadow releases its reference, the huge page destructor will access a now freed inode. Implement the release operation for kvm mmu notifiers to release page refs before the hugetlbfs inode is gone. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3a5a08298aab..0b6f2f71271f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -789,11 +789,19 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, return young; } +static void kvm_mmu_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + kvm_arch_flush_shadow(kvm); +} + static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .invalidate_page = kvm_mmu_notifier_invalidate_page, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, + .release = kvm_mmu_notifier_release, }; #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ From ad8ba2cd44d4d39fb3fe55d5dcc565b19fc3a7fb Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 6 Jan 2009 10:03:02 +0800 Subject: [PATCH 044/143] KVM: Add kvm_arch_sync_events to sync with asynchronize events kvm_arch_sync_events is introduced to quiet down all other events may happen contemporary with VM destroy process, like IRQ handler and work struct for assigned device. For kvm_arch_sync_events is called at the very beginning of kvm_destroy_vm(), so the state of KVM here is legal and can provide a environment to quiet down other events. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 4 ++++ arch/powerpc/kvm/powerpc.c | 4 ++++ arch/s390/kvm/kvm-s390.c | 4 ++++ arch/x86/kvm/x86.c | 4 ++++ include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 1 + 6 files changed, 18 insertions(+) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 4e586f6110aa..28f982045f29 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1337,6 +1337,10 @@ static void kvm_release_vm_pages(struct kvm *kvm) } } +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_iommu_unmap_guest(kvm); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2822c8ccfaaf..5f81256287f5 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -125,6 +125,10 @@ static void kvmppc_free_vcpus(struct kvm *kvm) } } +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvmppc_free_vcpus(kvm); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index be8497186b96..0d33893e1e89 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -212,6 +212,10 @@ static void kvm_free_vcpus(struct kvm *kvm) } } +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_free_vcpus(kvm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cc17546a2406..b0fc079f1bee 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4127,6 +4127,10 @@ static void kvm_free_vcpus(struct kvm *kvm) } +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_free_all_assigned_devices(kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ec49d0be7f52..bf6f703642fc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -285,6 +285,7 @@ void kvm_free_physmem(struct kvm *kvm); struct kvm *kvm_arch_create_vm(void); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_free_all_assigned_devices(struct kvm *kvm); +void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *v); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0b6f2f71271f..68e3f1ec1674 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -891,6 +891,7 @@ static void kvm_destroy_vm(struct kvm *kvm) { struct mm_struct *mm = kvm->mm; + kvm_arch_sync_events(kvm); spin_lock(&kvm_lock); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); From ba4cef31d5a397b64ba6d3ff713ce06c62f0c597 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 6 Jan 2009 10:03:03 +0800 Subject: [PATCH 045/143] KVM: Fix racy in kvm_free_assigned_irq In the past, kvm_get_kvm() and kvm_put_kvm() was called in assigned device irq handler and interrupt_work, in order to prevent cancel_work_sync() in kvm_free_assigned_irq got a illegal state when waiting for interrupt_work done. But it's tricky and still got two problems: 1. A bug ignored two conditions that cancel_work_sync() would return true result in a additional kvm_put_kvm(). 2. If interrupt type is MSI, we would got a window between cancel_work_sync() and free_irq(), which interrupt would be injected again... This patch discard the reference count used for irq handler and interrupt_work, and ensure the legal state by moving the free function at the very beginning of kvm_destroy_vm(). And the patch fix the second bug by disable irq before cancel_work_sync(), which may result in nested disable of irq but OK for we are going to free it. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- virt/kvm/kvm_main.c | 27 +++++++++++++++++++-------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b0fc079f1bee..fc3e329f6ade 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4129,11 +4129,11 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { + kvm_free_all_assigned_devices(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) { - kvm_free_all_assigned_devices(kvm); kvm_iommu_unmap_guest(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 68e3f1ec1674..277ea7f39fc8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -173,7 +173,6 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) assigned_dev->host_irq_disabled = false; } mutex_unlock(&assigned_dev->kvm->lock); - kvm_put_kvm(assigned_dev->kvm); } static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) @@ -181,8 +180,6 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) struct kvm_assigned_dev_kernel *assigned_dev = (struct kvm_assigned_dev_kernel *) dev_id; - kvm_get_kvm(assigned_dev->kvm); - schedule_work(&assigned_dev->interrupt_work); disable_irq_nosync(irq); @@ -213,6 +210,7 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) } } +/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ static void kvm_free_assigned_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev) { @@ -228,11 +226,24 @@ static void kvm_free_assigned_irq(struct kvm *kvm, if (!assigned_dev->irq_requested_type) return; - if (cancel_work_sync(&assigned_dev->interrupt_work)) - /* We had pending work. That means we will have to take - * care of kvm_put_kvm. - */ - kvm_put_kvm(kvm); + /* + * In kvm_free_device_irq, cancel_work_sync return true if: + * 1. work is scheduled, and then cancelled. + * 2. work callback is executed. + * + * The first one ensured that the irq is disabled and no more events + * would happen. But for the second one, the irq may be enabled (e.g. + * for MSI). So we disable irq here to prevent further events. + * + * Notice this maybe result in nested disable if the interrupt type is + * INTx, but it's OK for we are going to free it. + * + * If this function is a part of VM destroy, please ensure that till + * now, the kvm state is still legal for probably we also have to wait + * interrupt_work done. + */ + disable_irq_nosync(assigned_dev->host_irq); + cancel_work_sync(&assigned_dev->interrupt_work); free_irq(assigned_dev->host_irq, (void *)assigned_dev); From d2a8284e8fca9e2a938bee6cd074064d23864886 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 30 Dec 2008 15:55:05 -0200 Subject: [PATCH 046/143] KVM: PIT: fix i8254 pending count read count_load_time assignment is bogus: its supposed to contain what it means, not the expiration time. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index e665d1c623ca..72bd275a9b5c 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -207,7 +207,7 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) hrtimer_add_expires_ns(&pt->timer, pt->period); pt->scheduled = hrtimer_get_expires_ns(&pt->timer); if (pt->period) - ps->channels[0].count_load_time = hrtimer_get_expires(&pt->timer); + ps->channels[0].count_load_time = ktime_get(); return (pt->period == 0 ? 0 : 1); } From abe6655dd699069b53bcccbc65b2717f60203b12 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 10 Feb 2009 20:59:45 -0200 Subject: [PATCH 047/143] KVM: x86: disable kvmclock on non constant TSC hosts This is better. Currently, this code path is posing us big troubles, and we won't have a decent patch in time. So, temporarily disable it. Signed-off-by: Glauber Costa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fc3e329f6ade..758b7a155ae9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -967,7 +967,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: - case KVM_CAP_CLOCKSOURCE: case KVM_CAP_PIT: case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: @@ -992,6 +991,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IOMMU: r = iommu_found(); break; + case KVM_CAP_CLOCKSOURCE: + r = boot_cpu_has(X86_FEATURE_CONSTANT_TSC); + break; default: r = 0; break; From 2aaf69dcee864f4fb6402638dd2f263324ac839f Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 21 Jan 2009 16:52:16 +0800 Subject: [PATCH 048/143] KVM: MMU: Map device MMIO as UC in EPT Software are not allow to access device MMIO using cacheable memory type, the patch limit MMIO region with UC and WC(guest can select WC using PAT and PCD/PWT). Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 +++++++-- arch/x86/kvm/vmx.c | 3 +-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 83f11c7474a1..2d4477c71473 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1698,8 +1698,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, if (largepage) spte |= PT_PAGE_SIZE_MASK; if (mt_mask) { - mt_mask = get_memory_type(vcpu, gfn) << - kvm_x86_ops->get_mt_mask_shift(); + if (!kvm_is_mmio_pfn(pfn)) { + mt_mask = get_memory_type(vcpu, gfn) << + kvm_x86_ops->get_mt_mask_shift(); + mt_mask |= VMX_EPT_IGMT_BIT; + } else + mt_mask = MTRR_TYPE_UNCACHABLE << + kvm_x86_ops->get_mt_mask_shift(); spte |= mt_mask; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6259d7467648..07491c9c6ed0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3687,8 +3687,7 @@ static int __init vmx_init(void) if (vm_need_ept()) { bypass_guest_pf = 0; kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | - VMX_EPT_WRITABLE_MASK | - VMX_EPT_IGMT_BIT); + VMX_EPT_WRITABLE_MASK); kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK, VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); From d7cff1c37664971aae32bb0de82231ed34933d8e Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 6 Jan 2009 16:25:10 +0800 Subject: [PATCH 049/143] KVM: Fix INTx for device assignment Missing buckets and wrong parameter for free_irq() Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 277ea7f39fc8..d9bbb20f230f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -296,8 +296,8 @@ static int assigned_device_update_intx(struct kvm *kvm, if (irqchip_in_kernel(kvm)) { if (!msi2intx && - adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) { - free_irq(adev->host_irq, (void *)kvm); + (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)) { + free_irq(adev->host_irq, (void *)adev); pci_disable_msi(adev->dev); } From b682b814e3cc340f905c14dff87ce8bdba7c5eba Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 10 Feb 2009 20:41:41 -0200 Subject: [PATCH 050/143] KVM: x86: fix LAPIC pending count calculation Simplify LAPIC TMCCT calculation by using hrtimer provided function to query remaining time until expiration. Fixes host hang with nested ESX. Signed-off-by: Marcelo Tosatti Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/irq.c | 7 ----- arch/x86/kvm/irq.h | 1 - arch/x86/kvm/lapic.c | 64 ++++++++++---------------------------------- arch/x86/kvm/lapic.h | 2 -- arch/x86/kvm/svm.c | 1 - arch/x86/kvm/vmx.c | 1 - 6 files changed, 14 insertions(+), 62 deletions(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index c019b8edcdb7..cf17ed52f6fb 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -87,13 +87,6 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); -void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) -{ - kvm_apic_timer_intr_post(vcpu, vec); - /* TODO: PIT, RTC etc. */ -} -EXPORT_SYMBOL_GPL(kvm_timer_intr_post); - void __kvm_migrate_timers(struct kvm_vcpu *vcpu) { __kvm_migrate_apic_timer(vcpu); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 2bf32a03ceec..82579ee538d0 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -89,7 +89,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s); -void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index afac68c0815c..f0b67f2cdd69 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -35,6 +35,12 @@ #include "kvm_cache_regs.h" #include "irq.h" +#ifndef CONFIG_X86_64 +#define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) +#else +#define mod_64(x, y) ((x) % (y)) +#endif + #define PRId64 "d" #define PRIx64 "llx" #define PRIu64 "u" @@ -511,52 +517,22 @@ static void apic_send_ipi(struct kvm_lapic *apic) static u32 apic_get_tmcct(struct kvm_lapic *apic) { - u64 counter_passed; - ktime_t passed, now; + ktime_t remaining; + s64 ns; u32 tmcct; ASSERT(apic != NULL); - now = apic->timer.dev.base->get_time(); - tmcct = apic_get_reg(apic, APIC_TMICT); - /* if initial count is 0, current count should also be 0 */ - if (tmcct == 0) + if (apic_get_reg(apic, APIC_TMICT) == 0) return 0; - if (unlikely(ktime_to_ns(now) <= - ktime_to_ns(apic->timer.last_update))) { - /* Wrap around */ - passed = ktime_add(( { - (ktime_t) { - .tv64 = KTIME_MAX - - (apic->timer.last_update).tv64}; } - ), now); - apic_debug("time elapsed\n"); - } else - passed = ktime_sub(now, apic->timer.last_update); + remaining = hrtimer_expires_remaining(&apic->timer.dev); + if (ktime_to_ns(remaining) < 0) + remaining = ktime_set(0, 0); - counter_passed = div64_u64(ktime_to_ns(passed), - (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); - - if (counter_passed > tmcct) { - if (unlikely(!apic_lvtt_period(apic))) { - /* one-shot timers stick at 0 until reset */ - tmcct = 0; - } else { - /* - * periodic timers reset to APIC_TMICT when they - * hit 0. The while loop simulates this happening N - * times. (counter_passed %= tmcct) would also work, - * but might be slower or not work on 32-bit?? - */ - while (counter_passed > tmcct) - counter_passed -= tmcct; - tmcct -= counter_passed; - } - } else { - tmcct -= counter_passed; - } + ns = mod_64(ktime_to_ns(remaining), apic->timer.period); + tmcct = div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); return tmcct; } @@ -653,8 +629,6 @@ static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now = apic->timer.dev.base->get_time(); - apic->timer.last_update = now; - apic->timer.period = apic_get_reg(apic, APIC_TMICT) * APIC_BUS_CYCLE_NS * apic->timer.divide_count; atomic_set(&apic->timer.pending, 0); @@ -1110,16 +1084,6 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) } } -void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec) - apic->timer.last_update = ktime_add_ns( - apic->timer.last_update, - apic->timer.period); -} - int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) { int vector = kvm_apic_has_interrupt(vcpu); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 81858881287e..45ab6ee71209 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -12,7 +12,6 @@ struct kvm_lapic { atomic_t pending; s64 period; /* unit: ns */ u32 divide_count; - ktime_t last_update; struct hrtimer dev; } timer; struct kvm_vcpu *vcpu; @@ -42,7 +41,6 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); int kvm_lapic_enabled(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); -void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1452851ae258..a9e769e4e251 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1600,7 +1600,6 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) /* Okay, we can deliver the interrupt: grab it and update PIC state. */ intr_vector = kvm_cpu_get_interrupt(vcpu); svm_inject_irq(svm, intr_vector); - kvm_timer_intr_post(vcpu, intr_vector); out: update_cr8_intercept(vcpu); } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 07491c9c6ed0..b1fe1422afb1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3285,7 +3285,6 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) } if (vcpu->arch.interrupt.pending) { vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); - kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); if (kvm_cpu_has_interrupt(vcpu)) enable_irq_window(vcpu); } From 682edb4c01e690c7c7cd772dbd6f4e0fd74dc572 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Thu, 5 Feb 2009 18:23:46 +0000 Subject: [PATCH 051/143] KVM: Fix assigned devices circular locking dependency kvm->slots_lock is outer to kvm->lock, so take slots_lock in kvm_vm_ioctl_assign_device() before taking kvm->lock, rather than taking it in kvm_iommu_map_memslots(). Cc: stable@kernel.org Signed-off-by: Mark McLoughlin Acked-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- virt/kvm/iommu.c | 6 ++---- virt/kvm/kvm_main.c | 3 +++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index e9693a29d00e..4c4037503600 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -73,14 +73,13 @@ static int kvm_iommu_map_memslots(struct kvm *kvm) { int i, r = 0; - down_read(&kvm->slots_lock); for (i = 0; i < kvm->nmemslots; i++) { r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn, kvm->memslots[i].npages); if (r) break; } - up_read(&kvm->slots_lock); + return r; } @@ -190,12 +189,11 @@ static void kvm_iommu_put_pages(struct kvm *kvm, static int kvm_iommu_unmap_memslots(struct kvm *kvm) { int i; - down_read(&kvm->slots_lock); + for (i = 0; i < kvm->nmemslots; i++) { kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn, kvm->memslots[i].npages); } - up_read(&kvm->slots_lock); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d9bbb20f230f..29a667ce35b0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -466,6 +466,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, struct kvm_assigned_dev_kernel *match; struct pci_dev *dev; + down_read(&kvm->slots_lock); mutex_lock(&kvm->lock); match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, @@ -527,6 +528,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, out: mutex_unlock(&kvm->lock); + up_read(&kvm->slots_lock); return r; out_list_del: list_del(&match->list); @@ -538,6 +540,7 @@ out_put: out_free: kfree(match); mutex_unlock(&kvm->lock); + up_read(&kvm->slots_lock); return r; } #endif From 516a1a7e9dc80358030fe01aabb3bedf882db9e2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 15 Feb 2009 02:32:07 +0200 Subject: [PATCH 052/143] KVM: VMX: Flush volatile msrs before emulating rdmsr Some msrs (notable MSR_KERNEL_GS_BASE) are held in the processor registers and need to be flushed to the vcpu struture before they can be read. This fixes cygwin longjmp() failure on Windows x64. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b1fe1422afb1..7611af576829 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -903,6 +903,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) data = vmcs_readl(GUEST_SYSENTER_ESP); break; default: + vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { data = msr->data; From be716615fe596ee117292dc615e95f707fb67fd1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jan 2009 23:36:34 +0100 Subject: [PATCH 053/143] x86, vm86: fix preemption bug Commit 3d2a71a596bd9c761c8487a2178e95f8a61da083 ("x86, traps: converge do_debug handlers") changed the preemption disable logic of do_debug() so vm86_handle_trap() is called with preemption disabled resulting in: BUG: sleeping function called from invalid context at include/linux/kernel.h:155 in_atomic(): 1, irqs_disabled(): 0, pid: 3005, name: dosemu.bin Pid: 3005, comm: dosemu.bin Tainted: G W 2.6.29-rc1 #51 Call Trace: [] copy_to_user+0x33/0x108 [] save_v86_state+0x65/0x149 [] handle_vm86_trap+0x20/0x8f [] do_debug+0x15b/0x1a4 [] debug_stack_correct+0x27/0x2c [] sysenter_do_call+0x12/0x2f BUG: scheduling while atomic: dosemu.bin/3005/0x10000001 Restore the original calling convention and reenable preemption before calling handle_vm86_trap(). Reported-by: Michal Suchanek Cc: stable@kernel.org Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7932338d7cb3..a9e7548e1790 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -99,6 +99,12 @@ static inline void preempt_conditional_sti(struct pt_regs *regs) local_irq_enable(); } +static inline void conditional_cli(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); +} + static inline void preempt_conditional_cli(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) @@ -626,8 +632,10 @@ clear_dr7: #ifdef CONFIG_X86_32 debug_vm86: + /* reenable preemption: handle_vm86_trap() might sleep */ + dec_preempt_count(); handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - preempt_conditional_cli(regs); + conditional_cli(regs); return; #endif From d21d52d4a155e36d4dc93d642cd52cb63f7ef91b Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Thu, 5 Feb 2009 16:13:32 +0800 Subject: [PATCH 054/143] kbuild,setlocalversion: shorten the make time when using svn Don't bother doing `svn st` as it takes a retarded amount of time when the source is cold Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu Signed-off-by: Sam Ravnborg --- scripts/setlocalversion | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/scripts/setlocalversion b/scripts/setlocalversion index f6946cf99ce1..f1c4b35bc324 100755 --- a/scripts/setlocalversion +++ b/scripts/setlocalversion @@ -58,14 +58,7 @@ fi # Check for svn and a svn repo. if rev=`svn info 2>/dev/null | grep '^Last Changed Rev'`; then rev=`echo $rev | awk '{print $NF}'` - changes=`svn status 2>/dev/null | grep '^[AMD]' | wc -l` - - # Are there uncommitted changes? - if [ $changes != 0 ]; then - printf -- '-svn%s%s' "$rev" -dirty - else - printf -- '-svn%s' "$rev" - fi + printf -- '-svn%s' "$rev" # All done with svn exit From fc370ecfdb37b853bd8e2118c7ad9f99fa9ac5cd Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Wed, 11 Feb 2009 21:10:57 -0800 Subject: [PATCH 055/143] kbuild: add vmlinux to kernel rpm We are building an automated system to test kernels weekly and need to provide an rpm to our QA dept. We would like to use the ability to create kernel rpms already in the kernel's Makefile, but need the vmlinux file included in the rpm for later debugging. This patch adds a compressed vmlinux to the kernel rpm when doing a make rpm-pkg or binrpm-pkg and upon install places the vmlinux file in /boot. Signed-off-by: Josh Hunt Signed-off-by: Sam Ravnborg --- scripts/package/mkspec | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/package/mkspec b/scripts/package/mkspec index 2500886fb90a..ee448cdc6a2b 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -86,6 +86,14 @@ echo "%endif" echo 'cp System.map $RPM_BUILD_ROOT'"/boot/System.map-$KERNELRELEASE" echo 'cp .config $RPM_BUILD_ROOT'"/boot/config-$KERNELRELEASE" + +echo "%ifnarch ppc64" +echo 'cp vmlinux vmlinux.orig' +echo 'bzip2 -9 vmlinux' +echo 'mv vmlinux.bz2 $RPM_BUILD_ROOT'"/boot/vmlinux-$KERNELRELEASE.bz2" +echo 'mv vmlinux.orig vmlinux' +echo "%endif" + echo "" echo "%clean" echo '#echo -rf $RPM_BUILD_ROOT' From 0bb98e231803860e978c302b9faccaf776881137 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Sun, 15 Feb 2009 10:20:30 +0100 Subject: [PATCH 056/143] bootgraph: fix for use with dot symbols powerpc has dot symbols, so the dmesg output looks like: <4>[ 0.327310] calling .migration_init+0x0/0x9c @ 1 <4>[ 0.327595] initcall .migration_init+0x0/0x9c returned 1 after 0 usecs The below fixes bootgraph.pl so it handles this correctly. Signed-off-by: Michael Neuling Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Sam Ravnborg --- scripts/bootgraph.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/bootgraph.pl b/scripts/bootgraph.pl index b0246307aac4..12caa822a232 100644 --- a/scripts/bootgraph.pl +++ b/scripts/bootgraph.pl @@ -51,7 +51,7 @@ my %pidctr; while (<>) { my $line = $_; - if ($line =~ /([0-9\.]+)\] calling ([a-zA-Z0-9\_]+)\+/) { + if ($line =~ /([0-9\.]+)\] calling ([a-zA-Z0-9\_\.]+)\+/) { my $func = $2; if ($done == 0) { $start{$func} = $1; @@ -87,7 +87,7 @@ while (<>) { $count = $count + 1; } - if ($line =~ /([0-9\.]+)\] initcall ([a-zA-Z0-9\_]+)\+.*returned/) { + if ($line =~ /([0-9\.]+)\] initcall ([a-zA-Z0-9\_\.]+)\+.*returned/) { if ($done == 0) { $end{$2} = $1; $maxtime = $1; From 953fae66d124486c9e284806429c52c5402f59ac Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 11 Feb 2009 13:24:09 -0800 Subject: [PATCH 057/143] kbuild: fix tags generation of config symbols commit 4f628248a578585472e19e4cba2c604643af8c6c aka "kbuild: reintroduce ALLSOURCE_ARCHS support for tags/cscope" breaks tags generation for Kconfig symbols. Steps to reproduce: make tags vi -t PROC_FS It should jump to 'config PROC_FS' line. Signed-off-by: Alexey Dobriyan Tested-by: Pete Wyckoff Signed-off-by: Andrew Morton Signed-off-by: Sam Ravnborg --- scripts/tags.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/tags.sh b/scripts/tags.sh index fdbe78bb5e2b..7e21e9146118 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -76,7 +76,10 @@ all_sources() all_kconfigs() { - find_sources $ALLSOURCE_ARCHS 'Kconfig*' + for arch in $ALLSOURCE_ARCHS; do + find_sources $arch 'Kconfig*' + done + find_other_sources 'Kconfig*' } all_defconfigs() From 5123b327c107db9e560fd62d50c27a3816e5a078 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Sun, 25 Jan 2009 18:39:12 +0530 Subject: [PATCH 058/143] kbuild: add sys_* entries for syscalls in tags Currently, it is no longer possible to use the tags file to jump to system call function definitions with sys_foo, because the definitions are obscured by use of the SYSCALL_DEFINE* macros. This patch adds the appropriate option to ctags to make it see through the macro. Also, it adds the ENTRY() work already done for Exuberant to Emacs too. Signed-off-by: Rabin Vincent Signed-off-by: Sam Ravnborg --- scripts/tags.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/tags.sh b/scripts/tags.sh index 7e21e9146118..5bd8b1003d44 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -102,7 +102,8 @@ exuberant() -I ____cacheline_internodealigned_in_smp \ -I EXPORT_SYMBOL,EXPORT_SYMBOL_GPL \ --extra=+f --c-kinds=+px \ - --regex-asm='/^ENTRY\(([^)]*)\).*/\1/' + --regex-asm='/^ENTRY\(([^)]*)\).*/\1/' \ + --regex-c='/^SYSCALL_DEFINE[[:digit:]]?\(([^,)]*).*/sys_\1/' all_kconfigs | xargs $1 -a \ --langdef=kconfig --language-force=kconfig \ @@ -120,7 +121,9 @@ exuberant() emacs() { - all_sources | xargs $1 -a + all_sources | xargs $1 -a \ + --regex='/^ENTRY(\([^)]*\)).*/\1/' \ + --regex='/^SYSCALL_DEFINE[0-9]?(\([^,)]*\).*/sys_\1/' all_kconfigs | xargs $1 -a \ --regex='/^[ \t]*\(\(menu\)*config\)[ \t]+\([a-zA-Z0-9_]+\)/\3/' From c19ef7fd8e534c870166213e9e30de9c44b34a76 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 15 Feb 2009 11:30:52 +0100 Subject: [PATCH 059/143] scripts: add x86 register parser to markup_oops.pl An oops dump also contains the register values. This patch parses these for (32 bit) x86, and then annotates the disassembly with these values; this helps in analysis of the oops by the developer, for example, NULL pointer or other pointer bugs show up clearly this way. Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Sam Ravnborg --- scripts/markup_oops.pl | 106 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 100 insertions(+), 6 deletions(-) diff --git a/scripts/markup_oops.pl b/scripts/markup_oops.pl index d40449cafa84..bb2046891c90 100644 --- a/scripts/markup_oops.pl +++ b/scripts/markup_oops.pl @@ -32,6 +32,78 @@ my $module = ""; my $func_offset; my $vmaoffset = 0; +my %regs; + + +sub parse_x86_regs +{ + my ($line) = @_; + if ($line =~ /EAX: ([0-9a-f]+) EBX: ([0-9a-f]+) ECX: ([0-9a-f]+) EDX: ([0-9a-f]+)/) { + $regs{"%eax"} = $1; + $regs{"%ebx"} = $2; + $regs{"%ecx"} = $3; + $regs{"%edx"} = $4; + } + if ($line =~ /ESI: ([0-9a-f]+) EDI: ([0-9a-f]+) EBP: ([0-9a-f]+) ESP: ([0-9a-f]+)/) { + $regs{"%esi"} = $1; + $regs{"%edi"} = $2; + $regs{"%esp"} = $4; + } +} + +sub process_x86_regs +{ + my ($line, $cntr) = @_; + my $str = ""; + if (length($line) < 40) { + return ""; # not an asm istruction + } + + # find the arguments to the instruction + if ($line =~ /([0-9a-zA-Z\,\%\(\)\-\+]+)$/) { + $lastword = $1; + } else { + return ""; + } + + # we need to find the registers that get clobbered, + # since their value is no longer relevant for previous + # instructions in the stream. + + $clobber = $lastword; + # first, remove all memory operands, they're read only + $clobber =~ s/\([a-z0-9\%\,]+\)//g; + # then, remove everything before the comma, thats the read part + $clobber =~ s/.*\,//g; + + # if this is the instruction that faulted, we haven't actually done + # the write yet... nothing is clobbered. + if ($cntr == 0) { + $clobber = ""; + } + + foreach $reg (keys(%regs)) { + my $val = $regs{$reg}; + # first check if we're clobbering this register; if we do + # we print it with a =>, and then delete its value + if ($clobber =~ /$reg/) { + if (length($val) > 0) { + $str = $str . " $reg => $val "; + } + $regs{$reg} = ""; + $val = ""; + } + # now check if we're reading this register + if ($lastword =~ /$reg/) { + if (length($val) > 0) { + $str = $str . " $reg = $val "; + } + } + } + return $str; +} + +# parse the oops while () { my $line = $_; if ($line =~ /EIP: 0060:\[\<([a-z0-9]+)\>\]/) { @@ -46,10 +118,11 @@ while () { if ($line =~ /EIP is at ([a-zA-Z0-9\_]+)\+(0x[0-9a-f]+)\/0x[a-f0-9]+\W\[([a-zA-Z0-9\_\-]+)\]/) { $module = $3; } + parse_x86_regs($line); } my $decodestart = hex($target) - hex($func_offset); -my $decodestop = $decodestart + 8192; +my $decodestop = hex($target) + 8192; if ($target eq "0") { print "No oops found!\n"; print "Usage: \n"; @@ -84,6 +157,7 @@ my $counter = 0; my $state = 0; my $center = 0; my @lines; +my @reglines; sub InRange { my ($address, $target) = @_; @@ -188,16 +262,36 @@ while ($finish < $counter) { my $i; -my $fulltext = ""; + +# start annotating the registers in the asm. +# this goes from the oopsing point back, so that the annotator +# can track (opportunistically) which registers got written and +# whos value no longer is relevant. + +$i = $center; +while ($i >= $start) { + $reglines[$i] = process_x86_regs($lines[$i], $center - $i); + $i = $i - 1; +} + $i = $start; while ($i < $finish) { + my $line; if ($i == $center) { - $fulltext = $fulltext . "*$lines[$i] <----- faulting instruction\n"; + $line = "*$lines[$i] "; } else { - $fulltext = $fulltext . " $lines[$i]\n"; + $line = " $lines[$i] "; } + print $line; + if (defined($reglines[$i]) && length($reglines[$i]) > 0) { + my $c = 60 - length($line); + while ($c > 0) { print " "; $c = $c - 1; }; + print "| $reglines[$i]"; + } + if ($i == $center) { + print "<--- faulting instruction"; + } + print "\n"; $i = $i +1; } -print $fulltext; - From 11df65c3c6f7fdc837a5be8787d31011e8bb93c1 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 15 Feb 2009 11:30:55 +0100 Subject: [PATCH 060/143] scripts: add x86 64 bit support to the markup_oops.pl script Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Sam Ravnborg --- scripts/markup_oops.pl | 59 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/scripts/markup_oops.pl b/scripts/markup_oops.pl index bb2046891c90..528492bcba5b 100644 --- a/scripts/markup_oops.pl +++ b/scripts/markup_oops.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl use File::Basename; @@ -29,7 +29,7 @@ my $filename = $vmlinux_name; my $target = "0"; my $function; my $module = ""; -my $func_offset; +my $func_offset = 0; my $vmaoffset = 0; my %regs; @@ -49,6 +49,39 @@ sub parse_x86_regs $regs{"%edi"} = $2; $regs{"%esp"} = $4; } + if ($line =~ /RAX: ([0-9a-f]+) RBX: ([0-9a-f]+) RCX: ([0-9a-f]+)/) { + $regs{"%eax"} = $1; + $regs{"%ebx"} = $2; + $regs{"%ecx"} = $3; + } + if ($line =~ /RDX: ([0-9a-f]+) RSI: ([0-9a-f]+) RDI: ([0-9a-f]+)/) { + $regs{"%edx"} = $1; + $regs{"%esi"} = $2; + $regs{"%edi"} = $3; + } + if ($line =~ /RBP: ([0-9a-f]+) R08: ([0-9a-f]+) R09: ([0-9a-f]+)/) { + $regs{"%r08"} = $2; + $regs{"%r09"} = $3; + } + if ($line =~ /R10: ([0-9a-f]+) R11: ([0-9a-f]+) R12: ([0-9a-f]+)/) { + $regs{"%r10"} = $1; + $regs{"%r11"} = $2; + $regs{"%r12"} = $3; + } + if ($line =~ /R13: ([0-9a-f]+) R14: ([0-9a-f]+) R15: ([0-9a-f]+)/) { + $regs{"%r13"} = $1; + $regs{"%r14"} = $2; + $regs{"%r15"} = $3; + } +} + +sub reg_name +{ + my ($reg) = @_; + $reg =~ s/r(.)x/e\1x/; + $reg =~ s/r(.)i/e\1i/; + $reg =~ s/r(.)p/e\1p/; + return $reg; } sub process_x86_regs @@ -83,10 +116,18 @@ sub process_x86_regs } foreach $reg (keys(%regs)) { + my $clobberprime = reg_name($clobber); + my $lastwordprime = reg_name($lastword); my $val = $regs{$reg}; + if ($val =~ /^[0]+$/) { + $val = "0"; + } else { + $val =~ s/^0*//; + } + # first check if we're clobbering this register; if we do # we print it with a =>, and then delete its value - if ($clobber =~ /$reg/) { + if ($clobber =~ /$reg/ || $clobberprime =~ /$reg/) { if (length($val) > 0) { $str = $str . " $reg => $val "; } @@ -94,7 +135,7 @@ sub process_x86_regs $val = ""; } # now check if we're reading this register - if ($lastword =~ /$reg/) { + if ($lastword =~ /$reg/ || $lastwordprime =~ /$reg/) { if (length($val) > 0) { $str = $str . " $reg = $val "; } @@ -109,15 +150,25 @@ while () { if ($line =~ /EIP: 0060:\[\<([a-z0-9]+)\>\]/) { $target = $1; } + if ($line =~ /RIP: 0010:\[\<([a-z0-9]+)\>\]/) { + $target = $1; + } if ($line =~ /EIP is at ([a-zA-Z0-9\_]+)\+(0x[0-9a-f]+)\/0x[a-f0-9]/) { $function = $1; $func_offset = $2; } + if ($line =~ /RIP: 0010:\[\<[0-9a-f]+\>\] \[\<[0-9a-f]+\>\] ([a-zA-Z0-9\_]+)\+(0x[0-9a-f]+)\/0x[a-f0-9]/) { + $function = $1; + $func_offset = $2; + } # check if it's a module if ($line =~ /EIP is at ([a-zA-Z0-9\_]+)\+(0x[0-9a-f]+)\/0x[a-f0-9]+\W\[([a-zA-Z0-9\_\-]+)\]/) { $module = $3; } + if ($line =~ /RIP: 0010:\[\<[0-9a-f]+\>\] \[\<[0-9a-f]+\>\] ([a-zA-Z0-9\_]+)\+(0x[0-9a-f]+)\/0x[a-f0-9]+\W\[([a-zA-Z0-9\_\-]+)\]/) { + $module = $3; + } parse_x86_regs($line); } From 929799973ba4a40f7b8001e9cc461c13d04c4124 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 10 Jan 2009 04:56:13 +0100 Subject: [PATCH 061/143] kbuild: create the source symlink earlier in the objdir It's useful to already have the source symlink in a objdir when one just runs make *config. Then one can do mkdir obj-allyes cd obj-allyes make -C ../sourcedir O=$(pwd) allyesconfig ./source/scripts/config --disable debug_info make CC=icecc -j18 without having to interrupt the make first just to get the source symlink. Signed-off-by: Andi Kleen [sam: deleted the other source symlink statement] Signed-off-by: Sam Ravnborg --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 22d758495ad2..59cf6dacec07 100644 --- a/Makefile +++ b/Makefile @@ -389,6 +389,7 @@ PHONY += outputmakefile # output directory. outputmakefile: ifneq ($(KBUILD_SRC),) + $(Q)ln -fsn $(srctree) source $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkmakefile \ $(srctree) $(objtree) $(VERSION) $(PATCHLEVEL) endif @@ -946,7 +947,6 @@ ifneq ($(KBUILD_SRC),) mkdir -p include2; \ ln -fsn $(srctree)/include/asm-$(SRCARCH) include2/asm; \ fi - ln -fsn $(srctree) source endif # prepare2 creates a makefile if using a separate output directory From 090542641de833c6f756895fc2f139f046e298f9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sun, 15 Feb 2009 20:02:19 -0500 Subject: [PATCH 062/143] ext4: Fix NULL dereference in ext4_ext_migrate()'s error handling This was found through a code checker (http://repo.or.cz/w/smatch.git/). It looks like you might be able to trigger the error by trying to migrate a readonly file system. Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" --- fs/ext4/migrate.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 734abca25e35..fe64d9f79852 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -481,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode) + 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); - goto err_out; + return retval; } tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, @@ -489,8 +489,7 @@ int ext4_ext_migrate(struct inode *inode) if (IS_ERR(tmp_inode)) { retval = -ENOMEM; ext4_journal_stop(handle); - tmp_inode = NULL; - goto err_out; + return retval; } i_size_write(tmp_inode, i_size_read(inode)); /* @@ -618,8 +617,7 @@ err_out: ext4_journal_stop(handle); - if (tmp_inode) - iput(tmp_inode); + iput(tmp_inode); return retval; } From a0abd520fd69295f4a3735e29a9448a32e101d47 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 16 Feb 2009 17:31:58 -0600 Subject: [PATCH 063/143] cpumask: fix powernow-k8: partial revert of 2fdf66b491ac706657946442789ec644cc317e1a Impact: fix powernow-k8 when acpi=off (or other error). There was a spurious change introduced into powernow-k8 in this patch: so that we try to "restore" the cpus_allowed we never saved. We revert that file. See lkml "[PATCH] x86/powernow: fix cpus_allowed brokage when acpi=off" from Yinghai for the bug report. Cc: Mike Travis Cc: Yinghai Lu Signed-off-by: Rusty Russell Acked-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index fb039cd345d8..6428aa17b40e 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1157,8 +1157,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) data->cpu = pol->cpu; data->currpstate = HW_PSTATE_INVALID; - rc = powernow_k8_cpu_init_acpi(data); - if (rc) { + if (powernow_k8_cpu_init_acpi(data)) { /* * Use the PSB BIOS structure. This is only availabe on * an UP version, and is deprecated by AMD. @@ -1176,17 +1175,20 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) "ACPI maintainers and complain to your BIOS " "vendor.\n"); #endif - goto err_out; + kfree(data); + return -ENODEV; } if (pol->cpu != 0) { printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " "CPU other than CPU0. Complain to your BIOS " "vendor.\n"); - goto err_out; + kfree(data); + return -ENODEV; } rc = find_psb_table(data); if (rc) { - goto err_out; + kfree(data); + return -ENODEV; } /* Take a crude guess here. * That guess was in microseconds, so multiply with 1000 */ From 1371be0f7c8f6141b2dbfde6a7ae7885bedb9834 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 16 Feb 2009 17:31:59 -0600 Subject: [PATCH 064/143] cpumask: Use cpu_*_mask accessors code: alpha Impact: use new API, fix SMP bug. Use the new accessors rather than frobbing bits directly. This also removes the bug introduced in ee0c468b (alpha: compile fixes) which had Alpha setting bits on an on-stack cpumask, not the cpu_online_map. Cc: Richard Henderson Cc: FUJITA Tomonori Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Ivan Kokshaysky Acked-by: Ingo Molnar --- arch/alpha/kernel/process.c | 8 ++++---- arch/alpha/kernel/smp.c | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index f238370c907d..8d0097f10208 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -93,8 +93,8 @@ common_shutdown_1(void *generic_ptr) if (cpuid != boot_cpuid) { flags |= 0x00040000UL; /* "remain halted" */ *pflags = flags; - cpu_clear(cpuid, cpu_present_map); - cpu_clear(cpuid, cpu_possible_map); + set_cpu_present(cpuid, false); + set_cpu_possible(cpuid, false); halt(); } #endif @@ -120,8 +120,8 @@ common_shutdown_1(void *generic_ptr) #ifdef CONFIG_SMP /* Wait for the secondaries to halt. */ - cpu_clear(boot_cpuid, cpu_present_map); - cpu_clear(boot_cpuid, cpu_possible_map); + set_cpu_present(boot_cpuid, false); + set_cpu_possible(boot_cpuid, false); while (cpus_weight(cpu_present_map)) barrier(); #endif diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 00f1dc3dfd5f..b1fe5674c3a1 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -120,12 +120,12 @@ void __cpuinit smp_callin(void) { int cpuid = hard_smp_processor_id(); - cpumask_t mask = cpu_online_map; - if (cpu_test_and_set(cpuid, mask)) { + if (cpu_online(cpuid)) { printk("??, cpu 0x%x already present??\n", cpuid); BUG(); } + set_cpu_online(cpuid, true); /* Turn on machine checks. */ wrmces(7); @@ -436,8 +436,8 @@ setup_smp(void) ((char *)cpubase + i*hwrpb->processor_size); if ((cpu->flags & 0x1cc) == 0x1cc) { smp_num_probed++; - cpu_set(i, cpu_possible_map); - cpu_set(i, cpu_present_map); + set_cpu_possible(i, true); + set_cpu_present(i, true); cpu->pal_revision = boot_cpu_palrev; } @@ -470,8 +470,8 @@ smp_prepare_cpus(unsigned int max_cpus) /* Nothing to do on a UP box, or when told not to. */ if (smp_num_probed == 1 || max_cpus == 0) { - cpu_possible_map = cpumask_of_cpu(boot_cpuid); - cpu_present_map = cpumask_of_cpu(boot_cpuid); + init_cpu_possible(cpumask_of(boot_cpuid)); + init_cpu_present(cpumask_of(boot_cpuid)); printk(KERN_INFO "SMP mode deactivated.\n"); return; } From a3c1239eb59c0a907f8be5587d42e950f44543f8 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 16 Feb 2009 14:37:12 +0000 Subject: [PATCH 065/143] wusb: whci-hcd: always lock whc->lock with interrupts disabled Always lock whc->lock with spin_lock_irq() or spin_lock_irqsave(). Signed-off-by: David Vrabel --- drivers/usb/host/whci/asl.c | 4 ++-- drivers/usb/host/whci/pzl.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/usb/host/whci/asl.c b/drivers/usb/host/whci/asl.c index 2291c5f5af51..958751ccea43 100644 --- a/drivers/usb/host/whci/asl.c +++ b/drivers/usb/host/whci/asl.c @@ -227,13 +227,13 @@ void scan_async_work(struct work_struct *work) * Now that the ASL is updated, complete the removal of any * removed qsets. */ - spin_lock(&whc->lock); + spin_lock_irq(&whc->lock); list_for_each_entry_safe(qset, t, &whc->async_removed_list, list_node) { qset_remove_complete(whc, qset); } - spin_unlock(&whc->lock); + spin_unlock_irq(&whc->lock); } /** diff --git a/drivers/usb/host/whci/pzl.c b/drivers/usb/host/whci/pzl.c index 7dc85a0bee7c..df8b85f07092 100644 --- a/drivers/usb/host/whci/pzl.c +++ b/drivers/usb/host/whci/pzl.c @@ -255,13 +255,13 @@ void scan_periodic_work(struct work_struct *work) * Now that the PZL is updated, complete the removal of any * removed qsets. */ - spin_lock(&whc->lock); + spin_lock_irq(&whc->lock); list_for_each_entry_safe(qset, t, &whc->periodic_removed_list, list_node) { qset_remove_complete(whc, qset); } - spin_unlock(&whc->lock); + spin_unlock_irq(&whc->lock); } /** From d1b3525b4126d7acad0493b62642b80b71442661 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sun, 15 Feb 2009 23:24:24 +0400 Subject: [PATCH 066/143] libata-sff: fix 32-bit PIO ATAPI regression Commit 871af1210f13966ab911ed2166e4ab2ce775b99d (libata: Add 32bit PIO support) has caused all kinds of errors on the ATAPI devices, so it has been empirically proven that one shouldn't try to read/write an extra data word when a device is not expecting it already. "Don't do it then"; however, still use a chance to do 32-bit read/write one last time when there are exactly 3 trailing bytes. Oh, and stop pointlessly swapping the bytes to and fro on big-endian machines by using io*_rep() accessors which shouldn't byte-swap. This patch should fix the kernel.org bug #12609. Signed-off-by: Sergei Shtylyov Signed-off-by: Jeff Garzik --- drivers/ata/libata-sff.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index 0b299b0f8172..714cb046b594 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -773,18 +773,32 @@ unsigned int ata_sff_data_xfer32(struct ata_device *dev, unsigned char *buf, else iowrite32_rep(data_addr, buf, words); + /* Transfer trailing bytes, if any */ if (unlikely(slop)) { - __le32 pad; + unsigned char pad[4]; + + /* Point buf to the tail of buffer */ + buf += buflen - slop; + + /* + * Use io*_rep() accessors here as well to avoid pointlessly + * swapping bytes to and fro on the big endian machines... + */ if (rw == READ) { - pad = cpu_to_le32(ioread32(ap->ioaddr.data_addr)); - memcpy(buf + buflen - slop, &pad, slop); + if (slop < 3) + ioread16_rep(data_addr, pad, 1); + else + ioread32_rep(data_addr, pad, 1); + memcpy(buf, pad, slop); } else { - memcpy(&pad, buf + buflen - slop, slop); - iowrite32(le32_to_cpu(pad), ap->ioaddr.data_addr); + memcpy(pad, buf, slop); + if (slop < 3) + iowrite16_rep(data_addr, pad, 1); + else + iowrite32_rep(data_addr, pad, 1); } - words++; } - return words << 2; + return (buflen + 1) & ~1; } EXPORT_SYMBOL_GPL(ata_sff_data_xfer32); From 7dac745b8e367c99175b8f0d014d996f0e5ed9e5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 12 Feb 2009 10:34:32 +0900 Subject: [PATCH 067/143] sata_nv: give up hardreset on nf2 Kernel bz#12176 reports that nf2 hardreset simply doesn't work. Give up. Argh... Signed-off-by: Tejun Heo Cc: Robert Hancock Reported-by: Saro Signed-off-by: Jeff Garzik --- drivers/ata/sata_nv.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c index 444af0415ca1..55a8eed3f3a3 100644 --- a/drivers/ata/sata_nv.c +++ b/drivers/ata/sata_nv.c @@ -421,19 +421,21 @@ static struct ata_port_operations nv_generic_ops = { .hardreset = ATA_OP_NULL, }; -/* OSDL bz3352 reports that nf2/3 controllers can't determine device - * signature reliably. Also, the following thread reports detection - * failure on cold boot with the standard debouncing timing. +/* nf2 is ripe with hardreset related problems. + * + * kernel bz#3352 reports nf2/3 controllers can't determine device + * signature reliably. The following thread reports detection failure + * on cold boot with the standard debouncing timing. * * http://thread.gmane.org/gmane.linux.ide/34098 * - * Debounce with hotplug timing and request follow-up SRST. + * And bz#12176 reports that hardreset simply doesn't work on nf2. + * Give up on it and just don't do hardreset. */ static struct ata_port_operations nv_nf2_ops = { - .inherits = &nv_common_ops, + .inherits = &nv_generic_ops, .freeze = nv_nf2_freeze, .thaw = nv_nf2_thaw, - .hardreset = nv_noclassify_hardreset, }; /* For initial probing after boot and hot plugging, hardreset mostly From 720fd66dfad1b0286721dbb2ed4d6076c0aa953b Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 4 Feb 2009 20:44:01 +0100 Subject: [PATCH 068/143] mfd: Fix egpio kzalloc return test Since ei is already known to be non-NULL, I assume that what was intended was to test the result of kzalloc. Signed-off-by: Julia Lawall Signed-off-by: Samuel Ortiz --- drivers/mfd/htc-egpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/htc-egpio.c b/drivers/mfd/htc-egpio.c index 1a4d04664d6d..194df7b8256c 100644 --- a/drivers/mfd/htc-egpio.c +++ b/drivers/mfd/htc-egpio.c @@ -307,7 +307,7 @@ static int __init egpio_probe(struct platform_device *pdev) ei->nchips = pdata->num_chips; ei->chip = kzalloc(sizeof(struct egpio_chip) * ei->nchips, GFP_KERNEL); - if (!ei) { + if (!ei->chip) { ret = -ENOMEM; goto fail; } From 62571c29a8343839e85e741db6a489f30686697c Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 4 Feb 2009 20:49:52 +0100 Subject: [PATCH 069/143] mfd: Initialise WM8350 interrupts earlier Ensure that the interrupt handling is configured before we do platform specific init. This allows the platform specific initialisation to configure things which use interrupts safely. Signed-off-by: Mark Brown Signed-off-by: Samuel Ortiz --- drivers/mfd/wm8350-core.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c index f92595c8f165..70f5e7739546 100644 --- a/drivers/mfd/wm8350-core.c +++ b/drivers/mfd/wm8350-core.c @@ -1404,15 +1404,6 @@ int wm8350_device_init(struct wm8350 *wm8350, int irq, return ret; } - if (pdata && pdata->init) { - ret = pdata->init(wm8350); - if (ret != 0) { - dev_err(wm8350->dev, "Platform init() failed: %d\n", - ret); - goto err; - } - } - mutex_init(&wm8350->auxadc_mutex); mutex_init(&wm8350->irq_mutex); INIT_WORK(&wm8350->irq_work, wm8350_irq_worker); @@ -1430,6 +1421,15 @@ int wm8350_device_init(struct wm8350 *wm8350, int irq, } wm8350->chip_irq = irq; + if (pdata && pdata->init) { + ret = pdata->init(wm8350); + if (ret != 0) { + dev_err(wm8350->dev, "Platform init() failed: %d\n", + ret); + goto err; + } + } + wm8350_reg_write(wm8350, WM8350_SYSTEM_INTERRUPTS_MASK, 0x0); wm8350_client_dev_register(wm8350, "wm8350-codec", From 85c93ea7dca475a6ee3bf414befe94b2c42f1001 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 4 Feb 2009 21:09:38 +0100 Subject: [PATCH 070/143] mfd: Improve diagnostics for WM8350 ID register probe Check the return value of the device I/O functions when reading the ID registers so we can provide a more useful diagnostic when we're having trouble talking to the device. Signed-off-by: Mark Brown Signed-off-by: Samuel Ortiz --- drivers/mfd/wm8350-core.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c index 70f5e7739546..e5e82c7cc523 100644 --- a/drivers/mfd/wm8350-core.c +++ b/drivers/mfd/wm8350-core.c @@ -1297,14 +1297,29 @@ static void wm8350_client_dev_register(struct wm8350 *wm8350, int wm8350_device_init(struct wm8350 *wm8350, int irq, struct wm8350_platform_data *pdata) { - int ret = -EINVAL; + int ret; u16 id1, id2, mask_rev; u16 cust_id, mode, chip_rev; /* get WM8350 revision and config mode */ - wm8350->read_dev(wm8350, WM8350_RESET_ID, sizeof(id1), &id1); - wm8350->read_dev(wm8350, WM8350_ID, sizeof(id2), &id2); - wm8350->read_dev(wm8350, WM8350_REVISION, sizeof(mask_rev), &mask_rev); + ret = wm8350->read_dev(wm8350, WM8350_RESET_ID, sizeof(id1), &id1); + if (ret != 0) { + dev_err(wm8350->dev, "Failed to read ID: %d\n", ret); + goto err; + } + + ret = wm8350->read_dev(wm8350, WM8350_ID, sizeof(id2), &id2); + if (ret != 0) { + dev_err(wm8350->dev, "Failed to read ID: %d\n", ret); + goto err; + } + + ret = wm8350->read_dev(wm8350, WM8350_REVISION, sizeof(mask_rev), + &mask_rev); + if (ret != 0) { + dev_err(wm8350->dev, "Failed to read revision: %d\n", ret); + goto err; + } id1 = be16_to_cpu(id1); id2 = be16_to_cpu(id2); From a39a021fd73ce06aad8d1081ac711a36930e6cb8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 4 Feb 2009 21:10:58 +0100 Subject: [PATCH 071/143] mfd: Mark WM835x USB_SLV_500MA bit as accessible The code is out of sync with the silicon. Signed-off-by: Mark Brown Signed-off-by: Samuel Ortiz --- drivers/mfd/wm8350-regmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/wm8350-regmap.c b/drivers/mfd/wm8350-regmap.c index 68887b817d17..9a4cc954cb7c 100644 --- a/drivers/mfd/wm8350-regmap.c +++ b/drivers/mfd/wm8350-regmap.c @@ -3188,7 +3188,7 @@ const struct wm8350_reg_access wm8350_reg_io_map[] = { { 0x7CFF, 0x0C00, 0x7FFF }, /* R1 - ID */ { 0x0000, 0x0000, 0x0000 }, /* R2 */ { 0xBE3B, 0xBE3B, 0x8000 }, /* R3 - System Control 1 */ - { 0xFCF7, 0xFCF7, 0xF800 }, /* R4 - System Control 2 */ + { 0xFEF7, 0xFEF7, 0xF800 }, /* R4 - System Control 2 */ { 0x80FF, 0x80FF, 0x8000 }, /* R5 - System Hibernate */ { 0xFB0E, 0xFB0E, 0x0000 }, /* R6 - Interface Control */ { 0x0000, 0x0000, 0x0000 }, /* R7 */ From 29c6a2e6f88225ae2673aabd2de0fa2126653231 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 4 Feb 2009 21:23:22 +0100 Subject: [PATCH 072/143] mfd: wm8350 tries reaches -1 With a postfix decrement tries will reach -1 rather than 0, so the warning will not be issued even upon timeout. Signed-off-by: Roel Kluin Acked-by: Mark Brown Signed-off-by: Samuel Ortiz --- drivers/mfd/wm8350-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c index e5e82c7cc523..ea3801833176 100644 --- a/drivers/mfd/wm8350-core.c +++ b/drivers/mfd/wm8350-core.c @@ -1111,7 +1111,7 @@ int wm8350_read_auxadc(struct wm8350 *wm8350, int channel, int scale, int vref) do { schedule_timeout_interruptible(1); reg = wm8350_reg_read(wm8350, WM8350_DIGITISER_CONTROL_1); - } while (tries-- && (reg & WM8350_AUXADC_POLL)); + } while (--tries && (reg & WM8350_AUXADC_POLL)); if (!tries) dev_err(wm8350->dev, "adc chn %d read timeout\n", channel); From a313d758cc7956d7f1e7a727c8fa571b6468fabf Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 4 Feb 2009 21:26:07 +0100 Subject: [PATCH 073/143] mfd: Fix TWL4030 build on some ARM variants Many ARM platforms do not provide a mach/cpu.h so rather than guarding the use of that header with CONFIG_ARM guard it with the guards used when testing for the OMAP variants in the body of the code. Signed-off-by: Mark Brown Acked-by: David Brownell Signed-off-by: Samuel Ortiz --- drivers/mfd/twl4030-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/twl4030-core.c b/drivers/mfd/twl4030-core.c index e7ab0035d305..68826f1e36bc 100644 --- a/drivers/mfd/twl4030-core.c +++ b/drivers/mfd/twl4030-core.c @@ -38,7 +38,7 @@ #include #include -#ifdef CONFIG_ARM +#if defined(CONFIG_ARCH_OMAP2) || defined(CONFIG_ARCH_OMAP3) #include #endif From 9427c34bc72f05b519e8466f27c38a3327bae157 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Wed, 4 Feb 2009 21:27:48 +0100 Subject: [PATCH 074/143] mfd: fix htc-egpio iomem resource handling using resource_size Fixes an off-by-one error in the iomem resource mapping. Signed-off-by: Philipp Zabel Signed-off-by: Samuel Ortiz --- drivers/mfd/htc-egpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/htc-egpio.c b/drivers/mfd/htc-egpio.c index 194df7b8256c..aa266e1f69b2 100644 --- a/drivers/mfd/htc-egpio.c +++ b/drivers/mfd/htc-egpio.c @@ -286,7 +286,7 @@ static int __init egpio_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) goto fail; - ei->base_addr = ioremap_nocache(res->start, res->end - res->start); + ei->base_addr = ioremap_nocache(res->start, resource_size(res)); if (!ei->base_addr) goto fail; pr_debug("EGPIO phys=%08x virt=%p\n", (u32)res->start, ei->base_addr); From 2f161f4485535df85451a8cfdf2487c315f665f5 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 6 Feb 2009 15:28:15 +0100 Subject: [PATCH 075/143] mfd: Ensure all WM8350 IRQs are masked at startup The IRQs might have been left enabled in hardware, generating spurious IRQs before the drivers have registered. Signed-off-by: Mark Brown Signed-off-by: Samuel Ortiz --- drivers/mfd/wm8350-core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c index ea3801833176..84d5ea1ec171 100644 --- a/drivers/mfd/wm8350-core.c +++ b/drivers/mfd/wm8350-core.c @@ -1419,6 +1419,13 @@ int wm8350_device_init(struct wm8350 *wm8350, int irq, return ret; } + wm8350_reg_write(wm8350, WM8350_SYSTEM_INTERRUPTS_MASK, 0xFFFF); + wm8350_reg_write(wm8350, WM8350_INT_STATUS_1_MASK, 0xFFFF); + wm8350_reg_write(wm8350, WM8350_INT_STATUS_2_MASK, 0xFFFF); + wm8350_reg_write(wm8350, WM8350_UNDER_VOLTAGE_INT_STATUS_MASK, 0xFFFF); + wm8350_reg_write(wm8350, WM8350_GPIO_INT_STATUS_MASK, 0xFFFF); + wm8350_reg_write(wm8350, WM8350_COMPARATOR_INT_STATUS_MASK, 0xFFFF); + mutex_init(&wm8350->auxadc_mutex); mutex_init(&wm8350->irq_mutex); INIT_WORK(&wm8350->irq_work, wm8350_irq_worker); From 8915e5402809ae6228e15c76417351dad752826e Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 17 Feb 2009 09:07:02 +0100 Subject: [PATCH 076/143] mfd: terminate pcf50633 i2c_device_id list The i2c_device_id list is supposed to be zero-terminated. Signed-off-by: Jean Delvare Cc: Balaji Rao Signed-off-by: Samuel Ortiz Signed-off-by: Andrew Morton --- drivers/mfd/pcf50633-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mfd/pcf50633-core.c b/drivers/mfd/pcf50633-core.c index ea9488e7ad6d..2e36057659e1 100644 --- a/drivers/mfd/pcf50633-core.c +++ b/drivers/mfd/pcf50633-core.c @@ -678,6 +678,7 @@ static int __devexit pcf50633_remove(struct i2c_client *client) static struct i2c_device_id pcf50633_id_table[] = { {"pcf50633", 0x73}, + {/* end of list */} }; static struct i2c_driver pcf50633_driver = { From 158abca5f699a047ff7b67a64ab19e8ec824e37d Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 17 Feb 2009 09:10:19 +0100 Subject: [PATCH 077/143] mfd: fix sm501 section mismatches drv => driver renaming is needed otherwise modpost will spit false positives re pointing to __devinit function from regular data. Signed-off-by: Alexey Dobriyan Cc: Ben Dooks Signed-off-by: Andrew Morton Signed-off-by: Samuel Ortiz --- drivers/mfd/sm501.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/mfd/sm501.c b/drivers/mfd/sm501.c index 0e5761f12634..6c3786f8e9a8 100644 --- a/drivers/mfd/sm501.c +++ b/drivers/mfd/sm501.c @@ -1321,7 +1321,7 @@ static unsigned int sm501_mem_local[] = { * Common init code for an SM501 */ -static int sm501_init_dev(struct sm501_devdata *sm) +static int __devinit sm501_init_dev(struct sm501_devdata *sm) { struct sm501_initdata *idata; struct sm501_platdata *pdata; @@ -1397,7 +1397,7 @@ static int sm501_init_dev(struct sm501_devdata *sm) return 0; } -static int sm501_plat_probe(struct platform_device *dev) +static int __devinit sm501_plat_probe(struct platform_device *dev) { struct sm501_devdata *sm; int ret; @@ -1586,8 +1586,8 @@ static struct sm501_platdata sm501_pci_platdata = { .gpio_base = -1, }; -static int sm501_pci_probe(struct pci_dev *dev, - const struct pci_device_id *id) +static int __devinit sm501_pci_probe(struct pci_dev *dev, + const struct pci_device_id *id) { struct sm501_devdata *sm; int err; @@ -1693,7 +1693,7 @@ static void sm501_dev_remove(struct sm501_devdata *sm) sm501_gpio_remove(sm); } -static void sm501_pci_remove(struct pci_dev *dev) +static void __devexit sm501_pci_remove(struct pci_dev *dev) { struct sm501_devdata *sm = pci_get_drvdata(dev); @@ -1727,16 +1727,16 @@ static struct pci_device_id sm501_pci_tbl[] = { MODULE_DEVICE_TABLE(pci, sm501_pci_tbl); -static struct pci_driver sm501_pci_drv = { +static struct pci_driver sm501_pci_driver = { .name = "sm501", .id_table = sm501_pci_tbl, .probe = sm501_pci_probe, - .remove = sm501_pci_remove, + .remove = __devexit_p(sm501_pci_remove), }; MODULE_ALIAS("platform:sm501"); -static struct platform_driver sm501_plat_drv = { +static struct platform_driver sm501_plat_driver = { .driver = { .name = "sm501", .owner = THIS_MODULE, @@ -1749,14 +1749,14 @@ static struct platform_driver sm501_plat_drv = { static int __init sm501_base_init(void) { - platform_driver_register(&sm501_plat_drv); - return pci_register_driver(&sm501_pci_drv); + platform_driver_register(&sm501_plat_driver); + return pci_register_driver(&sm501_pci_driver); } static void __exit sm501_base_exit(void) { - platform_driver_unregister(&sm501_plat_drv); - pci_unregister_driver(&sm501_pci_drv); + platform_driver_unregister(&sm501_plat_driver); + pci_unregister_driver(&sm501_pci_driver); } module_init(sm501_base_init); From dcd9651ecd652a186dd9ad0dde76d43320b9c0a2 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 17 Feb 2009 09:21:52 +0100 Subject: [PATCH 078/143] mfd: Fix sm501_register_gpio section mismatch WARNING: drivers/mfd/built-in.o(.text+0x1706): Section mismatch in reference from the function sm501_register_gpio() to the function .devinit.text:sm501_gpio_register_chip() The function sm501_register_gpio() references the function __devinit sm501_gpio_register_chip(). This is often because sm501_register_gpio lacks a __devinit annotation or the annotation of sm501_gpio_register_chip is wrong. Signed-off-by: Rakib Mullick Signed-off-by: Samuel Ortiz --- drivers/mfd/sm501.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/sm501.c b/drivers/mfd/sm501.c index 6c3786f8e9a8..4c7b7962f6b8 100644 --- a/drivers/mfd/sm501.c +++ b/drivers/mfd/sm501.c @@ -1050,7 +1050,7 @@ static int __devinit sm501_gpio_register_chip(struct sm501_devdata *sm, return gpiochip_add(gchip); } -static int sm501_register_gpio(struct sm501_devdata *sm) +static int __devinit sm501_register_gpio(struct sm501_devdata *sm) { struct sm501_gpio *gpio = &sm->gpio; resource_size_t iobase = sm->io_res->start + SM501_GPIO; From 8eb2dfac41c71701bb741f496f0cb7b7e4a3c3f6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 17 Feb 2009 20:00:11 +0800 Subject: [PATCH 079/143] crypto: lrw - Fix big endian support It turns out that LRW has never worked properly on big endian. This was never discussed because nobody actually used it that way. In fact, it was only discovered when Geert Uytterhoeven loaded it through tcrypt which failed the test on it. The fix is straightforward, on big endian the to find the nth bit we should be grouping them by words instead of bytes. So setbit128_bbe should xor with 128 - BITS_PER_LONG instead of 128 - BITS_PER_BYTE == 0x78. Tested-by: Geert Uytterhoeven Signed-off-by: Herbert Xu --- crypto/lrw.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/crypto/lrw.c b/crypto/lrw.c index 8ef664e3bcd9..358f80be2bf9 100644 --- a/crypto/lrw.c +++ b/crypto/lrw.c @@ -45,7 +45,13 @@ struct priv { static inline void setbit128_bbe(void *b, int bit) { - __set_bit(bit ^ 0x78, b); + __set_bit(bit ^ (0x80 - +#ifdef __BIG_ENDIAN + BITS_PER_LONG +#else + BITS_PER_BYTE +#endif + ), b); } static int setkey(struct crypto_tfm *parent, const u8 *key, From 35cfd1d964f3c2420862f2167a0eb85ff1208999 Mon Sep 17 00:00:00 2001 From: Michael Tokarev Date: Sun, 1 Feb 2009 16:11:04 +0100 Subject: [PATCH 080/143] HID: blacklist Powercom USB UPS For quite some time users with various UPSes from Powercom were forced to play magic with bind/unbind in /sys in order to be able to see the UPSes. The beasts does not work as HID devices, even if claims to do so. cypress_m8 driver works with the devices instead, creating a normal serial port with which normal UPS controlling software works. The manufacturer confirmed the upcoming models with proper HID support will have different device IDs. In any way, it's wrong to have two completely different modules for one device in kernel. Blacklist the device in HID (add it to hid_ignore_list) to stop this mess, finally. Signed-off-By: Michael Tokarev Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 1 + drivers/hid/hid-ids.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 6cad69ed21c5..0e96b1fcc14b 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1605,6 +1605,7 @@ static const struct hid_device_id hid_ignore_list[] = { { HID_USB_DEVICE(USB_VENDOR_ID_PANJIT, 0x0002) }, { HID_USB_DEVICE(USB_VENDOR_ID_PANJIT, 0x0003) }, { HID_USB_DEVICE(USB_VENDOR_ID_PANJIT, 0x0004) }, + { HID_USB_DEVICE(USB_VENDOR_ID_POWERCOM, USB_DEVICE_ID_POWERCOM_UPS) }, { HID_USB_DEVICE(USB_VENDOR_ID_SOUNDGRAPH, USB_DEVICE_ID_SOUNDGRAPH_IMON_LCD) }, { HID_USB_DEVICE(USB_VENDOR_ID_SOUNDGRAPH, USB_DEVICE_ID_SOUNDGRAPH_IMON_LCD2) }, { HID_USB_DEVICE(USB_VENDOR_ID_SOUNDGRAPH, USB_DEVICE_ID_SOUNDGRAPH_IMON_LCD3) }, diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index e899f510ebeb..88511970508d 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -348,6 +348,9 @@ #define USB_VENDOR_ID_PLAYDOTCOM 0x0b43 #define USB_DEVICE_ID_PLAYDOTCOM_EMS_USBII 0x0003 +#define USB_VENDOR_ID_POWERCOM 0x0d9f +#define USB_DEVICE_ID_POWERCOM_UPS 0x0002 + #define USB_VENDOR_ID_SAITEK 0x06a3 #define USB_DEVICE_ID_SAITEK_RUMBLEPAD 0xff17 From dfd395aff4cb16d2480b280064bea1b19ededef1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 3 Feb 2009 16:35:17 +0300 Subject: [PATCH 081/143] HID: unlock properly on error paths in hidraw_ioctl() We can't return immediately because lock_kernel() is held. Signed-off-by: Dan Carpenter Signed-off-by: Jiri Kosina --- drivers/hid/hidraw.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index 732449628971..02b19db5442e 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -267,8 +267,10 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd, default: { struct hid_device *hid = dev->hid; - if (_IOC_TYPE(cmd) != 'H' || _IOC_DIR(cmd) != _IOC_READ) - return -EINVAL; + if (_IOC_TYPE(cmd) != 'H' || _IOC_DIR(cmd) != _IOC_READ) { + ret = -EINVAL; + break; + } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWNAME(0))) { int len; @@ -277,8 +279,9 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd, len = strlen(hid->name) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); - return copy_to_user(user_arg, hid->name, len) ? + ret = copy_to_user(user_arg, hid->name, len) ? -EFAULT : len; + break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWPHYS(0))) { @@ -288,12 +291,13 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd, len = strlen(hid->phys) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); - return copy_to_user(user_arg, hid->phys, len) ? + ret = copy_to_user(user_arg, hid->phys, len) ? -EFAULT : len; + break; } } - ret = -ENOTTY; + ret = -ENOTTY; } unlock_kernel(); return ret; From daedb3d6a91f9626ab4c921378ac52e44de833d5 Mon Sep 17 00:00:00 2001 From: Anssi Hannula Date: Sat, 14 Feb 2009 11:45:05 +0200 Subject: [PATCH 082/143] HID: move tmff and zpff devices from ignore_list to blacklist The devices handled by hid-tmff and hid-zpff were added in the hid_ignore_list[] instead of hid_blacklist[] in hid-core.c, thus disabling them completely. hid_ignore_list[] causes hid layer to skip the device, while hid_blacklist[] indicates there is a specific driver in hid bus. Re-enable the devices by moving them to the correct list. Signed-off-by: Anssi Hannula Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 0e96b1fcc14b..1cc967448f4d 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1300,7 +1300,13 @@ static const struct hid_device_id hid_blacklist[] = { { HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS3_CONTROLLER) }, { HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_VAIO_VGX_MOUSE) }, { HID_USB_DEVICE(USB_VENDOR_ID_SUNPLUS, USB_DEVICE_ID_SUNPLUS_WDESKTOP) }, + { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb300) }, + { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb304) }, + { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb651) }, + { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb654) }, { HID_USB_DEVICE(USB_VENDOR_ID_TOPSEED, USB_DEVICE_ID_TOPSEED_CYBERLINK) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ZEROPLUS, 0x0005) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ZEROPLUS, 0x0030) }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, 0x030c) }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_PRESENTER_8K_BT) }, @@ -1613,10 +1619,6 @@ static const struct hid_device_id hid_ignore_list[] = { { HID_USB_DEVICE(USB_VENDOR_ID_SOUNDGRAPH, USB_DEVICE_ID_SOUNDGRAPH_IMON_LCD5) }, { HID_USB_DEVICE(USB_VENDOR_ID_TENX, USB_DEVICE_ID_TENX_IBUDDY1) }, { HID_USB_DEVICE(USB_VENDOR_ID_TENX, USB_DEVICE_ID_TENX_IBUDDY2) }, - { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb300) }, - { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb304) }, - { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb651) }, - { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb654) }, { HID_USB_DEVICE(USB_VENDOR_ID_VERNIER, USB_DEVICE_ID_VERNIER_LABPRO) }, { HID_USB_DEVICE(USB_VENDOR_ID_VERNIER, USB_DEVICE_ID_VERNIER_GOTEMP) }, { HID_USB_DEVICE(USB_VENDOR_ID_VERNIER, USB_DEVICE_ID_VERNIER_SKIP) }, @@ -1627,8 +1629,6 @@ static const struct hid_device_id hid_ignore_list[] = { { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_1_PHIDGETSERVO_20) }, { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_8_8_4_IF_KIT) }, { HID_USB_DEVICE(USB_VENDOR_ID_YEALINK, USB_DEVICE_ID_YEALINK_P1K_P4K_B2K) }, - { HID_USB_DEVICE(USB_VENDOR_ID_ZEROPLUS, 0x0005) }, - { HID_USB_DEVICE(USB_VENDOR_ID_ZEROPLUS, 0x0030) }, { } }; From 2b639386a2a26c84c8d26c649cf657ebd43a7bc8 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 17 Feb 2009 12:38:36 +0100 Subject: [PATCH 083/143] HID: fix bus endianity in file2alias Fix endianness of bus member of hid_device_id in modpost. Signed-off-by: Jiri Slaby Reported-by: Nye Liu Cc: Jiri Kosina Signed-off-by: Jiri Kosina --- scripts/mod/file2alias.c | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 491b8b1b6abf..4eea60b1693e 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -210,6 +210,7 @@ static void do_usb_table(void *symval, unsigned long size, static int do_hid_entry(const char *filename, struct hid_device_id *id, char *alias) { + id->bus = TO_NATIVE(id->bus); id->vendor = TO_NATIVE(id->vendor); id->product = TO_NATIVE(id->product); From f63145e28934a0b3ad8baf31adc195ec81423351 Mon Sep 17 00:00:00 2001 From: Trent Piepho Date: Sat, 24 Jan 2009 20:52:41 -0300 Subject: [PATCH 084/143] V4L/DVB (10516a): zoran: Update MAINTAINERS entry Ronald Bultje hasn't been maintaining the zoran driver for some time. Re-direct people to the mailing lists and web pages. MAINTAINERS | 6 +++--- Signed-off-by: Trent Piepho Acked-by: Jean Delvare Acked-by: Ronald S. Bultje Signed-off-by: Mauro Carvalho Chehab --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index db65b4e6d132..53b50c8d5f38 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4924,11 +4924,11 @@ L: zd1211-devs@lists.sourceforge.net (subscribers-only) S: Maintained ZR36067 VIDEO FOR LINUX DRIVER -P: Ronald Bultje -M: rbultje@ronald.bitfreak.net L: mjpeg-users@lists.sourceforge.net +L: linux-media@vger.kernel.org W: http://mjpeg.sourceforge.net/driver-zoran/ -S: Maintained +T: Mercurial http://linuxtv.org/hg/v4l-dvb +S: Odd Fixes ZS DECSTATION Z85C30 SERIAL DRIVER P: Maciej W. Rozycki From ef88f2b563275d156beebc9f76ed134f3f90f210 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 13 Feb 2009 08:24:34 -0300 Subject: [PATCH 085/143] V4L/DVB (10527): tuner: fix TUV1236D analog/digital setup As reported by David Engel , ATSC115 doesn't work fine with mythtv. This software opens both analog and dvb interfaces of saa7134. What happens is that some tuner commands are going to the wrong place, as shown at the logs: Feb 12 20:37:48 opus kernel: tuner-simple 1-0061: using tuner params #0 (ntsc) Feb 12 20:37:48 opus kernel: tuner-simple 1-0061: freq = 67.25 (1076), range = 0, config = 0xce, cb = 0x01 Feb 12 20:37:48 opus kernel: tuner-simple 1-0061: Freq= 67.25 MHz, V_IF=45.75 MHz, Offset=0.00 MHz, div=1808 Feb 12 20:37:48 opus kernel: tuner 1-0061: tv freq set to 67.25 Feb 12 20:37:48 opus kernel: tuner-simple 1-000a: using tuner params #0 (ntsc) Feb 12 20:37:48 opus kernel: tuner-simple 1-000a: freq = 67.25 (1076), range = 0, config = 0xce, cb = 0x01 Feb 12 20:37:48 opus kernel: tuner-simple 1-000a: Freq= 67.25 MHz, V_IF=45.75 MHz, Offset=0.00 MHz, div=1808 Feb 12 20:37:48 opus kernel: tuner-simple 1-000a: tv 0x07 0x10 0xce 0x01 Feb 12 20:37:48 opus kernel: tuner-simple 1-0061: tv 0x07 0x10 0xce 0x01 This happens due to a hack at TUV1236D analog setup, where it replaces tuner address, at 0x61 for 0x0a, in order to save a few memory bytes. The code assumes that nobody else would try to access the tuner during that setup, but the point is that there's no lock to protect such access. So, this opens the possibility of race conditions to happen. Instead of hacking tuner address, this patch uses a temporary var with the proper tuner value to be used during the setup. This should save the issue, although we should consider to write some analog/digital lock at saa7134 driver. Signed-off-by: Mauro Carvalho Chehab --- drivers/media/common/tuners/tuner-simple.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/media/common/tuners/tuner-simple.c b/drivers/media/common/tuners/tuner-simple.c index de7adaf5fa5b..78412c9c424a 100644 --- a/drivers/media/common/tuners/tuner-simple.c +++ b/drivers/media/common/tuners/tuner-simple.c @@ -318,7 +318,6 @@ static int simple_std_setup(struct dvb_frontend *fe, u8 *config, u8 *cb) { struct tuner_simple_priv *priv = fe->tuner_priv; - u8 tuneraddr; int rc; /* tv norm specific stuff for multi-norm tuners */ @@ -387,6 +386,7 @@ static int simple_std_setup(struct dvb_frontend *fe, case TUNER_PHILIPS_TUV1236D: { + struct tuner_i2c_props i2c = priv->i2c_props; /* 0x40 -> ATSC antenna input 1 */ /* 0x48 -> ATSC antenna input 2 */ /* 0x00 -> NTSC antenna input 1 */ @@ -398,17 +398,15 @@ static int simple_std_setup(struct dvb_frontend *fe, buffer[1] = 0x04; } /* set to the correct mode (analog or digital) */ - tuneraddr = priv->i2c_props.addr; - priv->i2c_props.addr = 0x0a; - rc = tuner_i2c_xfer_send(&priv->i2c_props, &buffer[0], 2); + i2c.addr = 0x0a; + rc = tuner_i2c_xfer_send(&i2c, &buffer[0], 2); if (2 != rc) tuner_warn("i2c i/o error: rc == %d " "(should be 2)\n", rc); - rc = tuner_i2c_xfer_send(&priv->i2c_props, &buffer[2], 2); + rc = tuner_i2c_xfer_send(&i2c, &buffer[2], 2); if (2 != rc) tuner_warn("i2c i/o error: rc == %d " "(should be 2)\n", rc); - priv->i2c_props.addr = tuneraddr; break; } } From d807dec59d3c850b332b5bf95fe33f18def00068 Mon Sep 17 00:00:00 2001 From: Tobias Lorenz Date: Thu, 12 Feb 2009 14:56:10 -0300 Subject: [PATCH 086/143] V4L/DVB (10532): Correction of Stereo detection/setting and signal strength indication Thanks to Bob Ross - correction of stereo detection/setting - correction of signal strength indicator scaling Signed-off-by: Tobias Lorenz Signed-off-by: Mauro Carvalho Chehab --- drivers/media/radio/radio-si470x.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/media/radio/radio-si470x.c b/drivers/media/radio/radio-si470x.c index 67cbce82cb91..fd81a97a0fdb 100644 --- a/drivers/media/radio/radio-si470x.c +++ b/drivers/media/radio/radio-si470x.c @@ -98,6 +98,9 @@ * - blacklisted KWorld radio in hid-core.c and hid-ids.h * 2008-12-03 Mark Lord * - add support for DealExtreme USB Radio + * 2009-01-31 Bob Ross + * - correction of stereo detection/setting + * - correction of signal strength indicator scaling * * ToDo: * - add firmware download/update support @@ -1385,20 +1388,22 @@ static int si470x_vidioc_g_tuner(struct file *file, void *priv, }; /* stereo indicator == stereo (instead of mono) */ - if ((radio->registers[STATUSRSSI] & STATUSRSSI_ST) == 1) - tuner->rxsubchans = V4L2_TUNER_SUB_MONO | V4L2_TUNER_SUB_STEREO; - else + if ((radio->registers[STATUSRSSI] & STATUSRSSI_ST) == 0) tuner->rxsubchans = V4L2_TUNER_SUB_MONO; + else + tuner->rxsubchans = V4L2_TUNER_SUB_MONO | V4L2_TUNER_SUB_STEREO; /* mono/stereo selector */ - if ((radio->registers[POWERCFG] & POWERCFG_MONO) == 1) - tuner->audmode = V4L2_TUNER_MODE_MONO; - else + if ((radio->registers[POWERCFG] & POWERCFG_MONO) == 0) tuner->audmode = V4L2_TUNER_MODE_STEREO; + else + tuner->audmode = V4L2_TUNER_MODE_MONO; /* min is worst, max is best; signal:0..0xffff; rssi: 0..0xff */ - tuner->signal = (radio->registers[STATUSRSSI] & STATUSRSSI_RSSI) - * 0x0101; + /* measured in units of dbµV in 1 db increments (max at ~75 dbµV) */ + tuner->signal = (radio->registers[STATUSRSSI] & STATUSRSSI_RSSI); + /* the ideal factor is 0xffff/75 = 873,8 */ + tuner->signal = (tuner->signal * 873) + (8 * tuner->signal / 10); /* automatic frequency control: -1: freq to low, 1 freq to high */ /* AFCRL does only indicate that freq. differs, not if too low/high */ From 2f94fc465a6504443bb986ba9d36e28e2b422c6e Mon Sep 17 00:00:00 2001 From: Tobias Lorenz Date: Thu, 12 Feb 2009 14:56:19 -0300 Subject: [PATCH 087/143] V4L/DVB (10533): fix LED status output This patch closes one of my todos that was since long on my list. Some people reported clicks and glitches in the audio stream, correlated to the LED color changing cycle. Thanks to Rick Bronson . Signed-off-by: Tobias Lorenz Signed-off-by: Mauro Carvalho Chehab --- drivers/media/radio/radio-si470x.c | 34 +++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/drivers/media/radio/radio-si470x.c b/drivers/media/radio/radio-si470x.c index fd81a97a0fdb..4dfed6aa2dbc 100644 --- a/drivers/media/radio/radio-si470x.c +++ b/drivers/media/radio/radio-si470x.c @@ -101,11 +101,13 @@ * 2009-01-31 Bob Ross * - correction of stereo detection/setting * - correction of signal strength indicator scaling + * 2009-01-31 Rick Bronson + * Tobias Lorenz + * - add LED status output * * ToDo: * - add firmware download/update support * - RDS support: interrupt mode, instead of polling - * - add LED status output (check if that's not already done in firmware) */ @@ -884,6 +886,30 @@ static int si470x_rds_on(struct si470x_device *radio) +/************************************************************************** + * General Driver Functions - LED_REPORT + **************************************************************************/ + +/* + * si470x_set_led_state - sets the led state + */ +static int si470x_set_led_state(struct si470x_device *radio, + unsigned char led_state) +{ + unsigned char buf[LED_REPORT_SIZE]; + int retval; + + buf[0] = LED_REPORT; + buf[1] = LED_COMMAND; + buf[2] = led_state; + + retval = si470x_set_report(radio, (void *) &buf, sizeof(buf)); + + return (retval < 0) ? -EINVAL : 0; +} + + + /************************************************************************** * RDS Driver Functions **************************************************************************/ @@ -1637,6 +1663,9 @@ static int si470x_usb_driver_probe(struct usb_interface *intf, /* set initial frequency */ si470x_set_freq(radio, 87.5 * FREQ_MUL); /* available in all regions */ + /* set led to connect state */ + si470x_set_led_state(radio, BLINK_GREEN_LED); + /* rds buffer allocation */ radio->buf_size = rds_buf * 3; radio->buffer = kmalloc(radio->buf_size, GFP_KERNEL); @@ -1720,6 +1749,9 @@ static void si470x_usb_driver_disconnect(struct usb_interface *intf) cancel_delayed_work_sync(&radio->work); usb_set_intfdata(intf, NULL); if (radio->users == 0) { + /* set led to disconnect state */ + si470x_set_led_state(radio, BLINK_ORANGE_LED); + video_unregister_device(radio->videodev); kfree(radio->buffer); kfree(radio); From 28100165c3f27f681fee8b60e4e44f64a739c454 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 16 Feb 2009 15:27:44 -0300 Subject: [PATCH 088/143] V4L/DVB (10572): Revert commit dda06a8e4610757def753ee3a541a0b1a1feb36b On Mon, 02 Feb 2009, Hartmut wrote: This change set is wrong. The affected functions cannot be called from an interrupt context, because they may process large buffers. In this case, interrupts are disabled for a long time. Functions, like dvb_dmx_swfilter_packets(), could be called only from a tasklet. This change set does hide some strong design bugs in dm1105.c and au0828-dvb.c. Please revert this change set and do fix the bugs in dm1105.c and au0828-dvb.c (and other files). On Sun, 15 Feb 2009, Oliver Endriss wrote: This changeset _must_ be reverted! It breaks all kernels since 2.6.27 for applications which use DVB and require a low interrupt latency. It is a very bad idea to call the demuxer to process data buffers with interrupts disabled! On Mon, 16 Feb 2009, Trent Piepho wrote: I agree, this is bad. The demuxer is far too much work to be done with IRQs off. IMHO, even doing it under a spin-lock is excessive. It should be a mutex. Drivers should use a work-queue to feed the demuxer. Thank you for testing this changeset and discovering the issues on it. Cc: Trent Piepho Cc: Hartmut Cc: Oliver Endriss Cc: Andreas Oberritter Signed-off-by: Mauro Carvalho Chehab --- drivers/media/dvb/dvb-core/dmxdev.c | 16 +++++++--------- drivers/media/dvb/dvb-core/dvb_demux.c | 16 ++++++---------- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/drivers/media/dvb/dvb-core/dmxdev.c b/drivers/media/dvb/dvb-core/dmxdev.c index 0c733c66a441..069d847ba887 100644 --- a/drivers/media/dvb/dvb-core/dmxdev.c +++ b/drivers/media/dvb/dvb-core/dmxdev.c @@ -364,16 +364,15 @@ static int dvb_dmxdev_section_callback(const u8 *buffer1, size_t buffer1_len, enum dmx_success success) { struct dmxdev_filter *dmxdevfilter = filter->priv; - unsigned long flags; int ret; if (dmxdevfilter->buffer.error) { wake_up(&dmxdevfilter->buffer.queue); return 0; } - spin_lock_irqsave(&dmxdevfilter->dev->lock, flags); + spin_lock(&dmxdevfilter->dev->lock); if (dmxdevfilter->state != DMXDEV_STATE_GO) { - spin_unlock_irqrestore(&dmxdevfilter->dev->lock, flags); + spin_unlock(&dmxdevfilter->dev->lock); return 0; } del_timer(&dmxdevfilter->timer); @@ -392,7 +391,7 @@ static int dvb_dmxdev_section_callback(const u8 *buffer1, size_t buffer1_len, } if (dmxdevfilter->params.sec.flags & DMX_ONESHOT) dmxdevfilter->state = DMXDEV_STATE_DONE; - spin_unlock_irqrestore(&dmxdevfilter->dev->lock, flags); + spin_unlock(&dmxdevfilter->dev->lock); wake_up(&dmxdevfilter->buffer.queue); return 0; } @@ -404,12 +403,11 @@ static int dvb_dmxdev_ts_callback(const u8 *buffer1, size_t buffer1_len, { struct dmxdev_filter *dmxdevfilter = feed->priv; struct dvb_ringbuffer *buffer; - unsigned long flags; int ret; - spin_lock_irqsave(&dmxdevfilter->dev->lock, flags); + spin_lock(&dmxdevfilter->dev->lock); if (dmxdevfilter->params.pes.output == DMX_OUT_DECODER) { - spin_unlock_irqrestore(&dmxdevfilter->dev->lock, flags); + spin_unlock(&dmxdevfilter->dev->lock); return 0; } @@ -419,7 +417,7 @@ static int dvb_dmxdev_ts_callback(const u8 *buffer1, size_t buffer1_len, else buffer = &dmxdevfilter->dev->dvr_buffer; if (buffer->error) { - spin_unlock_irqrestore(&dmxdevfilter->dev->lock, flags); + spin_unlock(&dmxdevfilter->dev->lock); wake_up(&buffer->queue); return 0; } @@ -430,7 +428,7 @@ static int dvb_dmxdev_ts_callback(const u8 *buffer1, size_t buffer1_len, dvb_ringbuffer_flush(buffer); buffer->error = ret; } - spin_unlock_irqrestore(&dmxdevfilter->dev->lock, flags); + spin_unlock(&dmxdevfilter->dev->lock); wake_up(&buffer->queue); return 0; } diff --git a/drivers/media/dvb/dvb-core/dvb_demux.c b/drivers/media/dvb/dvb-core/dvb_demux.c index a2c1fd5d2f67..e2eca0b1fe7c 100644 --- a/drivers/media/dvb/dvb-core/dvb_demux.c +++ b/drivers/media/dvb/dvb-core/dvb_demux.c @@ -399,9 +399,7 @@ static void dvb_dmx_swfilter_packet(struct dvb_demux *demux, const u8 *buf) void dvb_dmx_swfilter_packets(struct dvb_demux *demux, const u8 *buf, size_t count) { - unsigned long flags; - - spin_lock_irqsave(&demux->lock, flags); + spin_lock(&demux->lock); while (count--) { if (buf[0] == 0x47) @@ -409,17 +407,16 @@ void dvb_dmx_swfilter_packets(struct dvb_demux *demux, const u8 *buf, buf += 188; } - spin_unlock_irqrestore(&demux->lock, flags); + spin_unlock(&demux->lock); } EXPORT_SYMBOL(dvb_dmx_swfilter_packets); void dvb_dmx_swfilter(struct dvb_demux *demux, const u8 *buf, size_t count) { - unsigned long flags; int p = 0, i, j; - spin_lock_irqsave(&demux->lock, flags); + spin_lock(&demux->lock); if (demux->tsbufp) { i = demux->tsbufp; @@ -452,18 +449,17 @@ void dvb_dmx_swfilter(struct dvb_demux *demux, const u8 *buf, size_t count) } bailout: - spin_unlock_irqrestore(&demux->lock, flags); + spin_unlock(&demux->lock); } EXPORT_SYMBOL(dvb_dmx_swfilter); void dvb_dmx_swfilter_204(struct dvb_demux *demux, const u8 *buf, size_t count) { - unsigned long flags; int p = 0, i, j; u8 tmppack[188]; - spin_lock_irqsave(&demux->lock, flags); + spin_lock(&demux->lock); if (demux->tsbufp) { i = demux->tsbufp; @@ -504,7 +500,7 @@ void dvb_dmx_swfilter_204(struct dvb_demux *demux, const u8 *buf, size_t count) } bailout: - spin_unlock_irqrestore(&demux->lock, flags); + spin_unlock(&demux->lock); } EXPORT_SYMBOL(dvb_dmx_swfilter_204); From ad28127d7c7c617bca1d426f95b6ffa1fb8f700f Mon Sep 17 00:00:00 2001 From: Adam Baker Date: Wed, 4 Feb 2009 15:33:21 -0300 Subject: [PATCH 089/143] V4L/DVB (10619): gspca - main: Destroy the URBs at disconnection time. If a device using the gspca framework is unplugged while it is still streaming then the call that is used to free the URBs that have been allocated occurs after the pointer it uses becomes invalid at the end of gspca_disconnect. Make another cleanup call in gspca_disconnect while the pointer is still valid (multiple calls are OK as destroy_urbs checks for pointers already being NULL. Signed-off-by: Adam Baker Signed-off-by: Jean-Francois Moine Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/gspca/gspca.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/media/video/gspca/gspca.c b/drivers/media/video/gspca/gspca.c index 2ed24527ecd6..65e4901f4db7 100644 --- a/drivers/media/video/gspca/gspca.c +++ b/drivers/media/video/gspca/gspca.c @@ -422,6 +422,7 @@ static void destroy_urbs(struct gspca_dev *gspca_dev) if (urb == NULL) break; + BUG_ON(!gspca_dev->dev); gspca_dev->urb[i] = NULL; if (!gspca_dev->present) usb_kill_urb(urb); @@ -1950,8 +1951,12 @@ void gspca_disconnect(struct usb_interface *intf) { struct gspca_dev *gspca_dev = usb_get_intfdata(intf); + mutex_lock(&gspca_dev->usb_lock); gspca_dev->present = 0; + mutex_unlock(&gspca_dev->usb_lock); + destroy_urbs(gspca_dev); + gspca_dev->dev = NULL; usb_set_intfdata(intf, NULL); /* release the device */ From ac9575f75c52bcb455120f8c43376b556acba048 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sat, 14 Feb 2009 19:58:33 -0300 Subject: [PATCH 090/143] V4L/DVB (10625): ivtv: fix decoder crash regression The video_ioctl2 conversion of ivtv in kernel 2.6.27 introduced a bug causing decoder commands to crash. The decoder commands should have been handled from the video_ioctl2 default handler, ensuring correct mapping of the argument between user and kernel space. Unfortunately they ended up before the video_ioctl2 call, causing random crashes. Thanks to hannes@linus.priv.at for testing and helping me track down the cause! Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/ivtv/ivtv-ioctl.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/media/video/ivtv/ivtv-ioctl.c b/drivers/media/video/ivtv/ivtv-ioctl.c index f6b3ef6e691b..9be6244573e9 100644 --- a/drivers/media/video/ivtv/ivtv-ioctl.c +++ b/drivers/media/video/ivtv/ivtv-ioctl.c @@ -1748,6 +1748,18 @@ static long ivtv_default(struct file *file, void *fh, int cmd, void *arg) break; } + case IVTV_IOC_DMA_FRAME: + case VIDEO_GET_PTS: + case VIDEO_GET_FRAME_COUNT: + case VIDEO_GET_EVENT: + case VIDEO_PLAY: + case VIDEO_STOP: + case VIDEO_FREEZE: + case VIDEO_CONTINUE: + case VIDEO_COMMAND: + case VIDEO_TRY_COMMAND: + return ivtv_decoder_ioctls(file, cmd, (void *)arg); + default: return -EINVAL; } @@ -1790,18 +1802,6 @@ static long ivtv_serialized_ioctl(struct ivtv *itv, struct file *filp, ivtv_vapi(itv, CX2341X_DEC_SET_AUDIO_MODE, 2, itv->audio_bilingual_mode, itv->audio_stereo_mode); return 0; - case IVTV_IOC_DMA_FRAME: - case VIDEO_GET_PTS: - case VIDEO_GET_FRAME_COUNT: - case VIDEO_GET_EVENT: - case VIDEO_PLAY: - case VIDEO_STOP: - case VIDEO_FREEZE: - case VIDEO_CONTINUE: - case VIDEO_COMMAND: - case VIDEO_TRY_COMMAND: - return ivtv_decoder_ioctls(filp, cmd, (void *)arg); - default: break; } From 7bf432d64c47f77111fbce5d48d7774df8b48948 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 16 Feb 2009 04:25:32 -0300 Subject: [PATCH 091/143] V4L/DVB (10626): ivtv: fix regression in get sliced vbi format The new v4l2_subdev_call used s_fmt instead of g_fmt. Thanks-to: Andy Walls Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/ivtv/ivtv-ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/video/ivtv/ivtv-ioctl.c b/drivers/media/video/ivtv/ivtv-ioctl.c index 9be6244573e9..c13bd2aa0bea 100644 --- a/drivers/media/video/ivtv/ivtv-ioctl.c +++ b/drivers/media/video/ivtv/ivtv-ioctl.c @@ -393,7 +393,7 @@ static int ivtv_g_fmt_sliced_vbi_cap(struct file *file, void *fh, struct v4l2_fo return 0; } - v4l2_subdev_call(itv->sd_video, video, s_fmt, fmt); + v4l2_subdev_call(itv->sd_video, video, g_fmt, fmt); vbifmt->service_set = ivtv_get_service_set(vbifmt); return 0; } From 603eaa1bdd3e0402085e815cc531bb0a32827a9e Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 17 Feb 2009 19:59:54 +0100 Subject: [PATCH 092/143] hwmon: (f71882fg) Hide misleading error message If the F71882FG chip is at address 0x4e, then the probe at 0x2e will fail with the following message in the logs: f71882fg: Not a Fintek device This is misleading because there is a Fintek device, just at a different address. So I propose to degrade this message to a debug message. Signed-off-by: Jean Delvare Acked-by: Hans de Goede --- drivers/hwmon/f71882fg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/f71882fg.c b/drivers/hwmon/f71882fg.c index 609cafff86bc..367f6cb1166c 100644 --- a/drivers/hwmon/f71882fg.c +++ b/drivers/hwmon/f71882fg.c @@ -1872,7 +1872,7 @@ static int __init f71882fg_find(int sioaddr, unsigned short *address, devid = superio_inw(sioaddr, SIO_REG_MANID); if (devid != SIO_FINTEK_ID) { - printk(KERN_INFO DRVNAME ": Not a Fintek device\n"); + pr_debug(DRVNAME ": Not a Fintek device\n"); goto exit; } From 18632f84fac770125c0982dfadec6b551e82144e Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 17 Feb 2009 19:59:54 +0100 Subject: [PATCH 093/143] hwmon: Fix ACPI resource check error handling This patch fixes a number of cases where things were not properly cleaned up when acpi_check_resource_conflict() returned an error, causing oopses such as the one reported here: https://bugzilla.redhat.com/show_bug.cgi?id=483208 Signed-off-by: Hans de Goede Signed-off-by: Jean Delvare --- drivers/hwmon/f71882fg.c | 2 +- drivers/hwmon/vt1211.c | 2 +- drivers/hwmon/w83627ehf.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/hwmon/f71882fg.c b/drivers/hwmon/f71882fg.c index 367f6cb1166c..5f81ddf71508 100644 --- a/drivers/hwmon/f71882fg.c +++ b/drivers/hwmon/f71882fg.c @@ -1932,7 +1932,7 @@ static int __init f71882fg_device_add(unsigned short address, res.name = f71882fg_pdev->name; err = acpi_check_resource_conflict(&res); if (err) - return err; + goto exit_device_put; err = platform_device_add_resources(f71882fg_pdev, &res, 1); if (err) { diff --git a/drivers/hwmon/vt1211.c b/drivers/hwmon/vt1211.c index b0ce37852281..73f77a9b8b18 100644 --- a/drivers/hwmon/vt1211.c +++ b/drivers/hwmon/vt1211.c @@ -1262,7 +1262,7 @@ static int __init vt1211_device_add(unsigned short address) res.name = pdev->name; err = acpi_check_resource_conflict(&res); if (err) - goto EXIT; + goto EXIT_DEV_PUT; err = platform_device_add_resources(pdev, &res, 1); if (err) { diff --git a/drivers/hwmon/w83627ehf.c b/drivers/hwmon/w83627ehf.c index cb808d015361..feae743ba991 100644 --- a/drivers/hwmon/w83627ehf.c +++ b/drivers/hwmon/w83627ehf.c @@ -1548,7 +1548,7 @@ static int __init sensors_w83627ehf_init(void) err = acpi_check_resource_conflict(&res); if (err) - goto exit; + goto exit_device_put; err = platform_device_add_resources(pdev, &res, 1); if (err) { From 211e3e0e1b116b10ba707db2fec972909fa647fd Mon Sep 17 00:00:00 2001 From: Frank Seidel Date: Tue, 17 Feb 2009 19:59:54 +0100 Subject: [PATCH 094/143] MAINTAINERS: Switch hdaps to Frank Seidel As Rovert Love doesn't any more seem to be realy active on hdaps driver i'll happily take it over. Signed-off-by: Frank Seidel Cc: Robert Love Signed-off-by: Jean Delvare --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index db65b4e6d132..a654c5b9c271 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1905,10 +1905,10 @@ W: http://gigaset307x.sourceforge.net/ S: Maintained HARD DRIVE ACTIVE PROTECTION SYSTEM (HDAPS) DRIVER -P: Robert Love -M: rlove@rlove.org -M: linux-kernel@vger.kernel.org -W: http://www.kernel.org/pub/linux/kernel/people/rml/hdaps/ +P: Frank Seidel +M: frank@f-seidel.de +L: lm-sensors@lm-sensors.org +W: http://www.kernel.org/pub/linux/kernel/people/fseidel/hdaps/ S: Maintained GSPCA FINEPIX SUBDRIVER From 1a88b5364b535edaa321d70a566e358390ff0872 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 16 Feb 2009 02:38:12 +0000 Subject: [PATCH 095/143] Fix incomplete __mntput locking Getting this wrong caused WARNING: at fs/namespace.c:636 mntput_no_expire+0xac/0xf2() due to optimistically checking cpu_writer->mnt outside the spinlock. Here's what we really want: * we know that nobody will set cpu_writer->mnt to mnt from now on * all changes to that sucker are done under cpu_writer->lock * we want the laziest equivalent of spin_lock(&cpu_writer->lock); if (likely(cpu_writer->mnt != mnt)) { spin_unlock(&cpu_writer->lock); continue; } /* do stuff */ that would make sure we won't miss earlier setting of ->mnt done by another CPU. Anyway, for now we just move the spin_lock() earlier and move the test into the properly locked region. Signed-off-by: Al Viro Reported-and-tested-by: Li Zefan Signed-off-by: Linus Torvalds --- fs/namespace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 228d8c4bfd18..06f8e63f6cb1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -614,9 +614,11 @@ static inline void __mntput(struct vfsmount *mnt) */ for_each_possible_cpu(cpu) { struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu); - if (cpu_writer->mnt != mnt) - continue; spin_lock(&cpu_writer->lock); + if (cpu_writer->mnt != mnt) { + spin_unlock(&cpu_writer->lock); + continue; + } atomic_add(cpu_writer->count, &mnt->__mnt_writers); cpu_writer->count = 0; /* From ca77fde8e62cecb2c0769052228d15b901367af8 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 13 Feb 2009 23:18:03 +0000 Subject: [PATCH 096/143] Fix Intel IOMMU write-buffer flushing This is the cause of the DMA faults and disk corruption that people have been seeing. Some chipsets neglect to report the RWBF "capability" -- the flag which says that we need to flush the chipset write-buffer when changing the DMA page tables, to ensure that the change is visible to the IOMMU. Override that bit on the affected chipsets, and everything is happy again. Thanks to Chris and Bhavesh and others for helping to debug. Signed-off-by: David Woodhouse Tested-by: Chris Wright Reviewed-by: Bhavesh Davda Signed-off-by: Linus Torvalds --- drivers/pci/intel-iommu.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index f4b7c79023ff..fa9e41626bfc 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -61,6 +61,8 @@ /* global iommu list, set NULL for ignored DMAR units */ static struct intel_iommu **g_iommus; +static int rwbf_quirk = 0; + /* * 0: Present * 1-11: Reserved @@ -785,7 +787,7 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu) u32 val; unsigned long flag; - if (!cap_rwbf(iommu->cap)) + if (!rwbf_quirk && !cap_rwbf(iommu->cap)) return; val = iommu->gcmd | DMA_GCMD_WBF; @@ -3137,3 +3139,13 @@ static struct iommu_ops intel_iommu_ops = { .unmap = intel_iommu_unmap_range, .iova_to_phys = intel_iommu_iova_to_phys, }; + +static void __devinit quirk_iommu_rwbf(struct pci_dev *dev) +{ + /* Mobile 4 Series Chipset neglects to set RWBF capability, + but needs it */ + printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n"); + rwbf_quirk = 1; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); From 2f6097129af5625db2fb5e601f89e5bf55cd1dcd Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Mon, 16 Feb 2009 12:49:16 +0000 Subject: [PATCH 097/143] FRV: __pte_to_swp_entry doesn't expand correctly The macro doesn't expand correctly when its parameter isn't 'pte'. Signed-off-by: Roel Kluin Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- include/asm-frv/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h index 83c51aba534b..e16fdb1f4f4f 100644 --- a/include/asm-frv/pgtable.h +++ b/include/asm-frv/pgtable.h @@ -478,7 +478,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __swp_type(x) (((x).val >> 2) & 0x1f) #define __swp_offset(x) ((x).val >> 8) #define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 2) | ((offset) << 8) }) -#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte }) +#define __pte_to_swp_entry(_pte) ((swp_entry_t) { (_pte).pte }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) static inline int pte_file(pte_t pte) From 3494252d5644993f407a45f01c3e8ad5ae38f93c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Feb 2009 23:41:12 +0100 Subject: [PATCH 098/143] USB/PCI: Fix resume breakage of controllers behind cardbus bridges If a USB PCI controller is behind a cardbus bridge, we are trying to restore its configuration registers too early, before the cardbus bridge is operational. To fix this, call pci_restore_state() from usb_hcd_pci_resume() and remove usb_hcd_pci_resume_early() which is no longer necessary (the configuration spaces of USB controllers that are not behind cardbus bridges will be restored by the PCI PM core with interrupts disabled anyway). This patch fixes the regression from 2.6.28 tracked as http://bugzilla.kernel.org/show_bug.cgi?id=12659 [ Side note: the proper long-term fix is probably to just force the unplug event at suspend time instead of doing a plug/unplug at resume time, but this patch is fine regardless - Linus ] Signed-off-by: Rafael J. Wysocki Reported-by: Miles Lane Signed-off-by: Linus Torvalds --- drivers/usb/core/hcd-pci.c | 15 ++------------- drivers/usb/core/hcd.h | 1 - drivers/usb/host/ehci-pci.c | 1 - drivers/usb/host/ohci-pci.c | 1 - drivers/usb/host/uhci-hcd.c | 1 - 5 files changed, 2 insertions(+), 17 deletions(-) diff --git a/drivers/usb/core/hcd-pci.c b/drivers/usb/core/hcd-pci.c index c54fc40458b1..a4301dc02d27 100644 --- a/drivers/usb/core/hcd-pci.c +++ b/drivers/usb/core/hcd-pci.c @@ -297,19 +297,6 @@ int usb_hcd_pci_suspend(struct pci_dev *dev, pm_message_t message) } EXPORT_SYMBOL_GPL(usb_hcd_pci_suspend); -/** - * usb_hcd_pci_resume_early - resume a PCI-based HCD before IRQs are enabled - * @dev: USB Host Controller being resumed - * - * Store this function in the HCD's struct pci_driver as .resume_early. - */ -int usb_hcd_pci_resume_early(struct pci_dev *dev) -{ - pci_restore_state(dev); - return 0; -} -EXPORT_SYMBOL_GPL(usb_hcd_pci_resume_early); - /** * usb_hcd_pci_resume - power management resume of a PCI-based HCD * @dev: USB Host Controller being resumed @@ -333,6 +320,8 @@ int usb_hcd_pci_resume(struct pci_dev *dev) } #endif + pci_restore_state(dev); + hcd = pci_get_drvdata(dev); if (hcd->state != HC_STATE_SUSPENDED) { dev_dbg(hcd->self.controller, diff --git a/drivers/usb/core/hcd.h b/drivers/usb/core/hcd.h index 5b94a56bec23..f750eb1ab595 100644 --- a/drivers/usb/core/hcd.h +++ b/drivers/usb/core/hcd.h @@ -257,7 +257,6 @@ extern void usb_hcd_pci_remove(struct pci_dev *dev); #ifdef CONFIG_PM extern int usb_hcd_pci_suspend(struct pci_dev *dev, pm_message_t msg); -extern int usb_hcd_pci_resume_early(struct pci_dev *dev); extern int usb_hcd_pci_resume(struct pci_dev *dev); #endif /* CONFIG_PM */ diff --git a/drivers/usb/host/ehci-pci.c b/drivers/usb/host/ehci-pci.c index bb21fb0a4969..abb9a7706ec7 100644 --- a/drivers/usb/host/ehci-pci.c +++ b/drivers/usb/host/ehci-pci.c @@ -432,7 +432,6 @@ static struct pci_driver ehci_pci_driver = { #ifdef CONFIG_PM .suspend = usb_hcd_pci_suspend, - .resume_early = usb_hcd_pci_resume_early, .resume = usb_hcd_pci_resume, #endif .shutdown = usb_hcd_pci_shutdown, diff --git a/drivers/usb/host/ohci-pci.c b/drivers/usb/host/ohci-pci.c index 5d625c3fd423..f9961b4c0da3 100644 --- a/drivers/usb/host/ohci-pci.c +++ b/drivers/usb/host/ohci-pci.c @@ -487,7 +487,6 @@ static struct pci_driver ohci_pci_driver = { #ifdef CONFIG_PM .suspend = usb_hcd_pci_suspend, - .resume_early = usb_hcd_pci_resume_early, .resume = usb_hcd_pci_resume, #endif diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c index 944f7e0ca4df..cf5e4cf7ea42 100644 --- a/drivers/usb/host/uhci-hcd.c +++ b/drivers/usb/host/uhci-hcd.c @@ -942,7 +942,6 @@ static struct pci_driver uhci_pci_driver = { #ifdef CONFIG_PM .suspend = usb_hcd_pci_suspend, - .resume_early = usb_hcd_pci_resume_early, .resume = usb_hcd_pci_resume, #endif /* PM */ }; From 5955c7a2cfb6a35429adea5dc480002b15ca8cfc Mon Sep 17 00:00:00 2001 From: Zlatko Calusic Date: Wed, 18 Feb 2009 01:33:34 +0100 Subject: [PATCH 099/143] Add support for VT6415 PCIE PATA IDE Host Controller Signed-off-by: Zlatko Calusic Signed-off-by: Linus Torvalds --- drivers/ata/pata_via.c | 4 +++- include/linux/pci_ids.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c index 79a6c9a0b721..ba556d3e6963 100644 --- a/drivers/ata/pata_via.c +++ b/drivers/ata/pata_via.c @@ -110,7 +110,8 @@ static const struct via_isa_bridge { { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_SATA_PATA }, - { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES}, + { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES }, + { "vt6415", PCI_DEVICE_ID_VIA_6415, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES }, { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, @@ -593,6 +594,7 @@ static int via_reinit_one(struct pci_dev *pdev) #endif static const struct pci_device_id via[] = { + { PCI_VDEVICE(VIA, 0x0415), }, { PCI_VDEVICE(VIA, 0x0571), }, { PCI_VDEVICE(VIA, 0x0581), }, { PCI_VDEVICE(VIA, 0x1571), }, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 52a9fe08451c..918391b4b109 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1312,6 +1312,7 @@ #define PCI_DEVICE_ID_VIA_VT3351 0x0351 #define PCI_DEVICE_ID_VIA_VT3364 0x0364 #define PCI_DEVICE_ID_VIA_8371_0 0x0391 +#define PCI_DEVICE_ID_VIA_6415 0x0415 #define PCI_DEVICE_ID_VIA_8501_0 0x0501 #define PCI_DEVICE_ID_VIA_82C561 0x0561 #define PCI_DEVICE_ID_VIA_82C586_1 0x0571 From 444122fd58fdc83c96877a92b3f6288cafddb08d Mon Sep 17 00:00:00 2001 From: Yi Li Date: Thu, 5 Feb 2009 15:31:57 +0800 Subject: [PATCH 100/143] MMC: fix bug - SDHC card capacity not correct Signed-off-by: Yi Li Signed-off-by: Bryan Wu Signed-off-by: Pierre Ossman --- drivers/mmc/card/block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 45b1f430685f..513eb09a638f 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -584,7 +584,7 @@ static int mmc_blk_probe(struct mmc_card *card) if (err) goto out; - string_get_size(get_capacity(md->disk) << 9, STRING_UNITS_2, + string_get_size((u64)get_capacity(md->disk) << 9, STRING_UNITS_2, cap_str, sizeof(cap_str)); printk(KERN_INFO "%s: %s %s %s %s\n", md->disk->disk_name, mmc_card_id(card), mmc_card_name(card), From 86a6a8749d5b8fd5c2b544fe9fd11101e3d0550f Mon Sep 17 00:00:00 2001 From: Pierre Ossman Date: Mon, 2 Feb 2009 21:13:49 +0100 Subject: [PATCH 101/143] Revert "sdhci: force high speed capability on some controllers" This reverts commit a4b76193774b463b922cab2f92450efb20d29ef0. It turned out that the controller had problem running at the higher speed, so go back to trusting the hardware capability bits. Signed-off-by: Pierre Ossman --- drivers/mmc/host/sdhci-pci.c | 3 +-- drivers/mmc/host/sdhci.c | 3 +-- drivers/mmc/host/sdhci.h | 2 -- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/sdhci-pci.c b/drivers/mmc/host/sdhci-pci.c index f07255cb17ee..8cff5f5e7f86 100644 --- a/drivers/mmc/host/sdhci-pci.c +++ b/drivers/mmc/host/sdhci-pci.c @@ -144,8 +144,7 @@ static int jmicron_probe(struct sdhci_pci_chip *chip) SDHCI_QUIRK_32BIT_DMA_SIZE | SDHCI_QUIRK_32BIT_ADMA_SIZE | SDHCI_QUIRK_RESET_AFTER_REQUEST | - SDHCI_QUIRK_BROKEN_SMALL_PIO | - SDHCI_QUIRK_FORCE_HIGHSPEED; + SDHCI_QUIRK_BROKEN_SMALL_PIO; } /* diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 6b2d1f99af67..24d7414a3315 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1636,8 +1636,7 @@ int sdhci_add_host(struct sdhci_host *host) mmc->f_max = host->max_clk; mmc->caps = MMC_CAP_4_BIT_DATA | MMC_CAP_SDIO_IRQ; - if ((caps & SDHCI_CAN_DO_HISPD) || - (host->quirks & SDHCI_QUIRK_FORCE_HIGHSPEED)) + if (caps & SDHCI_CAN_DO_HISPD) mmc->caps |= MMC_CAP_SD_HIGHSPEED; mmc->ocr_avail = 0; diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index 3efba2363941..ffeab227d95b 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -208,8 +208,6 @@ struct sdhci_host { #define SDHCI_QUIRK_BROKEN_TIMEOUT_VAL (1<<12) /* Controller has an issue with buffer bits for small transfers */ #define SDHCI_QUIRK_BROKEN_SMALL_PIO (1<<13) -/* Controller supports high speed but doesn't have the caps bit set */ -#define SDHCI_QUIRK_FORCE_HIGHSPEED (1<<14) int irq; /* Device IRQ */ void __iomem * ioaddr; /* Mapped address */ From c1c201200a359cf3b6e2e36a4236cdca77a3cd8e Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 3 Feb 2009 07:47:29 +0100 Subject: [PATCH 102/143] bsg: Fix sense buffer bug in SG_IO When submitting requests via SG_IO, which does a sync io, a bsg_command is not allocated. So an in-Kernel sense_buffer was not set. However when calling blk_execute_rq() with no sense buffer one is provided from the stack. Now bsg at blk_complete_sgv4_hdr_rq() would check if rq->sense_len and a sense was requested by sg_io_v4 the rq->sense was copy_user() back, but by now it is already mangled stack memory. I have fixed that by forcing a sense_buffer when calling bsg_map_hdr(). The bsg_command->sense is provided in the write/read path like before, and on-the-stack buffer is provided when doing SG_IO. I have also fixed a dprintk message to print rq->errors in hex because of the scsi bit-field use of this member. For other block devices it does not matter anyway. Signed-off-by: Boaz Harrosh Acked-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- block/bsg.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/block/bsg.c b/block/bsg.c index d414bb5607e8..0ce8806dd0c1 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -244,7 +244,8 @@ bsg_validate_sgv4_hdr(struct request_queue *q, struct sg_io_v4 *hdr, int *rw) * map sg_io_v4 to a request. */ static struct request * -bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) +bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, + u8 *sense) { struct request_queue *q = bd->queue; struct request *rq, *next_rq = NULL; @@ -306,6 +307,10 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) if (ret) goto out; } + + rq->sense = sense; + rq->sense_len = 0; + return rq; out: if (rq->cmd != rq->__cmd) @@ -348,9 +353,6 @@ static void bsg_rq_end_io(struct request *rq, int uptodate) static void bsg_add_command(struct bsg_device *bd, struct request_queue *q, struct bsg_command *bc, struct request *rq) { - rq->sense = bc->sense; - rq->sense_len = 0; - /* * add bc command to busy queue and submit rq for io */ @@ -419,7 +421,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, { int ret = 0; - dprintk("rq %p bio %p %u\n", rq, bio, rq->errors); + dprintk("rq %p bio %p 0x%x\n", rq, bio, rq->errors); /* * fill in all the output members */ @@ -635,7 +637,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf, /* * get a request, fill in the blanks, and add to request queue */ - rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm); + rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm, bc->sense); if (IS_ERR(rq)) { ret = PTR_ERR(rq); rq = NULL; @@ -922,11 +924,12 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct request *rq; struct bio *bio, *bidi_bio = NULL; struct sg_io_v4 hdr; + u8 sense[SCSI_SENSE_BUFFERSIZE]; if (copy_from_user(&hdr, uarg, sizeof(hdr))) return -EFAULT; - rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE); + rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE, sense); if (IS_ERR(rq)) return PTR_ERR(rq); From 93dbb393503d53cd226e5e1f0088fe8f4dbaa2b8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 16 Feb 2009 10:25:40 +0100 Subject: [PATCH 103/143] block: fix bad definition of BIO_RW_SYNC We can't OR shift values, so get rid of BIO_RW_SYNC and use BIO_RW_SYNCIO and BIO_RW_UNPLUG explicitly. This brings back the behaviour from before 213d9417fec62ef4c3675621b9364a667954d4dd. Signed-off-by: Jens Axboe --- block/blktrace.c | 2 +- drivers/md/dm-io.c | 2 +- drivers/md/dm-kcopyd.c | 2 +- drivers/md/md.c | 4 ++-- include/linux/bio.h | 2 -- include/linux/blktrace_api.h | 1 + include/linux/fs.h | 6 +++--- kernel/power/swap.c | 5 +++-- mm/page_io.c | 2 +- 9 files changed, 13 insertions(+), 13 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index 39cc3bfe56e4..7cf9d1ff45a0 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -142,7 +142,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= ddir_act[rw & WRITE]; what |= MASK_TC_BIT(rw, BARRIER); - what |= MASK_TC_BIT(rw, SYNC); + what |= MASK_TC_BIT(rw, SYNCIO); what |= MASK_TC_BIT(rw, AHEAD); what |= MASK_TC_BIT(rw, META); what |= MASK_TC_BIT(rw, DISCARD); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index a34338567a2a..f14813be4eff 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -328,7 +328,7 @@ static void dispatch_io(int rw, unsigned int num_regions, struct dpages old_pages = *dp; if (sync) - rw |= (1 << BIO_RW_SYNC); + rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); /* * For multiple regions we need to be careful to rewind diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 3073618269ea..0a225da21272 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -344,7 +344,7 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_rw = job->rw | (1 << BIO_RW_SYNC), + .bi_rw = job->rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG), .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, .mem.offset = job->offset, diff --git a/drivers/md/md.c b/drivers/md/md.c index 4495104f6c9f..03b4cd0a6344 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -474,7 +474,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, * causes ENOTSUPP, we allocate a spare bio... */ struct bio *bio = bio_alloc(GFP_NOIO, 1); - int rw = (1<bi_bdev = rdev->bdev; bio->bi_sector = sector; @@ -531,7 +531,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct completion event; int ret; - rw |= (1 << BIO_RW_SYNC); + rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); bio->bi_bdev = bdev; bio->bi_sector = sector; diff --git a/include/linux/bio.h b/include/linux/bio.h index 2aa283ab062b..1b16108a5417 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -171,8 +171,6 @@ struct bio { #define BIO_RW_FAILFAST_TRANSPORT 8 #define BIO_RW_FAILFAST_DRIVER 9 -#define BIO_RW_SYNC (BIO_RW_SYNCIO | BIO_RW_UNPLUG) - #define bio_rw_flagged(bio, flag) ((bio)->bi_rw & (1 << (flag))) /* diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 25379cba2370..6e915878e88c 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -15,6 +15,7 @@ enum blktrace_cat { BLK_TC_WRITE = 1 << 1, /* writes */ BLK_TC_BARRIER = 1 << 2, /* barrier */ BLK_TC_SYNC = 1 << 3, /* sync IO */ + BLK_TC_SYNCIO = BLK_TC_SYNC, BLK_TC_QUEUE = 1 << 4, /* queueing/merging */ BLK_TC_REQUEUE = 1 << 5, /* requeueing */ BLK_TC_ISSUE = 1 << 6, /* issue */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 6022f44043f2..67857dc16365 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -87,10 +87,10 @@ struct inodes_stat_t { #define WRITE 1 #define READA 2 /* read-ahead - don't block if no resources */ #define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */ -#define READ_SYNC (READ | (1 << BIO_RW_SYNC)) +#define READ_SYNC (READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define READ_META (READ | (1 << BIO_RW_META)) -#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC)) -#define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNC)) +#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) +#define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 6da14358537c..505f319e489c 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -60,6 +60,7 @@ static struct block_device *resume_bdev; static int submit(int rw, pgoff_t page_off, struct page *page, struct bio **bio_chain) { + const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); @@ -80,7 +81,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page, bio_get(bio); if (bio_chain == NULL) { - submit_bio(rw | (1 << BIO_RW_SYNC), bio); + submit_bio(bio_rw, bio); wait_on_page_locked(page); if (rw == READ) bio_set_pages_dirty(bio); @@ -90,7 +91,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page, get_page(page); /* These pages are freed later */ bio->bi_private = *bio_chain; *bio_chain = bio; - submit_bio(rw | (1 << BIO_RW_SYNC), bio); + submit_bio(bio_rw, bio); } return 0; } diff --git a/mm/page_io.c b/mm/page_io.c index dc6ce0afbded..3023c475e041 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -111,7 +111,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) goto out; } if (wbc->sync_mode == WB_SYNC_ALL) - rw |= (1 << BIO_RW_SYNC); + rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); From a60e78e57a17d55bbd5a96da16fe9649d364b987 Mon Sep 17 00:00:00 2001 From: Subhash Peddamallu Date: Mon, 16 Feb 2009 10:27:07 +0100 Subject: [PATCH 104/143] fs/bio: bio_alloc_bioset: pass right object ptr to mempool_free When freeing from bio pool use right ptr to account for bs->front_pad, instead of bio ptr, Signed-off-by: Subhash Peddamallu Signed-off-by: Jens Axboe --- fs/bio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index 062299acbccd..72ab251cdb9c 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -302,9 +302,10 @@ void bio_init(struct bio *bio) struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { struct bio *bio = NULL; + void *p; if (bs) { - void *p = mempool_alloc(bs->bio_pool, gfp_mask); + p = mempool_alloc(bs->bio_pool, gfp_mask); if (p) bio = p + bs->front_pad; @@ -329,7 +330,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) } if (unlikely(!bvl)) { if (bs) - mempool_free(bio, bs->bio_pool); + mempool_free(p, bs->bio_pool); else kfree(bio); bio = NULL; From c8cbec6bdf6329279fd14696020f6b59d1d3124d Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Mon, 16 Feb 2009 13:11:55 +0100 Subject: [PATCH 105/143] paride/pg.c: xs(): &&/|| confusion &&/|| confusion Signed-off-by: Roel Kluin Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/block/paride/pg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c index 9dfa27163001..c397b3ddba9b 100644 --- a/drivers/block/paride/pg.c +++ b/drivers/block/paride/pg.c @@ -422,7 +422,7 @@ static void xs(char *buf, char *targ, int len) for (k = 0; k < len; k++) { char c = *buf++; - if (c != ' ' || c != l) + if (c != ' ' && c != l) l = *targ++ = c; } if (l == ' ') From 82eb03cfd862a65363fa2826de0dbd5474cfe5e2 Mon Sep 17 00:00:00 2001 From: Chip Coldwell Date: Mon, 16 Feb 2009 13:11:56 +0100 Subject: [PATCH 106/143] cciss: PCI power management reset for kexec The kexec kernel resets the CCISS hardware in three steps: 1. Use PCI power management states to reset the controller in the kexec kernel. 2. Clear the MSI/MSI-X bits in PCI configuration space so that MSI initialization in the kexec kernel doesn't fail. 3. Use the CCISS "No-op" message to determine when the controller firmware has recovered from the PCI PM reset. [akpm@linux-foundation.org: cleanups] Signed-off-by: Mike Miller Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 215 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 01e69383d9c0..d2cb67b61176 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -3390,6 +3390,203 @@ static void free_hba(int i) kfree(p); } +/* Send a message CDB to the firmware. */ +static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, unsigned char type) +{ + typedef struct { + CommandListHeader_struct CommandHeader; + RequestBlock_struct Request; + ErrDescriptor_struct ErrorDescriptor; + } Command; + static const size_t cmd_sz = sizeof(Command) + sizeof(ErrorInfo_struct); + Command *cmd; + dma_addr_t paddr64; + uint32_t paddr32, tag; + void __iomem *vaddr; + int i, err; + + vaddr = ioremap_nocache(pci_resource_start(pdev, 0), pci_resource_len(pdev, 0)); + if (vaddr == NULL) + return -ENOMEM; + + /* The Inbound Post Queue only accepts 32-bit physical addresses for the + CCISS commands, so they must be allocated from the lower 4GiB of + memory. */ + err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + if (err) { + iounmap(vaddr); + return -ENOMEM; + } + + cmd = pci_alloc_consistent(pdev, cmd_sz, &paddr64); + if (cmd == NULL) { + iounmap(vaddr); + return -ENOMEM; + } + + /* This must fit, because of the 32-bit consistent DMA mask. Also, + although there's no guarantee, we assume that the address is at + least 4-byte aligned (most likely, it's page-aligned). */ + paddr32 = paddr64; + + cmd->CommandHeader.ReplyQueue = 0; + cmd->CommandHeader.SGList = 0; + cmd->CommandHeader.SGTotal = 0; + cmd->CommandHeader.Tag.lower = paddr32; + cmd->CommandHeader.Tag.upper = 0; + memset(&cmd->CommandHeader.LUN.LunAddrBytes, 0, 8); + + cmd->Request.CDBLen = 16; + cmd->Request.Type.Type = TYPE_MSG; + cmd->Request.Type.Attribute = ATTR_HEADOFQUEUE; + cmd->Request.Type.Direction = XFER_NONE; + cmd->Request.Timeout = 0; /* Don't time out */ + cmd->Request.CDB[0] = opcode; + cmd->Request.CDB[1] = type; + memset(&cmd->Request.CDB[2], 0, 14); /* the rest of the CDB is reserved */ + + cmd->ErrorDescriptor.Addr.lower = paddr32 + sizeof(Command); + cmd->ErrorDescriptor.Addr.upper = 0; + cmd->ErrorDescriptor.Len = sizeof(ErrorInfo_struct); + + writel(paddr32, vaddr + SA5_REQUEST_PORT_OFFSET); + + for (i = 0; i < 10; i++) { + tag = readl(vaddr + SA5_REPLY_PORT_OFFSET); + if ((tag & ~3) == paddr32) + break; + schedule_timeout_uninterruptible(HZ); + } + + iounmap(vaddr); + + /* we leak the DMA buffer here ... no choice since the controller could + still complete the command. */ + if (i == 10) { + printk(KERN_ERR "cciss: controller message %02x:%02x timed out\n", + opcode, type); + return -ETIMEDOUT; + } + + pci_free_consistent(pdev, cmd_sz, cmd, paddr64); + + if (tag & 2) { + printk(KERN_ERR "cciss: controller message %02x:%02x failed\n", + opcode, type); + return -EIO; + } + + printk(KERN_INFO "cciss: controller message %02x:%02x succeeded\n", + opcode, type); + return 0; +} + +#define cciss_soft_reset_controller(p) cciss_message(p, 1, 0) +#define cciss_noop(p) cciss_message(p, 3, 0) + +static __devinit int cciss_reset_msi(struct pci_dev *pdev) +{ +/* the #defines are stolen from drivers/pci/msi.h. */ +#define msi_control_reg(base) (base + PCI_MSI_FLAGS) +#define PCI_MSIX_FLAGS_ENABLE (1 << 15) + + int pos; + u16 control = 0; + + pos = pci_find_capability(pdev, PCI_CAP_ID_MSI); + if (pos) { + pci_read_config_word(pdev, msi_control_reg(pos), &control); + if (control & PCI_MSI_FLAGS_ENABLE) { + printk(KERN_INFO "cciss: resetting MSI\n"); + pci_write_config_word(pdev, msi_control_reg(pos), control & ~PCI_MSI_FLAGS_ENABLE); + } + } + + pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX); + if (pos) { + pci_read_config_word(pdev, msi_control_reg(pos), &control); + if (control & PCI_MSIX_FLAGS_ENABLE) { + printk(KERN_INFO "cciss: resetting MSI-X\n"); + pci_write_config_word(pdev, msi_control_reg(pos), control & ~PCI_MSIX_FLAGS_ENABLE); + } + } + + return 0; +} + +/* This does a hard reset of the controller using PCI power management + * states. */ +static __devinit int cciss_hard_reset_controller(struct pci_dev *pdev) +{ + u16 pmcsr, saved_config_space[32]; + int i, pos; + + printk(KERN_INFO "cciss: using PCI PM to reset controller\n"); + + /* This is very nearly the same thing as + + pci_save_state(pci_dev); + pci_set_power_state(pci_dev, PCI_D3hot); + pci_set_power_state(pci_dev, PCI_D0); + pci_restore_state(pci_dev); + + but we can't use these nice canned kernel routines on + kexec, because they also check the MSI/MSI-X state in PCI + configuration space and do the wrong thing when it is + set/cleared. Also, the pci_save/restore_state functions + violate the ordering requirements for restoring the + configuration space from the CCISS document (see the + comment below). So we roll our own .... */ + + for (i = 0; i < 32; i++) + pci_read_config_word(pdev, 2*i, &saved_config_space[i]); + + pos = pci_find_capability(pdev, PCI_CAP_ID_PM); + if (pos == 0) { + printk(KERN_ERR "cciss_reset_controller: PCI PM not supported\n"); + return -ENODEV; + } + + /* Quoting from the Open CISS Specification: "The Power + * Management Control/Status Register (CSR) controls the power + * state of the device. The normal operating state is D0, + * CSR=00h. The software off state is D3, CSR=03h. To reset + * the controller, place the interface device in D3 then to + * D0, this causes a secondary PCI reset which will reset the + * controller." */ + + /* enter the D3hot power management state */ + pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr); + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; + pmcsr |= PCI_D3hot; + pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr); + + schedule_timeout_uninterruptible(HZ >> 1); + + /* enter the D0 power management state */ + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; + pmcsr |= PCI_D0; + pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr); + + schedule_timeout_uninterruptible(HZ >> 1); + + /* Restore the PCI configuration space. The Open CISS + * Specification says, "Restore the PCI Configuration + * Registers, offsets 00h through 60h. It is important to + * restore the command register, 16-bits at offset 04h, + * last. Do not restore the configuration status register, + * 16-bits at offset 06h." Note that the offset is 2*i. */ + for (i = 0; i < 32; i++) { + if (i == 2 || i == 3) + continue; + pci_write_config_word(pdev, 2*i, saved_config_space[i]); + } + wmb(); + pci_write_config_word(pdev, 4, saved_config_space[2]); + + return 0; +} + /* * This is it. Find all the controllers and register them. I really hate * stealing all these major device numbers. @@ -3404,6 +3601,24 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, int dac, return_code; InquiryData_struct *inq_buff = NULL; + if (reset_devices) { + /* Reset the controller with a PCI power-cycle */ + if (cciss_hard_reset_controller(pdev) || cciss_reset_msi(pdev)) + return -ENODEV; + + /* Some devices (notably the HP Smart Array 5i Controller) + need a little pause here */ + schedule_timeout_uninterruptible(30*HZ); + + /* Now try to get the controller to respond to a no-op */ + for (i=0; i<12; i++) { + if (cciss_noop(pdev) == 0) + break; + else + printk("cciss: no-op failed%s\n", (i < 11 ? "; re-trying" : "")); + } + } + i = alloc_cciss_hba(); if (i < 0) return -1; From 78f707bfc723552e8309b7c38a8d0cc51012e813 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Feb 2009 13:59:08 +0100 Subject: [PATCH 107/143] block: revert part of 18ce3751ccd488c78d3827e9f6bf54e6322676fb The above commit added WRITE_SYNC and switched various places to using that for committing writes that will be waited upon immediately after submission. However, this causes a performance regression with AS and CFQ for ext3 at least, since sync_dirty_buffer() will submit some writes with WRITE_SYNC while ext3 has sumitted others dependent writes without the sync flag set. This causes excessive anticipation/idling in the IO scheduler because sync and async writes get interleaved, causing a big performance regression for the below test case (which is meant to simulate sqlite like behaviour). ---- test case ---- int main(int argc, char **argv) { int fdes, i; FILE *fp; struct timeval start; struct timeval end; struct timeval res; gettimeofday(&start, NULL); for (i=0; i --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index 665d446b25bc..62b57e330b69 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3108,7 +3108,7 @@ int sync_dirty_buffer(struct buffer_head *bh) if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(WRITE, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); From 41b8c853a495438208faa5be03bbb0050859163b Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Wed, 18 Feb 2009 10:33:59 +0100 Subject: [PATCH 108/143] block: fix booting from partitioned md array Hi Tejun, it looks like your commit: block: don't depend on consecutive minor space f331c0296f2a9fee0d396a70598b954062603015 broke a particular case for booting from partitioned md/raid devices. That is the second time this has been broken recently. The previous time was fixed by block: do_mounts - accept root= 30f2f0eb4bd2c43d10a8b0d872c6e5ad8f31c9a0 Because the data isn't available when an md device is first created (we add disks and set it up after creation), the initial partition scan finds nothing. It is not until the device is opened that another partition scan happens and finds something. So at the point where the kernel parameter "root=/dev/md_d0p1" is being parsed, md_d0 exists, but md_d0p1 does not. However if we let blk_lookup_devt return the correct device number even though the device doesn't exist, then the attempt to mount it will successfully find the partition. I have tried in the past to find a way to get the partition table to be read as soon as the array is assembled but that proved impossible (at the time). I don't remember the details, and could possibly revisit it. However it would be really nice if blk_lookup_devt could be adjusted to again accept non existant partitions. Signed-off-by: Jens Axboe --- block/genhd.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/genhd.c b/block/genhd.c index 397960cf26af..e1eadcc9546a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1087,6 +1087,14 @@ dev_t blk_lookup_devt(const char *name, int partno) if (strcmp(dev_name(dev), name)) continue; + if (partno < disk->minors) { + /* We need to return the right devno, even + * if the partition doesn't exist yet. + */ + devt = MKDEV(MAJOR(dev->devt), + MINOR(dev->devt) + partno); + break; + } part = disk_get_part(disk, partno); if (part) { devt = part_devt(part); From be987fdb55a4726e2fcbab7501f89276bdb57288 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 18 Feb 2009 10:30:15 +0100 Subject: [PATCH 109/143] block: fix deadlock in blk_abort_queue() for drivers that readd to timeout list blk_abort_queue() iterates the timeout list and aborts each request on the list, but if the driver error handling readds a request to the timeout list during this processing, we could be looping forever. Fix this by splicing current entries to a local list and run over that list instead. Signed-off-by: Jens Axboe --- block/blk-timeout.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/blk-timeout.c b/block/blk-timeout.c index a09535377a94..bbbdc4b8ccf2 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -209,12 +209,19 @@ void blk_abort_queue(struct request_queue *q) { unsigned long flags; struct request *rq, *tmp; + LIST_HEAD(list); spin_lock_irqsave(q->queue_lock, flags); elv_abort_queue(q); - list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) + /* + * Splice entries to local list, to avoid deadlocking if entries + * get readded to the timeout list by error handling + */ + list_splice_init(&q->timeout_list, &list); + + list_for_each_entry_safe(rq, tmp, &list, timeout_list) blk_abort_request(rq); spin_unlock_irqrestore(q->queue_lock, flags); From 994244883739e4044bef76d4e5d7a9b66dc6c7b6 Mon Sep 17 00:00:00 2001 From: Yauhen Kharuzhy Date: Wed, 11 Feb 2009 13:25:52 -0800 Subject: [PATCH 110/143] s3cmci: Fix hangup in do_pio_write() This commit fixes the regression what was added by commit 088a78af978d0c8e339071a9b2bca1f4cb368f30 "s3cmci: Support transfers which are not multiple of 32 bits." fifo_free() now returns amount of available space in FIFO buffer in bytes. But do_pio_write() writes to FIFO 32-bit words. Condition for return from cycle is (fifo_free() == 0), but when fifo has 1..3 bytes of free space then this condition will never be true and system hangs. This patch changes condition in the while() to (fifo_free() > 3). Signed-off-by: Yauhen Kharuzhy Signed-off-by: Andrew Morton Signed-off-by: Pierre Ossman --- drivers/mmc/host/s3cmci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/s3cmci.c b/drivers/mmc/host/s3cmci.c index 35a98eec7414..f4a67c65d301 100644 --- a/drivers/mmc/host/s3cmci.c +++ b/drivers/mmc/host/s3cmci.c @@ -329,7 +329,7 @@ static void do_pio_write(struct s3cmci_host *host) to_ptr = host->base + host->sdidata; - while ((fifo = fifo_free(host))) { + while ((fifo = fifo_free(host)) > 3) { if (!host->pio_bytes) { res = get_data_buffer(host, &host->pio_bytes, &host->pio_ptr); From 58a5dd3e0e77029d3db1f8fa75d0b54b38169d5d Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Fri, 13 Feb 2009 22:55:26 +0530 Subject: [PATCH 111/143] mmc_test: fix basic read test Due to a typo in the Basic Read test, it's currently identical to the Basic Write test. Fix this. Signed-off-by: Rabin Vincent Signed-off-by: Pierre Ossman --- drivers/mmc/card/mmc_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/card/mmc_test.c b/drivers/mmc/card/mmc_test.c index b92b172074ee..b9f1e84897cc 100644 --- a/drivers/mmc/card/mmc_test.c +++ b/drivers/mmc/card/mmc_test.c @@ -494,7 +494,7 @@ static int mmc_test_basic_read(struct mmc_test_card *test) sg_init_one(&sg, test->buffer, 512); - ret = mmc_test_simple_transfer(test, &sg, 1, 0, 1, 512, 1); + ret = mmc_test_simple_transfer(test, &sg, 1, 0, 1, 512, 0); if (ret) return ret; From 5dbace0c9ba110c1a3810a89fa6bf12b7574b5a3 Mon Sep 17 00:00:00 2001 From: Helmut Schaa Date: Sat, 14 Feb 2009 16:22:39 +0100 Subject: [PATCH 112/143] sdhci: fix led naming Fix the led device naming for the sdhci driver. The led class documentation defines the led name to have the form "devicename:colour:function" while not applicable sections should be left blank. To comply with the documentation the led device name is changed from "mmc*" to "mmc*::". Signed-off-by: Helmut Schaa Signed-off-by: Pierre Ossman --- drivers/mmc/host/sdhci.c | 4 +++- drivers/mmc/host/sdhci.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 24d7414a3315..f52f3053ed92 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1722,7 +1722,9 @@ int sdhci_add_host(struct sdhci_host *host) #endif #ifdef SDHCI_USE_LEDS_CLASS - host->led.name = mmc_hostname(mmc); + snprintf(host->led_name, sizeof(host->led_name), + "%s::", mmc_hostname(mmc)); + host->led.name = host->led_name; host->led.brightness = LED_OFF; host->led.default_trigger = mmc_hostname(mmc); host->led.brightness_set = sdhci_led_control; diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index ffeab227d95b..ebb83657e27a 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -220,6 +220,7 @@ struct sdhci_host { #if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE) struct led_classdev led; /* LED control */ + char led_name[32]; #endif spinlock_t lock; /* Mutex */ From 249d0fa9d59b6165ecc224720d9ce9b7267cf1b8 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 4 Feb 2009 14:42:03 -0800 Subject: [PATCH 113/143] omap_hsmmc: card detect irq bugfix Work around lockdep issue when card detect IRQ handlers run in thread context ... it forces IRQF_DISABLED, which prevents all access to twl4030 card detect signals. Signed-off-by: David Brownell Acked-by: Tony Lindgren Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index db37490f67ec..4dba48642e60 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -517,6 +517,9 @@ static void mmc_omap_detect(struct work_struct *work) { struct mmc_omap_host *host = container_of(work, struct mmc_omap_host, mmc_carddetect_work); + struct omap_mmc_slot_data *slot = &mmc_slot(host); + + host->carddetect = slot->card_detect(slot->card_detect_irq); sysfs_notify(&host->mmc->class_dev.kobj, NULL, "cover_switch"); if (host->carddetect) { @@ -538,7 +541,6 @@ static irqreturn_t omap_mmc_cd_handler(int irq, void *dev_id) { struct mmc_omap_host *host = (struct mmc_omap_host *)dev_id; - host->carddetect = mmc_slot(host).card_detect(irq); schedule_work(&host->mmc_carddetect_work); return IRQ_HANDLED; From eb25082657be3e7639e349fc926afdcbb0a4dc65 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Tue, 17 Feb 2009 14:49:01 -0800 Subject: [PATCH 114/143] omap_hsmmc: only MMC1 allows HCTL.SDVS != 1.8V Based on a patch from Tony Lindgren ... after initialization, never change HCTL.SDVS except for MMC1. The other controller instances only support 1.8V in that field, although they can suport other card/SDIO/eMMC/... voltages with level shifting solutions such as external transceivers. MMC2 behavior sanity tested on Overo/WLAN, OMAP3430 SDP, and custom hardware. MMC1 also sanity tested on those platforms plus Beagle. This also fixes a bug preventing MMC2 (and also presumably MMC3) from powering down when requested. Signed-off-by: David Brownell Acked-by: Tony Lindgren Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 43 +++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 4dba48642e60..3bfd0facb794 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -55,6 +55,7 @@ #define VS30 (1 << 25) #define SDVS18 (0x5 << 9) #define SDVS30 (0x6 << 9) +#define SDVS33 (0x7 << 9) #define SDVSCLR 0xFFFFF1FF #define SDVSDET 0x00000400 #define AUTOIDLE 0x1 @@ -456,13 +457,20 @@ static irqreturn_t mmc_omap_irq(int irq, void *dev_id) } /* - * Switch MMC operating voltage + * Switch MMC interface voltage ... only relevant for MMC1. + * + * MMC2 and MMC3 use fixed 1.8V levels, and maybe a transceiver. + * The MMC2 transceiver controls are used instead of DAT4..DAT7. + * Some chips, like eMMC ones, use internal transceivers. */ static int omap_mmc_switch_opcond(struct mmc_omap_host *host, int vdd) { u32 reg_val = 0; int ret; + if (host->id != OMAP_MMC1_DEVID) + return 0; + /* Disable the clocks */ clk_disable(host->fclk); clk_disable(host->iclk); @@ -485,19 +493,26 @@ static int omap_mmc_switch_opcond(struct mmc_omap_host *host, int vdd) OMAP_HSMMC_WRITE(host->base, HCTL, OMAP_HSMMC_READ(host->base, HCTL) & SDVSCLR); reg_val = OMAP_HSMMC_READ(host->base, HCTL); + /* * If a MMC dual voltage card is detected, the set_ios fn calls * this fn with VDD bit set for 1.8V. Upon card removal from the * slot, omap_mmc_set_ios sets the VDD back to 3V on MMC_POWER_OFF. * - * Only MMC1 supports 3.0V. MMC2 will not function if SDVS30 is - * set in HCTL. + * Cope with a bit of slop in the range ... per data sheets: + * - "1.8V" for vdds_mmc1/vdds_mmc1a can be up to 2.45V max, + * but recommended values are 1.71V to 1.89V + * - "3.0V" for vdds_mmc1/vdds_mmc1a can be up to 3.5V max, + * but recommended values are 2.7V to 3.3V + * + * Board setup code shouldn't permit anything very out-of-range. + * TWL4030-family VMMC1 and VSIM regulators are fine (avoiding the + * middle range) but VSIM can't power DAT4..DAT7 at more than 3V. */ - if (host->id == OMAP_MMC1_DEVID && (((1 << vdd) == MMC_VDD_32_33) || - ((1 << vdd) == MMC_VDD_33_34))) - reg_val |= SDVS30; - if ((1 << vdd) == MMC_VDD_165_195) + if ((1 << vdd) <= MMC_VDD_23_24) reg_val |= SDVS18; + else + reg_val |= SDVS30; OMAP_HSMMC_WRITE(host->base, HCTL, reg_val); @@ -759,10 +774,14 @@ static void omap_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) case MMC_POWER_OFF: mmc_slot(host).set_power(host->dev, host->slot_id, 0, 0); /* - * Reset bus voltage to 3V if it got set to 1.8V earlier. + * Reset interface voltage to 3V if it's 1.8V now; + * only relevant on MMC-1, the others always use 1.8V. + * * REVISIT: If we are able to detect cards after unplugging * a 1.8V card, this code should not be needed. */ + if (host->id != OMAP_MMC1_DEVID) + break; if (!(OMAP_HSMMC_READ(host->base, HCTL) & SDVSDET)) { int vdd = fls(host->mmc->ocr_avail) - 1; if (omap_mmc_switch_opcond(host, vdd) != 0) @@ -786,7 +805,9 @@ static void omap_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) } if (host->id == OMAP_MMC1_DEVID) { - /* Only MMC1 can operate at 3V/1.8V */ + /* Only MMC1 can interface at 3V without some flavor + * of external transceiver; but they all handle 1.8V. + */ if ((OMAP_HSMMC_READ(host->base, HCTL) & SDVSDET) && (ios->vdd == DUAL_VOLT_OCR_BIT)) { /* @@ -1139,7 +1160,9 @@ static int omap_mmc_suspend(struct platform_device *pdev, pm_message_t state) " level suspend\n"); } - if (!(OMAP_HSMMC_READ(host->base, HCTL) & SDVSDET)) { + if (host->id == OMAP_MMC1_DEVID + && !(OMAP_HSMMC_READ(host->base, HCTL) + & SDVSDET)) { OMAP_HSMMC_WRITE(host->base, HCTL, OMAP_HSMMC_READ(host->base, HCTL) & SDVSCLR); From c232f457e409b34417166596ea3daf298ace95c9 Mon Sep 17 00:00:00 2001 From: Jean Pihet Date: Wed, 11 Feb 2009 13:11:39 -0800 Subject: [PATCH 115/143] omap_hsmmc: recover from transfer failures Timeouts during a command that has a data phase can result in the next command issued after the command that failed not being processed, i.e. no interrupt ever occurs to indicate the command has completed. This failure can result in a deadlock. This patch resets the data state machine to clear the error in case of a command timeout. Tested on OMAP3430 chip and intensive MMC/SD device removal while transferring data. Signed-off-by: Andy Lowe Signed-off-by: Jean Pihet Signed-off-by: Adrian Hunter Signed-off-by: Andrew Morton Acked-by: Tony Lindgren Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 3bfd0facb794..8d21a07f63de 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -417,8 +417,15 @@ static irqreturn_t mmc_omap_irq(int irq, void *dev_id) } end_cmd = 1; } - if (host->data) + if (host->data) { mmc_dma_cleanup(host); + OMAP_HSMMC_WRITE(host->base, SYSCTL, + OMAP_HSMMC_READ(host->base, + SYSCTL) | SRD); + while (OMAP_HSMMC_READ(host->base, + SYSCTL) & SRD) + ; + } } if ((status & DATA_TIMEOUT) || (status & DATA_CRC)) { From 3ebf74b1de9f94da4291db3ea1ae11c5bedb5784 Mon Sep 17 00:00:00 2001 From: Jean Pihet Date: Fri, 6 Feb 2009 16:42:51 +0100 Subject: [PATCH 116/143] omap_hsmmc: Change while(); loops with finite version Replace the infinite 'while() ;' loops with a finite loop version. Signed-off-by: Jean Pihet Acked-by: Tony Lindgren Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 54 +++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 8d21a07f63de..a631c81dce12 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -376,6 +376,32 @@ static void mmc_omap_report_irq(struct mmc_omap_host *host, u32 status) } #endif /* CONFIG_MMC_DEBUG */ +/* + * MMC controller internal state machines reset + * + * Used to reset command or data internal state machines, using respectively + * SRC or SRD bit of SYSCTL register + * Can be called from interrupt context + */ +static inline void mmc_omap_reset_controller_fsm(struct mmc_omap_host *host, + unsigned long bit) +{ + unsigned long i = 0; + unsigned long limit = (loops_per_jiffy * + msecs_to_jiffies(MMC_TIMEOUT_MS)); + + OMAP_HSMMC_WRITE(host->base, SYSCTL, + OMAP_HSMMC_READ(host->base, SYSCTL) | bit); + + while ((OMAP_HSMMC_READ(host->base, SYSCTL) & bit) && + (i++ < limit)) + cpu_relax(); + + if (OMAP_HSMMC_READ(host->base, SYSCTL) & bit) + dev_err(mmc_dev(host->mmc), + "Timeout waiting on controller reset in %s\n", + __func__); +} /* * MMC controller IRQ handler @@ -404,13 +430,7 @@ static irqreturn_t mmc_omap_irq(int irq, void *dev_id) (status & CMD_CRC)) { if (host->cmd) { if (status & CMD_TIMEOUT) { - OMAP_HSMMC_WRITE(host->base, SYSCTL, - OMAP_HSMMC_READ(host->base, - SYSCTL) | SRC); - while (OMAP_HSMMC_READ(host->base, - SYSCTL) & SRC) - ; - + mmc_omap_reset_controller_fsm(host, SRC); host->cmd->error = -ETIMEDOUT; } else { host->cmd->error = -EILSEQ; @@ -419,12 +439,7 @@ static irqreturn_t mmc_omap_irq(int irq, void *dev_id) } if (host->data) { mmc_dma_cleanup(host); - OMAP_HSMMC_WRITE(host->base, SYSCTL, - OMAP_HSMMC_READ(host->base, - SYSCTL) | SRD); - while (OMAP_HSMMC_READ(host->base, - SYSCTL) & SRD) - ; + mmc_omap_reset_controller_fsm(host, SRD); } } if ((status & DATA_TIMEOUT) || @@ -434,12 +449,7 @@ static irqreturn_t mmc_omap_irq(int irq, void *dev_id) mmc_dma_cleanup(host); else host->data->error = -EILSEQ; - OMAP_HSMMC_WRITE(host->base, SYSCTL, - OMAP_HSMMC_READ(host->base, - SYSCTL) | SRD); - while (OMAP_HSMMC_READ(host->base, - SYSCTL) & SRD) - ; + mmc_omap_reset_controller_fsm(host, SRD); end_trans = 1; } } @@ -547,11 +557,7 @@ static void mmc_omap_detect(struct work_struct *work) if (host->carddetect) { mmc_detect_change(host->mmc, (HZ * 200) / 1000); } else { - OMAP_HSMMC_WRITE(host->base, SYSCTL, - OMAP_HSMMC_READ(host->base, SYSCTL) | SRD); - while (OMAP_HSMMC_READ(host->base, SYSCTL) & SRD) - ; - + mmc_omap_reset_controller_fsm(host, SRD); mmc_detect_change(host->mmc, (HZ * 50) / 1000); } } From c296861291669f305deef19b78042330d7135017 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 18 Feb 2009 14:48:12 -0800 Subject: [PATCH 117/143] vmalloc: add __get_vm_area_caller() We have get_vm_area_caller() and __get_vm_area() but not __get_vm_area_caller() On powerpc, I use __get_vm_area() to separate the ranges of addresses given to vmalloc vs. ioremap (various good reasons for that) so in order to be able to implement the new caller tracking in /proc/vmallocinfo, I need a "_caller" variant of it. (akpm: needed for ongoing powerpc development, so merge it early) [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Benjamin Herrenschmidt Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 4 ++++ mm/vmalloc.c | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 506e7620a986..9c0890c7a06a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -84,6 +84,10 @@ extern struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, void *caller); extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end); +extern struct vm_struct *__get_vm_area_caller(unsigned long size, + unsigned long flags, + unsigned long start, unsigned long end, + void *caller); extern struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node, gfp_t gfp_mask); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 75f49d312e8c..4dd2636d0b92 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1106,6 +1106,14 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, } EXPORT_SYMBOL_GPL(__get_vm_area); +struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end, + void *caller) +{ + return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + caller); +} + /** * get_vm_area - reserve a contiguous kernel virtual area * @size: size of the area From b6d6c5175809934e04a606d9193ef04924a7a7d9 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 18 Feb 2009 14:48:13 -0800 Subject: [PATCH 118/143] aoe: ignore vendor extension AoE responses The Welland ME-747K-SI AoE target generates unsolicited AoE responses that are marked as vendor extensions. Instead of ignoring these packets, the aoe driver was generating kernel messages for each unrecognized response received. This patch corrects the behavior. Signed-off-by: Ed Cashin Reported-by: Tested-by: Cc: Cc: Alex Buell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 1 + drivers/block/aoe/aoenet.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index c237527b1aa5..5e41e6dd657b 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -18,6 +18,7 @@ enum { AOECMD_ATA, AOECMD_CFG, + AOECMD_VEND_MIN = 0xf0, AOEFL_RSP = (1<<3), AOEFL_ERR = (1<<2), diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c index 30de5b1c647e..c6099ba9a4b8 100644 --- a/drivers/block/aoe/aoenet.c +++ b/drivers/block/aoe/aoenet.c @@ -142,6 +142,8 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, aoecmd_cfg_rsp(skb); break; default: + if (h->cmd >= AOECMD_VEND_MIN) + break; /* don't complain about vendor commands */ printk(KERN_INFO "aoe: unknown cmd %d\n", h->cmd); } exit: From b851ee7921fabdd7dfc96ffc4e9609f5062bd12b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 18 Feb 2009 14:48:14 -0800 Subject: [PATCH 119/143] cgroups: update documentation about css_set hash table The css_set hash table was introduced in 2.6.26, so update the documentation accordingly. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index d9e5d6f41b92..93feb8444489 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -252,10 +252,8 @@ cgroup file system directories. When a task is moved from one cgroup to another, it gets a new css_set pointer - if there's an already existing css_set with the desired collection of cgroups then that group is reused, else a new -css_set is allocated. Note that the current implementation uses a -linear search to locate an appropriate existing css_set, so isn't -very efficient. A future version will use a hash table for better -performance. +css_set is allocated. The appropriate existing css_set is located by +looking into a hash table. To allow access from a cgroup to the css_sets (and hence tasks) that comprise it, a set of cg_cgroup_link objects form a lattice; From 55ec82176eca52e4e0530a82a0eb59160a1a95a1 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Wed, 18 Feb 2009 14:48:15 -0800 Subject: [PATCH 120/143] vfs: separate FMODE_PREAD/FMODE_PWRITE into separate flags Separate FMODE_PREAD and FMODE_PWRITE into separate flags to reflect the reality that the read and write paths may have independent restrictions. A git grep verifies that these flags are always cleared together so this new behavior will only apply to interfaces that change to clear flags individually. This is required for "seq_file: properly cope with pread", a post-2.6.25 regression fix. [akpm@linux-foundation.org: add comment] Signed-off-by: Paul Turner Cc: Eric Biederman Cc: Alexey Dobriyan Cc: Al Viro Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 6022f44043f2..5852bd6afbe4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -54,24 +54,30 @@ struct inodes_stat_t { #define MAY_ACCESS 16 #define MAY_OPEN 32 +/* + * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond + * to O_WRONLY and O_RDWR via the strange trick in __dentry_open() + */ + /* file is open for reading */ #define FMODE_READ ((__force fmode_t)1) /* file is open for writing */ #define FMODE_WRITE ((__force fmode_t)2) /* file is seekable */ #define FMODE_LSEEK ((__force fmode_t)4) -/* file can be accessed using pread/pwrite */ +/* file can be accessed using pread */ #define FMODE_PREAD ((__force fmode_t)8) -#define FMODE_PWRITE FMODE_PREAD /* These go hand in hand */ +/* file can be accessed using pwrite */ +#define FMODE_PWRITE ((__force fmode_t)16) /* File is opened for execution with sys_execve / sys_uselib */ -#define FMODE_EXEC ((__force fmode_t)16) +#define FMODE_EXEC ((__force fmode_t)32) /* File is opened with O_NDELAY (only set for block devices) */ -#define FMODE_NDELAY ((__force fmode_t)32) +#define FMODE_NDELAY ((__force fmode_t)64) /* File is opened with O_EXCL (only set for block devices) */ -#define FMODE_EXCL ((__force fmode_t)64) +#define FMODE_EXCL ((__force fmode_t)128) /* File is opened using open(.., 3, ..) and is writeable only for ioctls (specialy hack for floppy.c) */ -#define FMODE_WRITE_IOCTL ((__force fmode_t)128) +#define FMODE_WRITE_IOCTL ((__force fmode_t)256) /* * Don't update ctime and mtime. From 8f19d472935c83d823fa4cf02bcc0a7b9952db30 Mon Sep 17 00:00:00 2001 From: Eric Biederman Date: Wed, 18 Feb 2009 14:48:16 -0800 Subject: [PATCH 121/143] seq_file: properly cope with pread Currently seq_read assumes that the offset passed to it is always the offset it passed to user space. In the case pread this assumption is broken and we do the wrong thing when presented with pread. To solve this I introduce an offset cache inside of struct seq_file so we know where our logical file position is. Then in seq_read if we try to read from another offset we reset our data structures and attempt to go to the offset user space wanted. [akpm@linux-foundation.org: restore FMODE_PWRITE] [pjt@google.com: seq_open needs its fmode opened up to take advantage of this] Signed-off-by: Eric Biederman Cc: Alexey Dobriyan Cc: Al Viro Cc: Paul Turner Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 36 ++++++++++++++++++++++++++++++++---- include/linux/seq_file.h | 1 + 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/fs/seq_file.c b/fs/seq_file.c index 5267098532bf..a1a4cfe19210 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -48,8 +48,16 @@ int seq_open(struct file *file, const struct seq_operations *op) */ file->f_version = 0; - /* SEQ files support lseek, but not pread/pwrite */ - file->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE); + /* + * seq_files support lseek() and pread(). They do not implement + * write() at all, but we clear FMODE_PWRITE here for historical + * reasons. + * + * If a client of seq_files a) implements file.write() and b) wishes to + * support pwrite() then that client will need to implement its own + * file.open() which calls seq_open() and then sets FMODE_PWRITE. + */ + file->f_mode &= ~FMODE_PWRITE; return 0; } EXPORT_SYMBOL(seq_open); @@ -131,6 +139,22 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) int err = 0; mutex_lock(&m->lock); + + /* Don't assume *ppos is where we left it */ + if (unlikely(*ppos != m->read_pos)) { + m->read_pos = *ppos; + while ((err = traverse(m, *ppos)) == -EAGAIN) + ; + if (err) { + /* With prejudice... */ + m->read_pos = 0; + m->version = 0; + m->index = 0; + m->count = 0; + goto Done; + } + } + /* * seq_file->op->..m_start/m_stop/m_next may do special actions * or optimisations based on the file->f_version, so we want to @@ -230,8 +254,10 @@ Fill: Done: if (!copied) copied = err; - else + else { *ppos += copied; + m->read_pos += copied; + } file->f_version = m->version; mutex_unlock(&m->lock); return copied; @@ -266,16 +292,18 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin) if (offset < 0) break; retval = offset; - if (offset != file->f_pos) { + if (offset != m->read_pos) { while ((retval=traverse(m, offset)) == -EAGAIN) ; if (retval) { /* with extreme prejudice... */ file->f_pos = 0; + m->read_pos = 0; m->version = 0; m->index = 0; m->count = 0; } else { + m->read_pos = offset; retval = file->f_pos = offset; } } diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 40ea5058c2ec..f616f31576d7 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -19,6 +19,7 @@ struct seq_file { size_t from; size_t count; loff_t index; + loff_t read_pos; u64 version; struct mutex lock; const struct seq_operations *op; From ef35ce231b3cb2a4b1808e826da263bf37ccb38a Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 18 Feb 2009 14:48:16 -0800 Subject: [PATCH 122/143] Pavel has moved My @suse.cz address will stop working some day, so put working one into MAINTAINERS/CREDITS. It would be cool to get this to 2.6.29... it should not really break anything. Signed-off-by: Pavel Machek Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- CREDITS | 1 - MAINTAINERS | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CREDITS b/CREDITS index 2b39168c06aa..5e0736722afd 100644 --- a/CREDITS +++ b/CREDITS @@ -2166,7 +2166,6 @@ D: Initial implementation of VC's, pty's and select() N: Pavel Machek E: pavel@ucw.cz -E: pavel@suse.cz D: Softcursor for vga, hypertech cdrom support, vcsa bugfix, nbd D: sun4/330 port, capabilities for elf, speedup for rm on ext2, USB, D: work on suspend-to-ram/disk, killing duplicates from ioctl32 diff --git a/MAINTAINERS b/MAINTAINERS index e784b358b335..a2008bd587c2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2001,7 +2001,7 @@ S: Maintained HIBERNATION (aka Software Suspend, aka swsusp) P: Pavel Machek -M: pavel@suse.cz +M: pavel@ucw.cz P: Rafael J. Wysocki M: rjw@sisk.pl L: linux-pm@lists.linux-foundation.org @@ -4172,7 +4172,7 @@ SUSPEND TO RAM P: Len Brown M: len.brown@intel.com P: Pavel Machek -M: pavel@suse.cz +M: pavel@ucw.cz P: Rafael J. Wysocki M: rjw@sisk.pl L: linux-pm@lists.linux-foundation.org From 610d18f4128ebbd88845d0fc60cce67b49af881e Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Wed, 18 Feb 2009 14:48:18 -0800 Subject: [PATCH 123/143] timerfd: add flags check As requested by Michael, add a missing check for valid flags in timerfd_settime(), and make it return EINVAL in case some extra bits are set. Michael said: If this is to be any use to userland apps that want to check flag support (perhaps it is too late already), then the sooner we get it into the kernel the better: 2.6.29 would be good; earlier stables as well would be even better. [akpm@linux-foundation.org: remove unused TFD_FLAGS_SET] Acked-by: Michael Kerrisk Signed-off-by: Davide Libenzi Cc: [2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/timerfd.c | 12 ++++++------ include/linux/timerfd.h | 16 ++++++++++++---- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/fs/timerfd.c b/fs/timerfd.c index 6a123b8ff3f5..b042bd7034b1 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -186,10 +186,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); - if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) - return -EINVAL; - if (clockid != CLOCK_MONOTONIC && - clockid != CLOCK_REALTIME) + if ((flags & ~TFD_CREATE_FLAGS) || + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME)) return -EINVAL; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); @@ -201,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, - flags & (O_CLOEXEC | O_NONBLOCK)); + flags & TFD_SHARED_FCNTL_FLAGS); if (ufd < 0) kfree(ctx); @@ -219,7 +218,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) return -EFAULT; - if (!timespec_valid(&ktmr.it_value) || + if ((flags & ~TFD_SETTIME_FLAGS) || + !timespec_valid(&ktmr.it_value) || !timespec_valid(&ktmr.it_interval)) return -EINVAL; diff --git a/include/linux/timerfd.h b/include/linux/timerfd.h index 86cb0501d3e2..2d0792983f8c 100644 --- a/include/linux/timerfd.h +++ b/include/linux/timerfd.h @@ -11,13 +11,21 @@ /* For O_CLOEXEC and O_NONBLOCK */ #include -/* Flags for timerfd_settime. */ +/* + * CAREFUL: Check include/asm-generic/fcntl.h when defining + * new flags, since they might collide with O_* ones. We want + * to re-use O_* flags that couldn't possibly have a meaning + * from eventfd, in order to leave a free define-space for + * shared O_* flags. + */ #define TFD_TIMER_ABSTIME (1 << 0) - -/* Flags for timerfd_create. */ #define TFD_CLOEXEC O_CLOEXEC #define TFD_NONBLOCK O_NONBLOCK +#define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK) +/* Flags for timerfd_create. */ +#define TFD_CREATE_FLAGS TFD_SHARED_FCNTL_FLAGS +/* Flags for timerfd_settime. */ +#define TFD_SETTIME_FLAGS TFD_TIMER_ABSTIME #endif /* _LINUX_TIMERFD_H */ - From 1cf6e7d83bf334cc5916137862c920a97aabc018 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Feb 2009 14:48:18 -0800 Subject: [PATCH 124/143] mm: task dirty accounting fix YAMAMOTO-san noticed that task_dirty_inc doesn't seem to be called properly for cases where set_page_dirty is not used to dirty a page (eg. mark_buffer_dirty). Additionally, there is some inconsistency about when task_dirty_inc is called. It is used for dirty balancing, however it even gets called for __set_page_dirty_no_writeback. So rather than increment it in a set_page_dirty wrapper, move it down to exactly where the dirty page accounting stats are incremented. Cc: YAMAMOTO Takashi Signed-off-by: Nick Piggin Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 1 + include/linux/mm.h | 1 + mm/page-writeback.c | 13 +++---------- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 665d446b25bc..ff4d1cdd779b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -777,6 +777,7 @@ static int __set_page_dirty(struct page *page, __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } radix_tree_tag_set(&mapping->page_tree, diff --git a/include/linux/mm.h b/include/linux/mm.h index 7dc04ff5ab89..10074212a35b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1159,6 +1159,7 @@ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); /* mm/page-writeback.c */ int write_one_page(struct page *page, int wait); +void task_dirty_inc(struct task_struct *tsk); /* readahead.c */ #define VM_MAX_READAHEAD 128 /* kbytes */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3c84128596ba..74dc57c74349 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -240,7 +240,7 @@ void bdi_writeout_inc(struct backing_dev_info *bdi) } EXPORT_SYMBOL_GPL(bdi_writeout_inc); -static inline void task_dirty_inc(struct task_struct *tsk) +void task_dirty_inc(struct task_struct *tsk) { prop_inc_single(&vm_dirties, &tsk->dirties); } @@ -1230,6 +1230,7 @@ int __set_page_dirty_nobuffers(struct page *page) __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } radix_tree_tag_set(&mapping->page_tree, @@ -1262,7 +1263,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage); * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. */ -static int __set_page_dirty(struct page *page) +int set_page_dirty(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -1280,14 +1281,6 @@ static int __set_page_dirty(struct page *page) } return 0; } - -int set_page_dirty(struct page *page) -{ - int ret = __set_page_dirty(page); - if (ret) - task_dirty_inc(current); - return ret; -} EXPORT_SYMBOL(set_page_dirty); /* From 67e055d144c5b2acdc1c63811fde031263bf92c5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 18 Feb 2009 14:48:20 -0800 Subject: [PATCH 125/143] cgroups: fix possible use after free In cgroup_kill_sb(), root is freed before sb is detached from the list, so another sget() may find this sb and call cgroup_test_super(), which will access the root that has been freed. Reported-by: Al Viro Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e14db9c089b9..9edb5c4b79b4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1122,8 +1122,8 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_mutex); - kfree(root); kill_litter_super(sb); + kfree(root); } static struct file_system_type cgroup_fs_type = { From 42f5e039c3f6512271636928ddc4e7f7a0371672 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 18 Feb 2009 14:48:21 -0800 Subject: [PATCH 126/143] pm: fix build for CONFIG_PM unset Compilation of kprobes.c with CONFIG_PM unset is broken due to some broken config dependncies. Fix that. Signed-off-by: Rafael J. Wysocki Reported-by: Ingo Molnar Tested-by: Masami Hiramatsu Cc: Len Brown Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/power/Makefile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 170a9213c1b6..e4791b3ba55d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -51,6 +51,7 @@ obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ +obj-$(CONFIG_FREEZER) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o diff --git a/kernel/power/Makefile b/kernel/power/Makefile index d7a10167a25b..720ea4f781bd 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -3,7 +3,7 @@ ifeq ($(CONFIG_PM_DEBUG),y) EXTRA_CFLAGS += -DDEBUG endif -obj-y := main.o +obj-$(CONFIG_PM) += main.o obj-$(CONFIG_PM_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o From 3a5093ee6728c8cbe9c039e685fc1fca8f965048 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 18 Feb 2009 14:48:22 -0800 Subject: [PATCH 127/143] eeepc: should depend on INPUT Otherwise with INPUT=m, EEEPC_LAPTOP=y one gets drivers/built-in.o: In function `input_sync': eeepc-laptop.c:(.text+0x18ce51): undefined reference to `input_event' drivers/built-in.o: In function `input_report_key': eeepc-laptop.c:(.text+0x18ce73): undefined reference to `input_event' drivers/built-in.o: In function `eeepc_hotk_check': eeepc-laptop.c:(.text+0x18d05f): undefined reference to `input_allocate_device' eeepc-laptop.c:(.text+0x18d10f): undefined reference to `input_register_device' eeepc-laptop.c:(.text+0x18d131): undefined reference to `input_free_device' drivers/built-in.o: In function `eeepc_backlight_exit': eeepc-laptop.c:(.text+0x18d546): undefined reference to `input_unregister_device' Signed-off-by: Alexey Dobriyan Cc: Len Brown Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/platform/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 94363115a42a..6bce56c22623 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -301,6 +301,7 @@ config INTEL_MENLOW config EEEPC_LAPTOP tristate "Eee PC Hotkey Driver (EXPERIMENTAL)" depends on ACPI + depends on INPUT depends on EXPERIMENTAL select BACKLIGHT_CLASS_DEVICE select HWMON From ef2cfc790bf5f0ff189b01eabc0f4feb5e8524df Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 18 Feb 2009 14:48:23 -0800 Subject: [PATCH 128/143] hp accelerometer: add freefall detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds freefall handling to hp_accel driver. According to HP, it should just work, without us having to set the chip up by hand. hpfall.c is example .c program that parks the disk when accelerometer detects free fall. It should work; for now, it uses fixed 20seconds protection period. Signed-off-by: Pavel Machek Cc: Thomas Renninger Cc: Éric Piel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/hwmon/hpfall.c | 101 ++++++++++++++++++++ Documentation/hwmon/lis3lv02d | 8 ++ drivers/hwmon/hp_accel.c | 27 ++++++ drivers/hwmon/lis3lv02d.c | 172 ++++++++++++++++++++++++++++++---- drivers/hwmon/lis3lv02d.h | 5 + 5 files changed, 296 insertions(+), 17 deletions(-) create mode 100644 Documentation/hwmon/hpfall.c diff --git a/Documentation/hwmon/hpfall.c b/Documentation/hwmon/hpfall.c new file mode 100644 index 000000000000..bbea1ccfd46a --- /dev/null +++ b/Documentation/hwmon/hpfall.c @@ -0,0 +1,101 @@ +/* Disk protection for HP machines. + * + * Copyright 2008 Eric Piel + * Copyright 2009 Pavel Machek + * + * GPLv2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void write_int(char *path, int i) +{ + char buf[1024]; + int fd = open(path, O_RDWR); + if (fd < 0) { + perror("open"); + exit(1); + } + sprintf(buf, "%d", i); + if (write(fd, buf, strlen(buf)) != strlen(buf)) { + perror("write"); + exit(1); + } + close(fd); +} + +void set_led(int on) +{ + write_int("/sys/class/leds/hp::hddprotect/brightness", on); +} + +void protect(int seconds) +{ + write_int("/sys/block/sda/device/unload_heads", seconds*1000); +} + +int on_ac(void) +{ +// /sys/class/power_supply/AC0/online +} + +int lid_open(void) +{ +// /proc/acpi/button/lid/LID/state +} + +void ignore_me(void) +{ + protect(0); + set_led(0); + +} + +int main(int argc, char* argv[]) +{ + int fd, ret; + + fd = open("/dev/freefall", O_RDONLY); + if (fd < 0) { + perror("open"); + return EXIT_FAILURE; + } + + signal(SIGALRM, ignore_me); + + for (;;) { + unsigned char count; + + ret = read(fd, &count, sizeof(count)); + alarm(0); + if ((ret == -1) && (errno == EINTR)) { + /* Alarm expired, time to unpark the heads */ + continue; + } + + if (ret != sizeof(count)) { + perror("read"); + break; + } + + protect(21); + set_led(1); + if (1 || on_ac() || lid_open()) { + alarm(2); + } else { + alarm(20); + } + } + + close(fd); + return EXIT_SUCCESS; +} diff --git a/Documentation/hwmon/lis3lv02d b/Documentation/hwmon/lis3lv02d index 0fcfc4a7ccdc..287f8c902656 100644 --- a/Documentation/hwmon/lis3lv02d +++ b/Documentation/hwmon/lis3lv02d @@ -33,6 +33,14 @@ rate - reports the sampling rate of the accelerometer device in HZ This driver also provides an absolute input class device, allowing the laptop to act as a pinball machine-esque joystick. +Another feature of the driver is misc device called "freefall" that +acts similar to /dev/rtc and reacts on free-fall interrupts received +from the device. It supports blocking operations, poll/select and +fasync operation modes. You must read 1 bytes from the device. The +result is number of free-fall interrupts since the last successful +read (or 255 if number of interrupts would not fit). + + Axes orientation ---------------- diff --git a/drivers/hwmon/hp_accel.c b/drivers/hwmon/hp_accel.c index abf4dfc8ec22..dcefe1d2adbd 100644 --- a/drivers/hwmon/hp_accel.c +++ b/drivers/hwmon/hp_accel.c @@ -213,6 +213,30 @@ static struct delayed_led_classdev hpled_led = { .set_brightness = hpled_set, }; +static acpi_status +lis3lv02d_get_resource(struct acpi_resource *resource, void *context) +{ + if (resource->type == ACPI_RESOURCE_TYPE_EXTENDED_IRQ) { + struct acpi_resource_extended_irq *irq; + u32 *device_irq = context; + + irq = &resource->data.extended_irq; + *device_irq = irq->interrupts[0]; + } + + return AE_OK; +} + +static void lis3lv02d_enum_resources(struct acpi_device *device) +{ + acpi_status status; + + status = acpi_walk_resources(device->handle, METHOD_NAME__CRS, + lis3lv02d_get_resource, &adev.irq); + if (ACPI_FAILURE(status)) + printk(KERN_DEBUG DRIVER_NAME ": Error getting resources\n"); +} + static int lis3lv02d_add(struct acpi_device *device) { u8 val; @@ -247,6 +271,9 @@ static int lis3lv02d_add(struct acpi_device *device) if (ret) return ret; + /* obtain IRQ number of our device from ACPI */ + lis3lv02d_enum_resources(adev.device); + ret = lis3lv02d_init_device(&adev); if (ret) { flush_work(&hpled_led.work); diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c index 219d2d0d5a62..3afa3afc77f3 100644 --- a/drivers/hwmon/lis3lv02d.c +++ b/drivers/hwmon/lis3lv02d.c @@ -3,7 +3,7 @@ * * Copyright (C) 2007-2008 Yan Burman * Copyright (C) 2008 Eric Piel - * Copyright (C) 2008 Pavel Machek + * Copyright (C) 2008-2009 Pavel Machek * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include "lis3lv02d.h" @@ -55,7 +56,10 @@ /* Maximum value our axis may get for the input device (signed 12 bits) */ #define MDPS_MAX_VAL 2048 -struct acpi_lis3lv02d adev; +struct acpi_lis3lv02d adev = { + .misc_wait = __WAIT_QUEUE_HEAD_INITIALIZER(adev.misc_wait), +}; + EXPORT_SYMBOL_GPL(adev); static int lis3lv02d_add_fs(struct acpi_device *device); @@ -110,26 +114,13 @@ static void lis3lv02d_get_xyz(acpi_handle handle, int *x, int *y, int *z) void lis3lv02d_poweroff(acpi_handle handle) { adev.is_on = 0; - /* disable X,Y,Z axis and power down */ - adev.write(handle, CTRL_REG1, 0x00); } EXPORT_SYMBOL_GPL(lis3lv02d_poweroff); void lis3lv02d_poweron(acpi_handle handle) { - u8 val; - adev.is_on = 1; adev.init(handle); - adev.write(handle, FF_WU_CFG, 0); - /* - * BDU: LSB and MSB values are not updated until both have been read. - * So the value read will always be correct. - * IEN: Interrupt for free-fall and DD, not for data-ready. - */ - adev.read(handle, CTRL_REG2, &val); - val |= CTRL2_BDU | CTRL2_IEN; - adev.write(handle, CTRL_REG2, val); } EXPORT_SYMBOL_GPL(lis3lv02d_poweron); @@ -162,6 +153,140 @@ static void lis3lv02d_decrease_use(struct acpi_lis3lv02d *dev) mutex_unlock(&dev->lock); } +static irqreturn_t lis302dl_interrupt(int irq, void *dummy) +{ + /* + * Be careful: on some HP laptops the bios force DD when on battery and + * the lid is closed. This leads to interrupts as soon as a little move + * is done. + */ + atomic_inc(&adev.count); + + wake_up_interruptible(&adev.misc_wait); + kill_fasync(&adev.async_queue, SIGIO, POLL_IN); + return IRQ_HANDLED; +} + +static int lis3lv02d_misc_open(struct inode *inode, struct file *file) +{ + int ret; + + if (test_and_set_bit(0, &adev.misc_opened)) + return -EBUSY; /* already open */ + + atomic_set(&adev.count, 0); + + /* + * The sensor can generate interrupts for free-fall and direction + * detection (distinguishable with FF_WU_SRC and DD_SRC) but to keep + * the things simple and _fast_ we activate it only for free-fall, so + * no need to read register (very slow with ACPI). For the same reason, + * we forbid shared interrupts. + * + * IRQF_TRIGGER_RISING seems pointless on HP laptops because the + * io-apic is not configurable (and generates a warning) but I keep it + * in case of support for other hardware. + */ + ret = request_irq(adev.irq, lis302dl_interrupt, IRQF_TRIGGER_RISING, + DRIVER_NAME, &adev); + + if (ret) { + clear_bit(0, &adev.misc_opened); + printk(KERN_ERR DRIVER_NAME ": IRQ%d allocation failed\n", adev.irq); + return -EBUSY; + } + lis3lv02d_increase_use(&adev); + printk("lis3: registered interrupt %d\n", adev.irq); + return 0; +} + +static int lis3lv02d_misc_release(struct inode *inode, struct file *file) +{ + fasync_helper(-1, file, 0, &adev.async_queue); + lis3lv02d_decrease_use(&adev); + free_irq(adev.irq, &adev); + clear_bit(0, &adev.misc_opened); /* release the device */ + return 0; +} + +static ssize_t lis3lv02d_misc_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + DECLARE_WAITQUEUE(wait, current); + u32 data; + unsigned char byte_data; + ssize_t retval = 1; + + if (count < 1) + return -EINVAL; + + add_wait_queue(&adev.misc_wait, &wait); + while (true) { + set_current_state(TASK_INTERRUPTIBLE); + data = atomic_xchg(&adev.count, 0); + if (data) + break; + + if (file->f_flags & O_NONBLOCK) { + retval = -EAGAIN; + goto out; + } + + if (signal_pending(current)) { + retval = -ERESTARTSYS; + goto out; + } + + schedule(); + } + + if (data < 255) + byte_data = data; + else + byte_data = 255; + + /* make sure we are not going into copy_to_user() with + * TASK_INTERRUPTIBLE state */ + set_current_state(TASK_RUNNING); + if (copy_to_user(buf, &byte_data, sizeof(byte_data))) + retval = -EFAULT; + +out: + __set_current_state(TASK_RUNNING); + remove_wait_queue(&adev.misc_wait, &wait); + + return retval; +} + +static unsigned int lis3lv02d_misc_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &adev.misc_wait, wait); + if (atomic_read(&adev.count)) + return POLLIN | POLLRDNORM; + return 0; +} + +static int lis3lv02d_misc_fasync(int fd, struct file *file, int on) +{ + return fasync_helper(fd, file, on, &adev.async_queue); +} + +static const struct file_operations lis3lv02d_misc_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = lis3lv02d_misc_read, + .open = lis3lv02d_misc_open, + .release = lis3lv02d_misc_release, + .poll = lis3lv02d_misc_poll, + .fasync = lis3lv02d_misc_fasync, +}; + +static struct miscdevice lis3lv02d_misc_device = { + .minor = MISC_DYNAMIC_MINOR, + .name = "freefall", + .fops = &lis3lv02d_misc_fops, +}; + /** * lis3lv02d_joystick_kthread - Kthread polling function * @data: unused - here to conform to threadfn prototype @@ -203,7 +328,6 @@ static void lis3lv02d_joystick_close(struct input_dev *input) lis3lv02d_decrease_use(&adev); } - static inline void lis3lv02d_calibrate_joystick(void) { lis3lv02d_get_xyz(adev.device->handle, &adev.xcalib, &adev.ycalib, &adev.zcalib); @@ -250,6 +374,7 @@ void lis3lv02d_joystick_disable(void) if (!adev.idev) return; + misc_deregister(&lis3lv02d_misc_device); input_unregister_device(adev.idev); adev.idev = NULL; } @@ -268,6 +393,19 @@ int lis3lv02d_init_device(struct acpi_lis3lv02d *dev) if (lis3lv02d_joystick_enable()) printk(KERN_ERR DRIVER_NAME ": joystick initialization failed\n"); + printk("lis3_init_device: irq %d\n", dev->irq); + + /* if we did not get an IRQ from ACPI - we have nothing more to do */ + if (!dev->irq) { + printk(KERN_ERR DRIVER_NAME + ": No IRQ in ACPI. Disabling /dev/freefall\n"); + goto out; + } + + printk("lis3: registering device\n"); + if (misc_register(&lis3lv02d_misc_device)) + printk(KERN_ERR DRIVER_NAME ": misc_register failed\n"); +out: lis3lv02d_decrease_use(dev); return 0; } @@ -351,6 +489,6 @@ int lis3lv02d_remove_fs(void) EXPORT_SYMBOL_GPL(lis3lv02d_remove_fs); MODULE_DESCRIPTION("ST LIS3LV02Dx three-axis digital accelerometer driver"); -MODULE_AUTHOR("Yan Burman and Eric Piel"); +MODULE_AUTHOR("Yan Burman, Eric Piel, Pavel Machek"); MODULE_LICENSE("GPL"); diff --git a/drivers/hwmon/lis3lv02d.h b/drivers/hwmon/lis3lv02d.h index 223f1c0763bb..2e7597c42d80 100644 --- a/drivers/hwmon/lis3lv02d.h +++ b/drivers/hwmon/lis3lv02d.h @@ -170,6 +170,11 @@ struct acpi_lis3lv02d { unsigned char is_on; /* whether the device is on or off */ unsigned char usage; /* usage counter */ struct axis_conversion ac; /* hw -> logical axis */ + + u32 irq; /* IRQ number */ + struct fasync_struct *async_queue; /* queue for the misc device */ + wait_queue_head_t misc_wait; /* Wait queue for the misc device */ + unsigned long misc_opened; /* bit0: whether the device is open */ }; int lis3lv02d_init_device(struct acpi_lis3lv02d *dev); From 137bad32342a613586347341d1307c2b9812ef44 Mon Sep 17 00:00:00 2001 From: Giuseppe Bilotta Date: Wed, 18 Feb 2009 14:48:24 -0800 Subject: [PATCH 129/143] lis3lv02d: support both one- and two-byte sensors Sensors responding with 0x3B to WHO_AM_I only have one data register per direction, thus returning a signed byte from the position which is occupied by the MSB in sensors responding with 0x3A. Since multiple sensors share the reply to WHO_AM_I, we rename the defines to better indicate what they identify (family of single and double precision sensors). We support both kind of sensors by checking for the sensor type on init and defining appropriate data-access routines and sensor limits (for the joystick) depending on what we find. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Giuseppe Bilotta Acked-by: Eric Piel Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hwmon/hp_accel.c | 36 ++++++++++++++++++++++++++++++++---- drivers/hwmon/lis3lv02d.c | 25 ++++++------------------- drivers/hwmon/lis3lv02d.h | 16 +++++++++++++--- 3 files changed, 51 insertions(+), 26 deletions(-) diff --git a/drivers/hwmon/hp_accel.c b/drivers/hwmon/hp_accel.c index dcefe1d2adbd..6b16566c4e6c 100644 --- a/drivers/hwmon/hp_accel.c +++ b/drivers/hwmon/hp_accel.c @@ -237,9 +237,25 @@ static void lis3lv02d_enum_resources(struct acpi_device *device) printk(KERN_DEBUG DRIVER_NAME ": Error getting resources\n"); } +static s16 lis3lv02d_read_16(acpi_handle handle, int reg) +{ + u8 lo, hi; + + adev.read(handle, reg - 1, &lo); + adev.read(handle, reg, &hi); + /* In "12 bit right justified" mode, bit 6, bit 7, bit 8 = bit 5 */ + return (s16)((hi << 8) | lo); +} + +static s16 lis3lv02d_read_8(acpi_handle handle, int reg) +{ + s8 lo; + adev.read(handle, reg, &lo); + return lo; +} + static int lis3lv02d_add(struct acpi_device *device) { - u8 val; int ret; if (!device) @@ -253,10 +269,22 @@ static int lis3lv02d_add(struct acpi_device *device) strcpy(acpi_device_class(device), ACPI_MDPS_CLASS); device->driver_data = &adev; - lis3lv02d_acpi_read(device->handle, WHO_AM_I, &val); - if ((val != LIS3LV02DL_ID) && (val != LIS302DL_ID)) { + lis3lv02d_acpi_read(device->handle, WHO_AM_I, &adev.whoami); + switch (adev.whoami) { + case LIS_DOUBLE_ID: + printk(KERN_INFO DRIVER_NAME ": 2-byte sensor found\n"); + adev.read_data = lis3lv02d_read_16; + adev.mdps_max_val = 2048; + break; + case LIS_SINGLE_ID: + printk(KERN_INFO DRIVER_NAME ": 1-byte sensor found\n"); + adev.read_data = lis3lv02d_read_8; + adev.mdps_max_val = 128; + break; + default: printk(KERN_ERR DRIVER_NAME - ": Accelerometer chip not LIS3LV02D{L,Q}\n"); + ": unknown sensor type 0x%X\n", adev.whoami); + return -EINVAL; } /* If possible use a "standard" axes order */ diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c index 3afa3afc77f3..8bb2158f0453 100644 --- a/drivers/hwmon/lis3lv02d.c +++ b/drivers/hwmon/lis3lv02d.c @@ -53,9 +53,6 @@ * joystick. */ -/* Maximum value our axis may get for the input device (signed 12 bits) */ -#define MDPS_MAX_VAL 2048 - struct acpi_lis3lv02d adev = { .misc_wait = __WAIT_QUEUE_HEAD_INITIALIZER(adev.misc_wait), }; @@ -64,16 +61,6 @@ EXPORT_SYMBOL_GPL(adev); static int lis3lv02d_add_fs(struct acpi_device *device); -static s16 lis3lv02d_read_16(acpi_handle handle, int reg) -{ - u8 lo, hi; - - adev.read(handle, reg, &lo); - adev.read(handle, reg + 1, &hi); - /* In "12 bit right justified" mode, bit 6, bit 7, bit 8 = bit 5 */ - return (s16)((hi << 8) | lo); -} - /** * lis3lv02d_get_axis - For the given axis, give the value converted * @axis: 1,2,3 - can also be negative @@ -102,9 +89,9 @@ static void lis3lv02d_get_xyz(acpi_handle handle, int *x, int *y, int *z) { int position[3]; - position[0] = lis3lv02d_read_16(handle, OUTX_L); - position[1] = lis3lv02d_read_16(handle, OUTY_L); - position[2] = lis3lv02d_read_16(handle, OUTZ_L); + position[0] = adev.read_data(handle, OUTX); + position[1] = adev.read_data(handle, OUTY); + position[2] = adev.read_data(handle, OUTZ); *x = lis3lv02d_get_axis(adev.ac.x, position); *y = lis3lv02d_get_axis(adev.ac.y, position); @@ -355,9 +342,9 @@ int lis3lv02d_joystick_enable(void) adev.idev->close = lis3lv02d_joystick_close; set_bit(EV_ABS, adev.idev->evbit); - input_set_abs_params(adev.idev, ABS_X, -MDPS_MAX_VAL, MDPS_MAX_VAL, 3, 3); - input_set_abs_params(adev.idev, ABS_Y, -MDPS_MAX_VAL, MDPS_MAX_VAL, 3, 3); - input_set_abs_params(adev.idev, ABS_Z, -MDPS_MAX_VAL, MDPS_MAX_VAL, 3, 3); + input_set_abs_params(adev.idev, ABS_X, -adev.mdps_max_val, adev.mdps_max_val, 3, 3); + input_set_abs_params(adev.idev, ABS_Y, -adev.mdps_max_val, adev.mdps_max_val, 3, 3); + input_set_abs_params(adev.idev, ABS_Z, -adev.mdps_max_val, adev.mdps_max_val, 3, 3); err = input_register_device(adev.idev); if (err) { diff --git a/drivers/hwmon/lis3lv02d.h b/drivers/hwmon/lis3lv02d.h index 2e7597c42d80..75972bf372ff 100644 --- a/drivers/hwmon/lis3lv02d.h +++ b/drivers/hwmon/lis3lv02d.h @@ -22,12 +22,15 @@ /* * The actual chip is STMicroelectronics LIS3LV02DL or LIS3LV02DQ that seems to * be connected via SPI. There exists also several similar chips (such as LIS302DL or - * LIS3L02DQ) but not in the HP laptops and they have slightly different registers. + * LIS3L02DQ) and they have slightly different registers, but we can provide a + * common interface for all of them. * They can also be connected via I²C. */ -#define LIS3LV02DL_ID 0x3A /* Also the LIS3LV02DQ */ -#define LIS302DL_ID 0x3B /* Also the LIS202DL! */ +/* 2-byte registers */ +#define LIS_DOUBLE_ID 0x3A /* LIS3LV02D[LQ] */ +/* 1-byte registers */ +#define LIS_SINGLE_ID 0x3B /* LIS[32]02DL and others */ enum lis3lv02d_reg { WHO_AM_I = 0x0F, @@ -44,10 +47,13 @@ enum lis3lv02d_reg { STATUS_REG = 0x27, OUTX_L = 0x28, OUTX_H = 0x29, + OUTX = 0x29, OUTY_L = 0x2A, OUTY_H = 0x2B, + OUTY = 0x2B, OUTZ_L = 0x2C, OUTZ_H = 0x2D, + OUTZ = 0x2D, FF_WU_CFG = 0x30, FF_WU_SRC = 0x31, FF_WU_ACK = 0x32, @@ -159,6 +165,10 @@ struct acpi_lis3lv02d { acpi_status (*write) (acpi_handle handle, int reg, u8 val); acpi_status (*read) (acpi_handle handle, int reg, u8 *ret); + u8 whoami; /* 3Ah: 2-byte registries, 3Bh: 1-byte registries */ + s16 (*read_data) (acpi_handle handle, int reg); + int mdps_max_val; + struct input_dev *idev; /* input device */ struct task_struct *kthread; /* kthread for input */ struct mutex lock; From 9ccf3b5e8409927835c4d38cb2f380c9e4349e76 Mon Sep 17 00:00:00 2001 From: Giuseppe Bilotta Date: Wed, 18 Feb 2009 14:48:25 -0800 Subject: [PATCH 130/143] lis3lv02d: add axes knowledge of HP Pavilion dv5 models Add support for HP Pavilion dv5. Since Intel-based models have an inverted x axis, while AMD-based models have an inverted y axis, we introduce a new macro that special-cases axis orientation based on two DMI entries: HP dv5 axis configuration is then based on both the PRODUCT and BOARD name. Signed-off-by: Giuseppe Bilotta Cc: Eric Piel Cc: Pavel Machek Tested-by: Palatis Tseng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hwmon/hp_accel.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/hwmon/hp_accel.c b/drivers/hwmon/hp_accel.c index 6b16566c4e6c..29c83b5b9697 100644 --- a/drivers/hwmon/hp_accel.c +++ b/drivers/hwmon/hp_accel.c @@ -166,6 +166,18 @@ static struct axis_conversion lis3lv02d_axis_xy_swap_yz_inverted = {2, -1, -3}; }, \ .driver_data = &lis3lv02d_axis_##_axis \ } + +#define AXIS_DMI_MATCH2(_ident, _class1, _name1, \ + _class2, _name2, \ + _axis) { \ + .ident = _ident, \ + .callback = lis3lv02d_dmi_matched, \ + .matches = { \ + DMI_MATCH(DMI_##_class1, _name1), \ + DMI_MATCH(DMI_##_class2, _name2), \ + }, \ + .driver_data = &lis3lv02d_axis_##_axis \ +} static struct dmi_system_id lis3lv02d_dmi_ids[] = { /* product names are truncated to match all kinds of a same model */ AXIS_DMI_MATCH("NC64x0", "HP Compaq nc64", x_inverted), @@ -179,6 +191,16 @@ static struct dmi_system_id lis3lv02d_dmi_ids[] = { AXIS_DMI_MATCH("NC673x", "HP Compaq 673", xy_rotated_left_usd), AXIS_DMI_MATCH("NC651xx", "HP Compaq 651", xy_rotated_right), AXIS_DMI_MATCH("NC671xx", "HP Compaq 671", xy_swap_yz_inverted), + /* Intel-based HP Pavilion dv5 */ + AXIS_DMI_MATCH2("HPDV5_I", + PRODUCT_NAME, "HP Pavilion dv5", + BOARD_NAME, "3603", + x_inverted), + /* AMD-based HP Pavilion dv5 */ + AXIS_DMI_MATCH2("HPDV5_A", + PRODUCT_NAME, "HP Pavilion dv5", + BOARD_NAME, "3600", + y_inverted), { NULL, } /* Laptop models without axis info (yet): * "NC6910" "HP Compaq 6910" From 287d859222e0adbc67666a6154aaf42d7d5bbb54 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 18 Feb 2009 14:48:26 -0800 Subject: [PATCH 131/143] atmel-mci: fix initialization of dma slave data The conversion of atmel-mci to dma_request_channel missed the initialization of the channel dma_slave information. The filter_fn passed to dma_request_channel is responsible for initializing the channel's private data. This implementation has the additional benefit of enabling a generic client-channel data passing mechanism. Reviewed-by: Atsushi Nemoto Signed-off-by: Dan Williams Acked-by: Haavard Skinnemoen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dma/dmaengine.c | 2 ++ drivers/dma/dw_dmac.c | 5 ++--- drivers/dma/dw_dmac_regs.h | 2 -- drivers/mmc/host/atmel-mci.c | 5 +++-- include/linux/dmaengine.h | 2 ++ 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index a58993011edb..280a9d263eb3 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -518,6 +518,7 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v dma_chan_name(chan), err); else break; + chan->private = NULL; chan = NULL; } } @@ -536,6 +537,7 @@ void dma_release_channel(struct dma_chan *chan) WARN_ONCE(chan->client_count != 1, "chan reference count %d != 1\n", chan->client_count); dma_chan_put(chan); + chan->private = NULL; mutex_unlock(&dma_list_mutex); } EXPORT_SYMBOL_GPL(dma_release_channel); diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c index 6b702cc46b3d..a97c07eef7ec 100644 --- a/drivers/dma/dw_dmac.c +++ b/drivers/dma/dw_dmac.c @@ -560,7 +560,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, unsigned long flags) { struct dw_dma_chan *dwc = to_dw_dma_chan(chan); - struct dw_dma_slave *dws = dwc->dws; + struct dw_dma_slave *dws = chan->private; struct dw_desc *prev; struct dw_desc *first; u32 ctllo; @@ -790,7 +790,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) cfghi = DWC_CFGH_FIFO_MODE; cfglo = 0; - dws = dwc->dws; + dws = chan->private; if (dws) { /* * We need controller-specific data to set up slave @@ -866,7 +866,6 @@ static void dwc_free_chan_resources(struct dma_chan *chan) spin_lock_bh(&dwc->lock); list_splice_init(&dwc->free_list, &list); dwc->descs_allocated = 0; - dwc->dws = NULL; /* Disable interrupts */ channel_clear_bit(dw, MASK.XFER, dwc->mask); diff --git a/drivers/dma/dw_dmac_regs.h b/drivers/dma/dw_dmac_regs.h index 00fdd187bb0c..b252b202c5cf 100644 --- a/drivers/dma/dw_dmac_regs.h +++ b/drivers/dma/dw_dmac_regs.h @@ -139,8 +139,6 @@ struct dw_dma_chan { struct list_head queue; struct list_head free_list; - struct dw_dma_slave *dws; - unsigned int descs_allocated; }; diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 76bfe16c09b1..2b1196e6142c 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -1548,9 +1548,10 @@ static bool filter(struct dma_chan *chan, void *slave) { struct dw_dma_slave *dws = slave; - if (dws->dma_dev == chan->device->dev) + if (dws->dma_dev == chan->device->dev) { + chan->private = dws; return true; - else + } else return false; } #endif diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 3e68469c1885..f0413845f20e 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -121,6 +121,7 @@ struct dma_chan_percpu { * @local: per-cpu pointer to a struct dma_chan_percpu * @client-count: how many clients are using this channel * @table_count: number of appearances in the mem-to-mem allocation table + * @private: private data for certain client-channel associations */ struct dma_chan { struct dma_device *device; @@ -134,6 +135,7 @@ struct dma_chan { struct dma_chan_percpu *local; int client_count; int table_count; + void *private; }; /** From 27c0c8e511fa9e2389503926840fac606d90a049 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Wed, 18 Feb 2009 14:48:28 -0800 Subject: [PATCH 132/143] atmel_serial might lose modem status change I found a problem of handling of modem status of atmel_serial driver. With the commit 1ecc26 ("atmel_serial: split the interrupt handler"), handling of modem status signal was splitted into two parts. The atmel_tasklet_func() compares new status with irq_status_prev, but irq_status_prev is not correct if signal status was changed while the port is closed. Here is a sequence to cause problem: 1. Remote side sets CTS (and DSR). 2. Local side close the port. 3. Local side clears RTS and DTR. 4. Remote side clears CTS and DSR. 5. Local side reopen the port. hw_stopped becomes 1. 6. Local side sets RTS and DTR. 7. Remote side sets CTS and DSR. Then CTS change interrupt can be received, but since CTS bit in irq_status_prev and new status is same, uart_handle_cts_change() will not be called (so hw_stopped will not be cleared, i.e. cannot send any data). I suppose irq_status_prev should be initialized at somewhere in open sequence. Itai Levi pointed out that we need to initialize atmel_port->irq_status as well here. His analysis is as follows: > Regarding the second part of the patch (which resets irq_status_prev), > it turns out that both versions of the patch (mine and Atsushi's) > still leave enough room for faulty behavior when opening the port. > > This is because we are not resetting both irq_status_prev and > irq_status in atmel_startup() to CSR, which leads faulty behavior in > the following sequences: > > First case: > 1. closing the port while CTS line = 1 (TX not allowed) > 2. setting CTS line = 0 (TX allowed) > 3. opening the port > 4. transmitting one char > 5. Cannot transmit more chars, although CTS line is 0 > > Second case: > 1. closing the port while CTS line = 0 (TX allowed) > 2. setting CTS line = 1 (TX not allowed) > 3. opening the port > 4. receiving some chars > 5. Now we can transmit, although CTS line is 1 > > This reason for this is that the tasklet is scheduled as a result of > TX or RX interrupts (not a status change!), in steps 4 above. Inside > the tasklet, the atmel_port->irq_status (which holds the value from > the previous session) is compared to atmel_port->irq_status_prev. > Hence, a status-change of the CTS line is faultily detected. > > Both cases were verified on 9260 hardware. [haavard.skinnemoen@atmel.com: folded with patch from Itai Levi] Signed-off-by: Atsushi Nemoto Signed-off-by: Haavard Skinnemoen Cc: Remy Bohmer Cc: Marc Pignat Cc: Itai Levi Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/serial/atmel_serial.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/serial/atmel_serial.c b/drivers/serial/atmel_serial.c index 89362d733d62..8f58f7ff0dd7 100644 --- a/drivers/serial/atmel_serial.c +++ b/drivers/serial/atmel_serial.c @@ -877,6 +877,10 @@ static int atmel_startup(struct uart_port *port) } } + /* Save current CSR for comparison in atmel_tasklet_func() */ + atmel_port->irq_status_prev = UART_GET_CSR(port); + atmel_port->irq_status = atmel_port->irq_status_prev; + /* * Finally, enable the serial port */ From ada723dcd681e2dffd7d73345cc8fda0eb0df9bd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 18 Feb 2009 14:48:30 -0800 Subject: [PATCH 133/143] fs/super.c: add lockdep annotation to s_umount Li Zefan said: Thread 1: for ((; ;)) { mount -t cpuset xxx /mnt > /dev/null 2>&1 cat /mnt/cpus > /dev/null 2>&1 umount /mnt > /dev/null 2>&1 } Thread 2: for ((; ;)) { mount -t cpuset xxx /mnt > /dev/null 2>&1 umount /mnt > /dev/null 2>&1 } (Note: It is irrelevant which cgroup subsys is used.) After a while a lockdep warning showed up: ============================================= [ INFO: possible recursive locking detected ] 2.6.28 #479 --------------------------------------------- mount/13554 is trying to acquire lock: (&type->s_umount_key#19){--..}, at: [] sget+0x5e/0x321 but task is already holding lock: (&type->s_umount_key#19){--..}, at: [] sget+0x1e2/0x321 other info that might help us debug this: 1 lock held by mount/13554: #0: (&type->s_umount_key#19){--..}, at: [] sget+0x1e2/0x321 stack backtrace: Pid: 13554, comm: mount Not tainted 2.6.28-mc #479 Call Trace: [] validate_chain+0x4c6/0xbbd [] __lock_acquire+0x676/0x700 [] lock_acquire+0x5d/0x7a [] ? sget+0x5e/0x321 [] down_write+0x34/0x50 [] ? sget+0x5e/0x321 [] sget+0x5e/0x321 [] ? cgroup_set_super+0x0/0x3e [] ? cgroup_test_super+0x0/0x2f [] cgroup_get_sb+0x98/0x2e7 [] cpuset_get_sb+0x4a/0x5f [] vfs_kern_mount+0x40/0x7b [] do_kern_mount+0x37/0xbf [] do_mount+0x5c3/0x61a [] ? copy_mount_options+0x2c/0x111 [] sys_mount+0x69/0xa0 [] sysenter_do_call+0x12/0x31 The cause is after alloc_super() and then retry, an old entry in list fs_supers is found, so grab_super(old) is called, but both functions hold s_umount lock: struct super_block *sget(...) { ... retry: spin_lock(&sb_lock); if (test) { list_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (!grab_super(old)) <--- 2nd: down_write(&old->s_umount); goto retry; if (s) destroy_super(s); return old; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(type); <--- 1th: down_write(&s->s_umount) if (!s) return ERR_PTR(-ENOMEM); goto retry; } ... } It seems like a false positive, and seems like VFS but not cgroup needs to be fixed. Peter said: We can simply put the new s_umount instance in a but lockdep doesn't particularly cares about subclass order. If there's any issue with the callers of sget() assuming the s_umount lock being of sublcass 0, then there is another annotation we can use to fix that, but lets not bother with that if this is sufficient. Addresses http://bugzilla.kernel.org/show_bug.cgi?id=12673 Signed-off-by: Peter Zijlstra Tested-by: Li Zefan Reported-by: Li Zefan Cc: Al Viro Cc: Paul Menage Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/super.c b/fs/super.c index 61dce001dd57..8349ed6b1412 100644 --- a/fs/super.c +++ b/fs/super.c @@ -82,7 +82,22 @@ static struct super_block *alloc_super(struct file_system_type *type) * lock ordering than usbfs: */ lockdep_set_class(&s->s_lock, &type->s_lock_key); - down_write(&s->s_umount); + /* + * sget() can have s_umount recursion. + * + * When it cannot find a suitable sb, it allocates a new + * one (this one), and tries again to find a suitable old + * one. + * + * In case that succeeds, it will acquire the s_umount + * lock of the old one. Since these are clearly distrinct + * locks, and this object isn't exposed yet, there's no + * risk of deadlocks. + * + * Annotate this by putting this lock in a different + * subclass. + */ + down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); s->s_count = S_BIAS; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); From f2dbcfa738368c8a40d4a5f0b65dc9879577cb21 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 18 Feb 2009 14:48:32 -0800 Subject: [PATCH 134/143] mm: clean up for early_pfn_to_nid() What's happening is that the assertion in mm/page_alloc.c:move_freepages() is triggering: BUG_ON(page_zone(start_page) != page_zone(end_page)); Once I knew this is what was happening, I added some annotations: if (unlikely(page_zone(start_page) != page_zone(end_page))) { printk(KERN_ERR "move_freepages: Bogus zones: " "start_page[%p] end_page[%p] zone[%p]\n", start_page, end_page, zone); printk(KERN_ERR "move_freepages: " "start_zone[%p] end_zone[%p]\n", page_zone(start_page), page_zone(end_page)); printk(KERN_ERR "move_freepages: " "start_pfn[0x%lx] end_pfn[0x%lx]\n", page_to_pfn(start_page), page_to_pfn(end_page)); printk(KERN_ERR "move_freepages: " "start_nid[%d] end_nid[%d]\n", page_to_nid(start_page), page_to_nid(end_page)); ... And here's what I got: move_freepages: Bogus zones: start_page[2207d0000] end_page[2207dffc0] zone[fffff8103effcb00] move_freepages: start_zone[fffff8103effcb00] end_zone[fffff8003fffeb00] move_freepages: start_pfn[0x81f600] end_pfn[0x81f7ff] move_freepages: start_nid[1] end_nid[0] My memory layout on this box is: [ 0.000000] Zone PFN ranges: [ 0.000000] Normal 0x00000000 -> 0x0081ff5d [ 0.000000] Movable zone start PFN for each node [ 0.000000] early_node_map[8] active PFN ranges [ 0.000000] 0: 0x00000000 -> 0x00020000 [ 0.000000] 1: 0x00800000 -> 0x0081f7ff [ 0.000000] 1: 0x0081f800 -> 0x0081fe50 [ 0.000000] 1: 0x0081fed1 -> 0x0081fed8 [ 0.000000] 1: 0x0081feda -> 0x0081fedb [ 0.000000] 1: 0x0081fedd -> 0x0081fee5 [ 0.000000] 1: 0x0081fee7 -> 0x0081ff51 [ 0.000000] 1: 0x0081ff59 -> 0x0081ff5d So it's a block move in that 0x81f600-->0x81f7ff region which triggers the problem. This patch: Declaration of early_pfn_to_nid() is scattered over per-arch include files, and it seems it's complicated to know when the declaration is used. I think it makes fix-for-memmap-init not easy. This patch moves all declaration to include/linux/mm.h After this, if !CONFIG_NODES_POPULATES_NODE_MAP && !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -> Use static definition in include/linux/mm.h else if !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -> Use generic definition in mm/page_alloc.c else -> per-arch back end function will be called. Signed-off-by: KAMEZAWA Hiroyuki Tested-by: KOSAKI Motohiro Reported-by: David Miller Cc: Mel Gorman Cc: Heiko Carstens Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/mmzone.h | 4 ---- arch/ia64/mm/numa.c | 2 +- arch/x86/include/asm/mmzone_32.h | 2 -- arch/x86/include/asm/mmzone_64.h | 2 -- arch/x86/mm/numa_64.c | 2 +- include/linux/mm.h | 19 ++++++++++++++++--- mm/page_alloc.c | 8 +++++++- 7 files changed, 25 insertions(+), 14 deletions(-) diff --git a/arch/ia64/include/asm/mmzone.h b/arch/ia64/include/asm/mmzone.h index 34efe88eb849..f2ca32069b3f 100644 --- a/arch/ia64/include/asm/mmzone.h +++ b/arch/ia64/include/asm/mmzone.h @@ -31,10 +31,6 @@ static inline int pfn_to_nid(unsigned long pfn) #endif } -#ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -extern int early_pfn_to_nid(unsigned long pfn); -#endif - #ifdef CONFIG_IA64_DIG /* DIG systems are small */ # define MAX_PHYSNODE_ID 8 # define NR_NODE_MEMBLKS (MAX_NUMNODES * 8) diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c index b73bf1838e57..5061c3fb6796 100644 --- a/arch/ia64/mm/numa.c +++ b/arch/ia64/mm/numa.c @@ -58,7 +58,7 @@ paddr_to_nid(unsigned long paddr) * SPARSEMEM to allocate the SPARSEMEM sectionmap on the NUMA node where * the section resides. */ -int early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn) { int i, section = pfn >> PFN_SECTION_SHIFT, ssec, esec; diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 07f1af494ca5..105fb90a0635 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -32,8 +32,6 @@ static inline void get_memcfg_numa(void) get_memcfg_numa_flat(); } -extern int early_pfn_to_nid(unsigned long pfn); - extern void resume_map_numa_kva(pgd_t *pgd); #else /* !CONFIG_NUMA */ diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index a5b3817d4b9e..a29f48c2a322 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h @@ -40,8 +40,6 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ NODE_DATA(nid)->node_spanned_pages) -extern int early_pfn_to_nid(unsigned long pfn); - #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 71a14f89f89e..f3516da035d1 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -145,7 +145,7 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes, return shift; } -int early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn) { return phys_to_nid(pfn << PAGE_SHIFT); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 10074212a35b..065cdf8c09fb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1041,10 +1041,23 @@ extern void free_bootmem_with_active_regions(int nid, typedef int (*work_fn_t)(unsigned long, unsigned long, void *); extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -extern int early_pfn_to_nid(unsigned long pfn); -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ + +#if !defined(CONFIG_ARCH_POPULATES_NODE_MAP) && \ + !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) +static inline int __early_pfn_to_nid(unsigned long pfn) +{ + return 0; +} +#else +/* please see mm/page_alloc.c */ +extern int __meminit early_pfn_to_nid(unsigned long pfn); +#ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +/* there is a per-arch backend function. */ +extern int __meminit __early_pfn_to_nid(unsigned long pfn); +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ +#endif + extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5675b3073854..c5dd74602efc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2989,7 +2989,7 @@ static int __meminit next_active_region_index_in_nid(int index, int nid) * was used and there are no special requirements, this is a convenient * alternative */ -int __meminit early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn) { int i; @@ -3005,6 +3005,12 @@ int __meminit early_pfn_to_nid(unsigned long pfn) } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ +int __meminit early_pfn_to_nid(unsigned long pfn) +{ + return __early_pfn_to_nid(pfn); +} + + /* Basic iterator support to walk early_node_map[] */ #define for_each_active_range_index_in_nid(i, nid) \ for (i = first_active_region_index_in_nid(nid); i != -1; \ From cc2559bccc72767cb446f79b071d96c30c26439b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 18 Feb 2009 14:48:33 -0800 Subject: [PATCH 135/143] mm: fix memmap init for handling memory hole Now, early_pfn_in_nid(PFN, NID) may returns false if PFN is a hole. and memmap initialization was not done. This was a trouble for sparc boot. To fix this, the PFN should be initialized and marked as PG_reserved. This patch changes early_pfn_in_nid() return true if PFN is a hole. Signed-off-by: KAMEZAWA Hiroyuki Reported-by: David Miller Tested-by: KOSAKI Motohiro Cc: Mel Gorman Cc: Heiko Carstens Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/numa.c | 2 +- include/linux/mmzone.h | 2 +- mm/page_alloc.c | 23 ++++++++++++++++++++--- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c index 5061c3fb6796..3efea7d0a351 100644 --- a/arch/ia64/mm/numa.c +++ b/arch/ia64/mm/numa.c @@ -70,7 +70,7 @@ int __meminit __early_pfn_to_nid(unsigned long pfn) return node_memblk[i].nid; } - return 0; + return -1; } #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 09c14e213b63..1aca6cebbb78 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1071,7 +1071,7 @@ void sparse_init(void); #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_NODES_SPAN_OTHER_NODES -#define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid)) +bool early_pfn_in_nid(unsigned long pfn, int nid); #else #define early_pfn_in_nid(pfn, nid) (1) #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c5dd74602efc..5c44ed49ca93 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3000,16 +3000,33 @@ int __meminit __early_pfn_to_nid(unsigned long pfn) if (start_pfn <= pfn && pfn < end_pfn) return early_node_map[i].nid; } - - return 0; + /* This is a memory hole */ + return -1; } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ int __meminit early_pfn_to_nid(unsigned long pfn) { - return __early_pfn_to_nid(pfn); + int nid; + + nid = __early_pfn_to_nid(pfn); + if (nid >= 0) + return nid; + /* just returns 0 */ + return 0; } +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + int nid; + + nid = __early_pfn_to_nid(pfn); + if (nid >= 0 && nid != node) + return false; + return true; +} +#endif /* Basic iterator support to walk early_node_map[] */ #define for_each_active_range_index_in_nid(i, nid) \ From ffa7525c13eb3db0fd19a3e1cffe2ce6f561f5f3 Mon Sep 17 00:00:00 2001 From: Adam Lackorzynski Date: Wed, 18 Feb 2009 14:48:34 -0800 Subject: [PATCH 136/143] jsm: additional device support I have a Digi Neo 8 PCI card (114f:00b1) Serial controller: Digi International Digi Neo 8 (rev 05) that works with the jsm driver after using the following patch. Signed-off-by: Adam Lackorzynski Cc: Scott H Kilau Cc: Wendy Xiong Acked-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/serial/jsm/jsm_driver.c | 3 +++ include/linux/pci_ids.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/serial/jsm/jsm_driver.c b/drivers/serial/jsm/jsm_driver.c index 92187e28608a..ac79cbe4c2cf 100644 --- a/drivers/serial/jsm/jsm_driver.c +++ b/drivers/serial/jsm/jsm_driver.c @@ -84,6 +84,8 @@ static int jsm_probe_one(struct pci_dev *pdev, const struct pci_device_id *ent) brd->pci_dev = pdev; if (pdev->device == PCIE_DEVICE_ID_NEO_4_IBM) brd->maxports = 4; + else if (pdev->device == PCI_DEVICE_ID_DIGI_NEO_8) + brd->maxports = 8; else brd->maxports = 2; @@ -212,6 +214,7 @@ static struct pci_device_id jsm_pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_DIGI, PCI_DEVICE_ID_NEO_2RJ45), 0, 0, 2 }, { PCI_DEVICE(PCI_VENDOR_ID_DIGI, PCI_DEVICE_ID_NEO_2RJ45PRI), 0, 0, 3 }, { PCI_DEVICE(PCI_VENDOR_ID_DIGI, PCIE_DEVICE_ID_NEO_4_IBM), 0, 0, 4 }, + { PCI_DEVICE(PCI_VENDOR_ID_DIGI, PCI_DEVICE_ID_DIGI_NEO_8), 0, 0, 5 }, { 0, } }; MODULE_DEVICE_TABLE(pci, jsm_pci_tbl); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 918391b4b109..114b8192eab9 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1445,6 +1445,7 @@ #define PCI_DEVICE_ID_DIGI_DF_M_E 0x0071 #define PCI_DEVICE_ID_DIGI_DF_M_IOM2_A 0x0072 #define PCI_DEVICE_ID_DIGI_DF_M_A 0x0073 +#define PCI_DEVICE_ID_DIGI_NEO_8 0x00B1 #define PCI_DEVICE_ID_NEO_2DB9 0x00C8 #define PCI_DEVICE_ID_NEO_2DB9PRI 0x00C9 #define PCI_DEVICE_ID_NEO_2RJ45 0x00CA From 5a74db06cc8d36a325913aa4968ae169f997a466 Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Wed, 18 Feb 2009 14:48:36 -0800 Subject: [PATCH 137/143] floppy: request and release only the ports we actually use The floppy driver requests an I/O port it doesn't need, and sometimes this causes a conflict with a motherboard device reported by PNPBIOS. This patch makes the floppy driver request and release only the ports it actually uses. It also factors out the request/release stuff and the io-ports list so they're all in one place now. The current floppy driver uses only these ports: 0x3f2 (FD_DOR) 0x3f4 (FD_STATUS) 0x3f5 (FD_DATA) 0x3f7 (FD_DCR/FD_DIR) but it requests 0x3f2-0x3f5 and 0x3f7, which includes the unused port 0x3f3. Some BIOSes report 0x3f3 as a motherboard resource. The PNP system driver reserves that, which causes a conflict when the floppy driver requests 0x3f2-0x3f5 later. Philippe reported that this conflict broke the floppy driver between 2.6.11 and 2.6.22. His PNPBIOS reports these devices: $ cat 00:07/id 00:07/resources # motherboard device PNP0c02 state = active io 0x80-0x80 io 0x10-0x1f io 0x22-0x3f io 0x44-0x5f io 0x90-0x9f io 0xa2-0xbf io 0x3f0-0x3f1 io 0x3f3-0x3f3 $ cat 00:03/id 00:03/resources # floppy device PNP0700 state = active io 0x3f4-0x3f5 io 0x3f2-0x3f2 Reference: http://lkml.org/lkml/2009/1/31/162 Signed-off-by: Bjorn Helgaas Signed-off-by: Philippe De Muyter Reported-by: Philippe De Muyter Tested-by: Philippe De Muyter Cc: Adam M Belay Cc: Robert Hancock Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/floppy.c | 79 +++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 27 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index cf29cc4e6ab7..83d8ed39433d 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -558,6 +558,8 @@ static void process_fd_request(void); static void recalibrate_floppy(void); static void floppy_shutdown(unsigned long); +static int floppy_request_regions(int); +static void floppy_release_regions(int); static int floppy_grab_irq_and_dma(void); static void floppy_release_irq_and_dma(void); @@ -4274,8 +4276,7 @@ static int __init floppy_init(void) FDCS->rawcmd = 2; if (user_reset_fdc(-1, FD_RESET_ALWAYS, 0)) { /* free ioports reserved by floppy_grab_irq_and_dma() */ - release_region(FDCS->address + 2, 4); - release_region(FDCS->address + 7, 1); + floppy_release_regions(fdc); FDCS->address = -1; FDCS->version = FDC_NONE; continue; @@ -4284,8 +4285,7 @@ static int __init floppy_init(void) FDCS->version = get_fdc_version(); if (FDCS->version == FDC_NONE) { /* free ioports reserved by floppy_grab_irq_and_dma() */ - release_region(FDCS->address + 2, 4); - release_region(FDCS->address + 7, 1); + floppy_release_regions(fdc); FDCS->address = -1; continue; } @@ -4358,6 +4358,47 @@ out_put_disk: static DEFINE_SPINLOCK(floppy_usage_lock); +static const struct io_region { + int offset; + int size; +} io_regions[] = { + { 2, 1 }, + /* address + 3 is sometimes reserved by pnp bios for motherboard */ + { 4, 2 }, + /* address + 6 is reserved, and may be taken by IDE. + * Unfortunately, Adaptec doesn't know this :-(, */ + { 7, 1 }, +}; + +static void floppy_release_allocated_regions(int fdc, const struct io_region *p) +{ + while (p != io_regions) { + p--; + release_region(FDCS->address + p->offset, p->size); + } +} + +#define ARRAY_END(X) (&((X)[ARRAY_SIZE(X)])) + +static int floppy_request_regions(int fdc) +{ + const struct io_region *p; + + for (p = io_regions; p < ARRAY_END(io_regions); p++) { + if (!request_region(FDCS->address + p->offset, p->size, "floppy")) { + DPRINT("Floppy io-port 0x%04lx in use\n", FDCS->address + p->offset); + floppy_release_allocated_regions(fdc, p); + return -EBUSY; + } + } + return 0; +} + +static void floppy_release_regions(int fdc) +{ + floppy_release_allocated_regions(fdc, ARRAY_END(io_regions)); +} + static int floppy_grab_irq_and_dma(void) { unsigned long flags; @@ -4399,18 +4440,8 @@ static int floppy_grab_irq_and_dma(void) for (fdc = 0; fdc < N_FDC; fdc++) { if (FDCS->address != -1) { - if (!request_region(FDCS->address + 2, 4, "floppy")) { - DPRINT("Floppy io-port 0x%04lx in use\n", - FDCS->address + 2); - goto cleanup1; - } - if (!request_region(FDCS->address + 7, 1, "floppy DIR")) { - DPRINT("Floppy io-port 0x%04lx in use\n", - FDCS->address + 7); - goto cleanup2; - } - /* address + 6 is reserved, and may be taken by IDE. - * Unfortunately, Adaptec doesn't know this :-(, */ + if (floppy_request_regions(fdc)) + goto cleanup; } } for (fdc = 0; fdc < N_FDC; fdc++) { @@ -4432,15 +4463,11 @@ static int floppy_grab_irq_and_dma(void) fdc = 0; irqdma_allocated = 1; return 0; -cleanup2: - release_region(FDCS->address + 2, 4); -cleanup1: +cleanup: fd_free_irq(); fd_free_dma(); - while (--fdc >= 0) { - release_region(FDCS->address + 2, 4); - release_region(FDCS->address + 7, 1); - } + while (--fdc >= 0) + floppy_release_regions(fdc); spin_lock_irqsave(&floppy_usage_lock, flags); usage_count--; spin_unlock_irqrestore(&floppy_usage_lock, flags); @@ -4501,10 +4528,8 @@ static void floppy_release_irq_and_dma(void) #endif old_fdc = fdc; for (fdc = 0; fdc < N_FDC; fdc++) - if (FDCS->address != -1) { - release_region(FDCS->address + 2, 4); - release_region(FDCS->address + 7, 1); - } + if (FDCS->address != -1) + floppy_release_regions(fdc); fdc = old_fdc; } From a1a5c3b9237662f326cc730e167e7524b5d05a36 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Wed, 18 Feb 2009 14:48:38 -0800 Subject: [PATCH 138/143] fbdev/drm: fix Kconfig submenu mess in "Graphics support" Submenus of the graphics support "Support for frame buffer devices" and "Direct Rendering Manager (XFree86 4.1.0 and higher DRI support)" are broken in half after latest changes for Intel 915 mode setting support. The DRM subsection is broken because one option is put outside the choice section it depends on. The frame buffers part is broken then due to circular dependency. Fix this by make Intel frame buffers depend on CONFIG_INTEL_AGP. Kconfigs are broken by d2f59357700487a8b944f4f7777d1e97cf5ea2ed ("drm/i915: select framebuffer support automatically"). This is probably not only way to fix this. Signed-off-by: Krzysztof Helt Cc: Ingo Molnar Cc: Dave Airlie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/Kconfig | 13 ++++++------- drivers/video/Kconfig | 10 ++-------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 4be3acbaaf9a..3a22eb9be378 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -80,18 +80,17 @@ config DRM_I915 XFree86 4.4 and above. If unsure, build this and i830 as modules and the X server will load the correct one. -endchoice - config DRM_I915_KMS bool "Enable modesetting on intel by default" depends on DRM_I915 help - Choose this option if you want kernel modesetting enabled by default, - and you have a new enough userspace to support this. Running old - userspaces with this enabled will cause pain. Note that this causes - the driver to bind to PCI devices, which precludes loading things - like intelfb. + Choose this option if you want kernel modesetting enabled by default, + and you have a new enough userspace to support this. Running old + userspaces with this enabled will cause pain. Note that this causes + the driver to bind to PCI devices, which precludes loading things + like intelfb. +endchoice config DRM_MGA tristate "Matrox g200/g400" diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index bf0af660df8a..fb19803060cf 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -1054,10 +1054,7 @@ config FB_RIVA_BACKLIGHT config FB_I810 tristate "Intel 810/815 support (EXPERIMENTAL)" - depends on EXPERIMENTAL && PCI && X86_32 - select AGP - select AGP_INTEL - select FB + depends on EXPERIMENTAL && FB && PCI && X86_32 && AGP_INTEL select FB_MODE_HELPERS select FB_CFB_FILLRECT select FB_CFB_COPYAREA @@ -1120,10 +1117,7 @@ config FB_CARILLO_RANCH config FB_INTEL tristate "Intel 830M/845G/852GM/855GM/865G/915G/945G/945GM/965G/965GM support (EXPERIMENTAL)" - depends on EXPERIMENTAL && PCI && X86 - select FB - select AGP - select AGP_INTEL + depends on EXPERIMENTAL && FB && PCI && X86 && AGP_INTEL select FB_MODE_HELPERS select FB_CFB_FILLRECT select FB_CFB_COPYAREA From 2db69a9340da12a4db44edb7506dd68799aeff55 Mon Sep 17 00:00:00 2001 From: Bill Nottingham Date: Wed, 18 Feb 2009 14:48:39 -0800 Subject: [PATCH 139/143] vt: Declare PIO_CMAP/GIO_CMAP as compatbile ioctls. Otherwise, these don't work when called from 32-bit userspace on 64-bit kernels. Cc: Jiri Kosina Cc: Alan Cox Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 9c6d815dd191..39bd4d38e889 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1938,6 +1938,8 @@ ULONG_IOCTL(SET_BITMAP_FILE) /* Big K */ COMPATIBLE_IOCTL(PIO_FONT) COMPATIBLE_IOCTL(GIO_FONT) +COMPATIBLE_IOCTL(PIO_CMAP) +COMPATIBLE_IOCTL(GIO_CMAP) ULONG_IOCTL(KDSIGACCEPT) COMPATIBLE_IOCTL(KDGETKEYCODE) COMPATIBLE_IOCTL(KDSETKEYCODE) From 310d8c93f9f07499cd7ca82d7997774a89de00e7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 18 Feb 2009 14:48:39 -0800 Subject: [PATCH 140/143] x86: dell-laptop: depends on POWER_SUPPLY Build breaks when DELL_LAPTOP=y and POWER_SUPPLY=m. DELL_LAPTOP needs to depend on POWER_SUPPLY. dell-laptop.c:(.text+0x1ef3c4): undefined reference to `power_supply_is_system_supplied' dell-laptop.c:(.text+0x1ef45e): undefined reference to `power_supply_is_system_supplied' Signed-off-by: Randy Dunlap Cc: Matthew Garrett Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Len Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/platform/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 6bce56c22623..b3866ad50227 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -62,6 +62,7 @@ config DELL_LAPTOP depends on EXPERIMENTAL depends on BACKLIGHT_CLASS_DEVICE depends on RFKILL + depends on POWER_SUPPLY default n ---help--- This driver adds support for rfkill and backlight control to Dell From 97bef7dd05563807539122c488a5dd93ed327722 Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Wed, 18 Feb 2009 14:48:40 -0800 Subject: [PATCH 141/143] Bernhard has moved Since I don't work for SUSE any more and the bwalle@suse.de address is invalid, correct it in the copyright headers and documentation. Signed-off-by: Bernhard Walle Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-firmware-memmap | 2 +- drivers/firmware/memmap.c | 2 +- include/linux/firmware-map.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-firmware-memmap b/Documentation/ABI/testing/sysfs-firmware-memmap index 0d99ee6ae02e..eca0d65087dc 100644 --- a/Documentation/ABI/testing/sysfs-firmware-memmap +++ b/Documentation/ABI/testing/sysfs-firmware-memmap @@ -1,6 +1,6 @@ What: /sys/firmware/memmap/ Date: June 2008 -Contact: Bernhard Walle +Contact: Bernhard Walle Description: On all platforms, the firmware provides a memory map which the kernel reads. The resources from that memory map are registered diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 261b9aa3f248..05aa2d406ac6 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -1,7 +1,7 @@ /* * linux/drivers/firmware/memmap.c * Copyright (C) 2008 SUSE LINUX Products GmbH - * by Bernhard Walle + * by Bernhard Walle * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License v2.0 as published by diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h index 6e199c8dfacc..cca686b39123 100644 --- a/include/linux/firmware-map.h +++ b/include/linux/firmware-map.h @@ -1,7 +1,7 @@ /* * include/linux/firmware-map.h: * Copyright (C) 2008 SUSE LINUX Products GmbH - * by Bernhard Walle + * by Bernhard Walle * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License v2.0 as published by From be50344e604f956891fc0013f1ba78823a758124 Mon Sep 17 00:00:00 2001 From: Michael Buesch Date: Wed, 18 Feb 2009 14:48:41 -0800 Subject: [PATCH 142/143] spi-gpio: sanitize MISO bitvalue gpio_get_value() returns 0 or nonzero, but getmiso() expects 0 or 1. Sanitize the value to a 0/1 boolean. Signed-off-by: Michael Buesch Acked-by: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/spi/spi_gpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi_gpio.c b/drivers/spi/spi_gpio.c index 49698cabc30d..f5ed9721aabb 100644 --- a/drivers/spi/spi_gpio.c +++ b/drivers/spi/spi_gpio.c @@ -114,7 +114,7 @@ static inline void setmosi(const struct spi_device *spi, int is_on) static inline int getmiso(const struct spi_device *spi) { - return gpio_get_value(SPI_MISO_GPIO); + return !!gpio_get_value(SPI_MISO_GPIO); } #undef pdata From f04b30de3c82528f1ab4c58b3dd4c975f5341901 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Feb 2009 14:48:43 -0800 Subject: [PATCH 143/143] inotify: fix GFP_KERNEL related deadlock Enhanced lockdep coverage of __GFP_NOFS turned up this new lockdep assert: [ 1093.677775] [ 1093.677781] ================================= [ 1093.680031] [ INFO: inconsistent lock state ] [ 1093.680031] 2.6.29-rc5-tip-01504-gb49eca1-dirty #1 [ 1093.680031] --------------------------------- [ 1093.680031] inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage. [ 1093.680031] kswapd0/308 [HC0[0]:SC0[0]:HE1:SE1] takes: [ 1093.680031] (&inode->inotify_mutex){+.+.?.}, at: [] inotify_inode_is_dead+0x20/0x80 [ 1093.680031] {RECLAIM_FS-ON-W} state was registered at: [ 1093.680031] [] mark_held_locks+0x43/0x5b [ 1093.680031] [] lockdep_trace_alloc+0x6c/0x6e [ 1093.680031] [] kmem_cache_alloc+0x20/0x150 [ 1093.680031] [] idr_pre_get+0x27/0x6c [ 1093.680031] [] inotify_handle_get_wd+0x25/0xad [ 1093.680031] [] inotify_add_watch+0x7a/0x129 [ 1093.680031] [] sys_inotify_add_watch+0x20f/0x250 [ 1093.680031] [] sysenter_do_call+0x12/0x35 [ 1093.680031] [] 0xffffffff [ 1093.680031] irq event stamp: 60417 [ 1093.680031] hardirqs last enabled at (60417): [] call_rcu+0x53/0x59 [ 1093.680031] hardirqs last disabled at (60416): [] call_rcu+0x17/0x59 [ 1093.680031] softirqs last enabled at (59656): [] __do_softirq+0x157/0x16b [ 1093.680031] softirqs last disabled at (59651): [] do_softirq+0x74/0x15d [ 1093.680031] [ 1093.680031] other info that might help us debug this: [ 1093.680031] 2 locks held by kswapd0/308: [ 1093.680031] #0: (shrinker_rwsem){++++..}, at: [] shrink_slab+0x36/0x189 [ 1093.680031] #1: (&type->s_umount_key#4){+++++.}, at: [] shrink_dcache_memory+0x110/0x1fb [ 1093.680031] [ 1093.680031] stack backtrace: [ 1093.680031] Pid: 308, comm: kswapd0 Not tainted 2.6.29-rc5-tip-01504-gb49eca1-dirty #1 [ 1093.680031] Call Trace: [ 1093.680031] [] valid_state+0x12a/0x13d [ 1093.680031] [] mark_lock+0xc1/0x1e9 [ 1093.680031] [] ? check_usage_forwards+0x0/0x3f [ 1093.680031] [] __lock_acquire+0x2c6/0xac8 [ 1093.680031] [] ? register_lock_class+0x17/0x228 [ 1093.680031] [] lock_acquire+0x5d/0x7a [ 1093.680031] [] ? inotify_inode_is_dead+0x20/0x80 [ 1093.680031] [] __mutex_lock_common+0x3a/0x4cb [ 1093.680031] [] ? inotify_inode_is_dead+0x20/0x80 [ 1093.680031] [] mutex_lock_nested+0x2e/0x36 [ 1093.680031] [] ? inotify_inode_is_dead+0x20/0x80 [ 1093.680031] [] inotify_inode_is_dead+0x20/0x80 [ 1093.680031] [] dentry_iput+0x90/0xc2 [ 1093.680031] [] d_kill+0x21/0x45 [ 1093.680031] [] __shrink_dcache_sb+0x27f/0x355 [ 1093.680031] [] shrink_dcache_memory+0x15e/0x1fb [ 1093.680031] [] shrink_slab+0x121/0x189 [ 1093.680031] [] kswapd+0x39f/0x561 [ 1093.680031] [] ? isolate_pages_global+0x0/0x233 [ 1093.680031] [] ? autoremove_wake_function+0x0/0x43 [ 1093.680031] [] ? kswapd+0x0/0x561 [ 1093.680031] [] kthread+0x41/0x82 [ 1093.680031] [] ? kthread+0x0/0x82 [ 1093.680031] [] kernel_thread_helper+0x7/0x10 inotify_handle_get_wd() does idr_pre_get() which does a kmem_cache_alloc() without __GFP_FS - and is hence deadlockable under extreme MM pressure. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: MinChan Kim Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/inotify/inotify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c index dae3f28f30d4..331f2e88e284 100644 --- a/fs/notify/inotify/inotify.c +++ b/fs/notify/inotify/inotify.c @@ -156,7 +156,7 @@ static int inotify_handle_get_wd(struct inotify_handle *ih, int ret; do { - if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL))) + if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS))) return -ENOSPC; ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd); } while (ret == -EAGAIN);