From 2abbbb63c90ab55ca3f054772c2e5ba7df810c48 Mon Sep 17 00:00:00 2001 From: Gerhard Sittig Date: Tue, 14 May 2013 04:40:53 +0000 Subject: [PATCH 001/156] powerpc/mpc512x: move common code to shared.c file - implement all of the init, init early, and setup arch routines in the shared source file for the MPC512x PowerPC platform, and make all MPC512x based boards (ADS, PDM, generic) use those common routines - remove declarations from header files for routines which aren't referenced from external callers any longer this modification concentrates knowledge about the optional FSL DIU support in one spot within the shared code, and makes all boards benefit transparently from future improvements in the shared platform code the change does not modify any behaviour but preserves all code paths Signed-off-by: Gerhard Sittig Signed-off-by: Anatolij Gustschin --- arch/powerpc/include/asm/mpc5121.h | 1 - arch/powerpc/platforms/512x/mpc5121_ads.c | 6 ++---- arch/powerpc/platforms/512x/mpc512x.h | 11 ++--------- arch/powerpc/platforms/512x/mpc512x_generic.c | 4 ++-- arch/powerpc/platforms/512x/mpc512x_shared.c | 14 +++++++++++++- arch/powerpc/platforms/512x/pdm360ng.c | 4 ++-- 6 files changed, 21 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/mpc5121.h b/arch/powerpc/include/asm/mpc5121.h index 885c040d6194..8ae133eaf9fa 100644 --- a/arch/powerpc/include/asm/mpc5121.h +++ b/arch/powerpc/include/asm/mpc5121.h @@ -68,6 +68,5 @@ struct mpc512x_lpc { }; int mpc512x_cs_config(unsigned int cs, u32 val); -int __init mpc5121_clk_init(void); #endif /* __ASM_POWERPC_MPC5121_H__ */ diff --git a/arch/powerpc/platforms/512x/mpc5121_ads.c b/arch/powerpc/platforms/512x/mpc5121_ads.c index 0a134e0469ef..3e90ece10ae9 100644 --- a/arch/powerpc/platforms/512x/mpc5121_ads.c +++ b/arch/powerpc/platforms/512x/mpc5121_ads.c @@ -43,9 +43,7 @@ static void __init mpc5121_ads_setup_arch(void) mpc83xx_add_bridge(np); #endif -#if defined(CONFIG_FB_FSL_DIU) || defined(CONFIG_FB_FSL_DIU_MODULE) - mpc512x_setup_diu(); -#endif + mpc512x_setup_arch(); } static void __init mpc5121_ads_init_IRQ(void) @@ -69,7 +67,7 @@ define_machine(mpc5121_ads) { .probe = mpc5121_ads_probe, .setup_arch = mpc5121_ads_setup_arch, .init = mpc512x_init, - .init_early = mpc512x_init_diu, + .init_early = mpc512x_init_early, .init_IRQ = mpc5121_ads_init_IRQ, .get_irq = ipic_get_irq, .calibrate_decr = generic_calibrate_decr, diff --git a/arch/powerpc/platforms/512x/mpc512x.h b/arch/powerpc/platforms/512x/mpc512x.h index 0a8e60023944..fdb4303246a0 100644 --- a/arch/powerpc/platforms/512x/mpc512x.h +++ b/arch/powerpc/platforms/512x/mpc512x.h @@ -12,18 +12,11 @@ #ifndef __MPC512X_H__ #define __MPC512X_H__ extern void __init mpc512x_init_IRQ(void); +extern void __init mpc512x_init_early(void); extern void __init mpc512x_init(void); +extern void __init mpc512x_setup_arch(void); extern int __init mpc5121_clk_init(void); -void __init mpc512x_declare_of_platform_devices(void); extern const char *mpc512x_select_psc_compat(void); extern void mpc512x_restart(char *cmd); -#if defined(CONFIG_FB_FSL_DIU) || defined(CONFIG_FB_FSL_DIU_MODULE) -void mpc512x_init_diu(void); -void mpc512x_setup_diu(void); -#else -#define mpc512x_init_diu NULL -#define mpc512x_setup_diu NULL -#endif - #endif /* __MPC512X_H__ */ diff --git a/arch/powerpc/platforms/512x/mpc512x_generic.c b/arch/powerpc/platforms/512x/mpc512x_generic.c index 5fb919b30924..ce71408781a0 100644 --- a/arch/powerpc/platforms/512x/mpc512x_generic.c +++ b/arch/powerpc/platforms/512x/mpc512x_generic.c @@ -45,8 +45,8 @@ define_machine(mpc512x_generic) { .name = "MPC512x generic", .probe = mpc512x_generic_probe, .init = mpc512x_init, - .init_early = mpc512x_init_diu, - .setup_arch = mpc512x_setup_diu, + .init_early = mpc512x_init_early, + .setup_arch = mpc512x_setup_arch, .init_IRQ = mpc512x_init_IRQ, .get_irq = ipic_get_irq, .calibrate_decr = generic_calibrate_decr, diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c index 6eb94ab99d39..09622d3323d7 100644 --- a/arch/powerpc/platforms/512x/mpc512x_shared.c +++ b/arch/powerpc/platforms/512x/mpc512x_shared.c @@ -58,7 +58,7 @@ void mpc512x_restart(char *cmd) ; } -#if defined(CONFIG_FB_FSL_DIU) || defined(CONFIG_FB_FSL_DIU_MODULE) +#if IS_ENABLED(CONFIG_FB_FSL_DIU) struct fsl_diu_shared_fb { u8 gamma[0x300]; /* 32-bit aligned! */ @@ -436,6 +436,12 @@ void __init mpc512x_psc_fifo_init(void) } } +void __init mpc512x_init_early(void) +{ + if (IS_ENABLED(CONFIG_FB_FSL_DIU)) + mpc512x_init_diu(); +} + void __init mpc512x_init(void) { mpc5121_clk_init(); @@ -444,6 +450,12 @@ void __init mpc512x_init(void) mpc512x_psc_fifo_init(); } +void __init mpc512x_setup_arch(void) +{ + if (IS_ENABLED(CONFIG_FB_FSL_DIU)) + mpc512x_setup_diu(); +} + /** * mpc512x_cs_config - Setup chip select configuration * @cs: chip select number diff --git a/arch/powerpc/platforms/512x/pdm360ng.c b/arch/powerpc/platforms/512x/pdm360ng.c index 0575e858291c..24b314d7bd5f 100644 --- a/arch/powerpc/platforms/512x/pdm360ng.c +++ b/arch/powerpc/platforms/512x/pdm360ng.c @@ -119,9 +119,9 @@ static int __init pdm360ng_probe(void) define_machine(pdm360ng) { .name = "PDM360NG", .probe = pdm360ng_probe, - .setup_arch = mpc512x_setup_diu, + .setup_arch = mpc512x_setup_arch, .init = pdm360ng_init, - .init_early = mpc512x_init_diu, + .init_early = mpc512x_init_early, .init_IRQ = mpc512x_init_IRQ, .get_irq = ipic_get_irq, .calibrate_decr = generic_calibrate_decr, From a4f4124cf308275b4a2219d1e332dfc01d8bd6b7 Mon Sep 17 00:00:00 2001 From: Gerhard Sittig Date: Tue, 14 May 2013 04:40:54 +0000 Subject: [PATCH 002/156] powerpc/mpc512x: initialize board restart earlier move the MPC512x restart initialization from the shared init routine to the shared init_early routine recent problems in the proc(5) filesystem initialization led to the situation where the platform's restart routine was invoked yet the registers required for software reset were not yet available, which made the board hang instead of reboot Signed-off-by: Gerhard Sittig Signed-off-by: Anatolij Gustschin --- arch/powerpc/platforms/512x/mpc512x_shared.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c index 09622d3323d7..a8b5110eb298 100644 --- a/arch/powerpc/platforms/512x/mpc512x_shared.c +++ b/arch/powerpc/platforms/512x/mpc512x_shared.c @@ -438,6 +438,7 @@ void __init mpc512x_psc_fifo_init(void) void __init mpc512x_init_early(void) { + mpc512x_restart_init(); if (IS_ENABLED(CONFIG_FB_FSL_DIU)) mpc512x_init_diu(); } @@ -446,7 +447,6 @@ void __init mpc512x_init(void) { mpc5121_clk_init(); mpc512x_declare_of_platform_devices(); - mpc512x_restart_init(); mpc512x_psc_fifo_init(); } From 8663890a9e9278623d20c67aa9fbeeb31ff3be97 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jun 2013 00:20:34 -0700 Subject: [PATCH 003/156] mm/thp: use the correct function when updating access flags We should use pmdp_set_access_flags to update access flags. Archs like powerpc use extra checks(_PAGE_BUSY) when updating a hugepage PTE. A set_pmd_at doesn't do those checks. We should use set_pmd_at only when updating a none hugepage PTE. Signed-off-by: Aneesh Kumar K.V Cc: Andrea Arcangeli a Signed-off-by: Andrew Morton Signed-off-by: Benjamin Herrenschmidt --- mm/huge_memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 362c329b83fe..dab90fd67298 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1265,7 +1265,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, * young bit, instead of the current set_pmd_at. */ _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); - set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, + pmd, _pmd, 1)) + update_mmu_cache_pmd(vma, addr, pmd); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { From 6b0b50b0617fad5f2af3b928596a25f7de8dbf50 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 5 Jun 2013 17:14:02 -0700 Subject: [PATCH 004/156] mm/THP: add pmd args to pgtable deposit and withdraw APIs This will be later used by powerpc THP support. In powerpc we want to use pgtable for storing the hash index values. So instead of adding them to mm_context list, we would like to store them in the second half of pmd Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrea Arcangeli Reviewed-by: David Gibson Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Benjamin Herrenschmidt --- arch/s390/include/asm/pgtable.h | 5 +++-- arch/s390/mm/pgtable.c | 5 +++-- arch/sparc/include/asm/pgtable_64.h | 5 +++-- arch/sparc/mm/tlb.c | 5 +++-- include/asm-generic/pgtable.h | 5 +++-- mm/huge_memory.c | 18 +++++++++--------- mm/pgtable-generic.c | 5 +++-- 7 files changed, 27 insertions(+), 21 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index e8b6e5b8932c..2080dfeba64b 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1370,10 +1370,11 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PGTABLE_DEPOSIT -extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); +extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); #define __HAVE_ARCH_PGTABLE_WITHDRAW -extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); +extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); static inline int pmd_trans_splitting(pmd_t pmd) { diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index a938b548f07e..1ccbffecc4d5 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -1117,7 +1117,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, } } -void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) { struct list_head *lh = (struct list_head *) pgtable; @@ -1131,7 +1132,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) mm->pmd_huge_pte = pgtable; } -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { struct list_head *lh; pgtable_t pgtable; diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 7619f2f792af..d22b92d67844 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -853,10 +853,11 @@ extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); #define __HAVE_ARCH_PGTABLE_DEPOSIT -extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); +extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); #define __HAVE_ARCH_PGTABLE_WITHDRAW -extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); +extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif /* Encode and de-code a swap entry */ diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 83d89bcb44af..f828dd33551c 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -188,7 +188,8 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, } } -void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) { struct list_head *lh = (struct list_head *) pgtable; @@ -202,7 +203,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) mm->pmd_huge_pte = pgtable; } -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { struct list_head *lh; pgtable_t pgtable; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index a59ff51b0166..18e27c210716 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -173,11 +173,12 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT -extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); +extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW -extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); +extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE diff --git a/mm/huge_memory.c b/mm/huge_memory.c index dab90fd67298..6b785e17b679 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -730,7 +730,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, entry = mk_huge_pmd(page, vma); page_add_new_anon_rmap(page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - pgtable_trans_huge_deposit(mm, pgtable); + pgtable_trans_huge_deposit(mm, pmd, pgtable); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); mm->nr_ptes++; spin_unlock(&mm->page_table_lock); @@ -772,7 +772,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, entry = pmd_wrprotect(entry); entry = pmd_mkhuge(entry); set_pmd_at(mm, haddr, pmd, entry); - pgtable_trans_huge_deposit(mm, pgtable); + pgtable_trans_huge_deposit(mm, pmd, pgtable); mm->nr_ptes++; return true; } @@ -917,7 +917,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - pgtable_trans_huge_deposit(dst_mm, pgtable); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); dst_mm->nr_ptes++; ret = 0; @@ -987,7 +987,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1085,7 +1085,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1360,7 +1360,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; pgtable_t pgtable; pmd_t orig_pmd; - pgtable = pgtable_trans_huge_withdraw(tlb->mm); + pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); if (is_huge_zero_pmd(orig_pmd)) { @@ -1693,7 +1693,7 @@ static int __split_huge_page_map(struct page *page, pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); if (pmd) { - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); haddr = address; @@ -2363,7 +2363,7 @@ static void collapse_huge_page(struct mm_struct *mm, page_add_new_anon_rmap(new_page, vma, address); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); - pgtable_trans_huge_deposit(mm, pgtable); + pgtable_trans_huge_deposit(mm, pmd, pgtable); spin_unlock(&mm->page_table_lock); *hpage = NULL; @@ -2669,7 +2669,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 0c8323fe6c8f..e1a6e4fab016 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -124,7 +124,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT #ifdef CONFIG_TRANSPARENT_HUGEPAGE -void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) { assert_spin_locked(&mm->page_table_lock); @@ -141,7 +142,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* no "address" argument so destroys page coloring of some arch */ -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; From a6bf2bb03e5bad7e9289d80ecb5faac11630c7ab Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 5 Jun 2013 17:14:04 -0700 Subject: [PATCH 005/156] mm/THP: withdraw the pgtable after pmdp related operations For architectures like ppc64 we look at deposited pgtable when calling pmdp_get_and_clear. So do the pgtable_trans_huge_withdraw after finishing pmdp related operations. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrea Arcangeli Cc: Andrea Arcangeli Cc: David Gibson Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Benjamin Herrenschmidt --- mm/huge_memory.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6b785e17b679..5c4fac2d239e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1360,9 +1360,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; pgtable_t pgtable; pmd_t orig_pmd; - pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); + /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pmdp_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pmdp related + * operations. + */ orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { tlb->mm->nr_ptes--; spin_unlock(&tlb->mm->page_table_lock); From fde52796d487b675cde55427e3347ff3e59f9a7f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 5 Jun 2013 17:14:05 -0700 Subject: [PATCH 006/156] mm/THP: don't use HPAGE_SHIFT in transparent hugepage code For architectures like powerpc that support multiple explicit hugepage sizes, HPAGE_SHIFT indicate the default explicit hugepage shift. For THP to work the hugepage size should be same as PMD_SIZE. So use PMD_SHIFT directly. So move the define outside CONFIG_TRANSPARENT_HUGEPAGE #ifdef because we want to use these defines in generic code with if (pmd_trans_huge()) conditional. Signed-off-by: Aneesh Kumar K.V Cc: Andrea Arcangeli Cc: David Gibson Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Benjamin Herrenschmidt --- include/linux/huge_mm.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 528454c2caa9..cc276d2c3a40 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -58,12 +58,11 @@ extern pmd_t *page_check_address_pmd(struct page *page, #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1< Date: Wed, 5 Jun 2013 17:14:06 -0700 Subject: [PATCH 007/156] mm/THP: deposit the transpare huge pgtable before set_pmd Architectures like powerpc use the deposited pgtable to store hash index values. We need to make the deposted pgtable is visible to other cpus before we are ready to take a hash fault. Signed-off-by: Aneesh Kumar K.V Cc: Andrea Arcangeli Cc: David Gibson Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Benjamin Herrenschmidt --- mm/huge_memory.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5c4fac2d239e..59d9384b6bbf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -729,8 +729,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pmd_t entry; entry = mk_huge_pmd(page, vma); page_add_new_anon_rmap(page, vma, haddr); - set_pmd_at(mm, haddr, pmd, entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); mm->nr_ptes++; spin_unlock(&mm->page_table_lock); @@ -771,8 +771,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_wrprotect(entry); entry = pmd_mkhuge(entry); - set_pmd_at(mm, haddr, pmd, entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, haddr, pmd, entry); mm->nr_ptes++; return true; } @@ -916,8 +916,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); - set_pmd_at(dst_mm, addr, dst_pmd, pmd); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); + set_pmd_at(dst_mm, addr, dst_pmd, pmd); dst_mm->nr_ptes++; ret = 0; @@ -2367,9 +2367,9 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(&mm->page_table_lock); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); - pgtable_trans_huge_deposit(mm, pmd, pgtable); spin_unlock(&mm->page_table_lock); *hpage = NULL; From e80034047bee9ceacfc1bfff873ebfdd049817ca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Feb 2013 23:38:51 +0100 Subject: [PATCH 008/156] powerpc: Mark low level irq handlers NO_THREAD These low level handlers cannot be threaded. Mark them NO_THREAD Reported-by: leroy christophe Tested-by: leroy christophe Signed-off-by: Thomas Gleixner Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/8xx/m8xx_setup.c | 1 + arch/powerpc/sysdev/cpm1.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c b/arch/powerpc/platforms/8xx/m8xx_setup.c index 1e121088826f..806cbbd86ec6 100644 --- a/arch/powerpc/platforms/8xx/m8xx_setup.c +++ b/arch/powerpc/platforms/8xx/m8xx_setup.c @@ -43,6 +43,7 @@ static irqreturn_t timebase_interrupt(int irq, void *dev) static struct irqaction tbint_irqaction = { .handler = timebase_interrupt, + .flags = IRQF_NO_THREAD, .name = "tbint", }; diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c index d4fa03f2b6ac..5e6ff38ea69f 100644 --- a/arch/powerpc/sysdev/cpm1.c +++ b/arch/powerpc/sysdev/cpm1.c @@ -120,6 +120,7 @@ static irqreturn_t cpm_error_interrupt(int irq, void *dev) static struct irqaction cpm_error_irqaction = { .handler = cpm_error_interrupt, + .flags = IRQF_NO_THREAD, .name = "error", }; From 85f395c5b0a26b3a80f9e2d35333981a2a75c0ae Mon Sep 17 00:00:00 2001 From: "Suzuki K. Poulose" Date: Mon, 3 Dec 2012 20:37:42 +0530 Subject: [PATCH 009/156] powerpc/kprobes: Do not disable External interrupts during single step External/Decrement exceptions have lower priority than the Debug Exception. So, we don't have to disable the External interrupts before a single step. However, on BookE, Critical Input Exception(CE) has higher priority than a Debug Exception. Hence we mask them. Signed-off-by: Suzuki K. Poulose Cc: Sebastian Andrzej Siewior Cc: Ananth N Mavinakaynahalli Cc: Kumar Gala Cc: linuxppc-dev@ozlabs.org Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/kprobes.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 11f5b03a0b06..560f430da478 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -104,13 +104,13 @@ void __kprobes arch_remove_kprobe(struct kprobe *p) static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) { - /* We turn off async exceptions to ensure that the single step will - * be for the instruction we have the kprobe on, if we dont its - * possible we'd get the single step reported for an exception handler - * like Decrementer or External Interrupt */ - regs->msr &= ~MSR_EE; regs->msr |= MSR_SINGLESTEP; #ifdef CONFIG_PPC_ADV_DEBUG_REGS + /* + * We turn off Critical Input Exception(CE) to ensure that the single + * step will be for the instruction we have the probe on; if we don't, + * it is possible we'd get the single step reported for CE. + */ regs->msr &= ~MSR_CE; mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM); #ifdef CONFIG_PPC_47x From 35fd219a268cc82cef842518cd64ea6949629ba2 Mon Sep 17 00:00:00 2001 From: "Suzuki K. Poulose" Date: Mon, 3 Dec 2012 20:38:37 +0530 Subject: [PATCH 010/156] powerpc: Move the single step enable code to a generic path This patch moves the single step enable code used by kprobe to a generic routine header so that, it can be re-used by other code, in this case, uprobes. No functional changes. Signed-off-by: Suzuki K. Poulose Cc: Ananth N Mavinakaynahalli Cc: Kumar Gala Cc: linuxppc-dev@ozlabs.org Acked-by: Ananth N Mavinakayanahalli Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/probes.h | 25 +++++++++++++++++++++++++ arch/powerpc/kernel/kprobes.c | 20 +------------------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/probes.h b/arch/powerpc/include/asm/probes.h index 5f1e15b68704..3421637cfd7b 100644 --- a/arch/powerpc/include/asm/probes.h +++ b/arch/powerpc/include/asm/probes.h @@ -38,5 +38,30 @@ typedef u32 ppc_opcode_t; #define is_trap(instr) (IS_TW(instr) || IS_TWI(instr)) #endif /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC_ADV_DEBUG_REGS +#define MSR_SINGLESTEP (MSR_DE) +#else +#define MSR_SINGLESTEP (MSR_SE) +#endif + +/* Enable single stepping for the current task */ +static inline void enable_single_step(struct pt_regs *regs) +{ + regs->msr |= MSR_SINGLESTEP; +#ifdef CONFIG_PPC_ADV_DEBUG_REGS + /* + * We turn off Critical Input Exception(CE) to ensure that the single + * step will be for the instruction we have the probe on; if we don't, + * it is possible we'd get the single step reported for CE. + */ + regs->msr &= ~MSR_CE; + mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM); +#ifdef CONFIG_PPC_47x + isync(); +#endif +#endif +} + + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_PROBES_H */ diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 560f430da478..2156ea90eb54 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -36,12 +36,6 @@ #include #include -#ifdef CONFIG_PPC_ADV_DEBUG_REGS -#define MSR_SINGLESTEP (MSR_DE) -#else -#define MSR_SINGLESTEP (MSR_SE) -#endif - DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); @@ -104,19 +98,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p) static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) { - regs->msr |= MSR_SINGLESTEP; -#ifdef CONFIG_PPC_ADV_DEBUG_REGS - /* - * We turn off Critical Input Exception(CE) to ensure that the single - * step will be for the instruction we have the probe on; if we don't, - * it is possible we'd get the single step reported for CE. - */ - regs->msr &= ~MSR_CE; - mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM); -#ifdef CONFIG_PPC_47x - isync(); -#endif -#endif + enable_single_step(regs); /* * On powerpc we should single step on the original From 39a421ff0b7cb056e687894a9d5f57aa1303e1c8 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 20 Mar 2013 19:06:12 -0500 Subject: [PATCH 011/156] powerpc/mm/nohash: Ignore NULL stale_map entries This happens with threads that are offline due to CPU hotplug (including threads that were never "plugged in" to begin with because SMT is disabled). Signed-off-by: Scott Wood Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/mmu_context_nohash.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index e779642c25e5..810f8e4d74d4 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -112,8 +112,10 @@ static unsigned int steal_context_smp(unsigned int id) */ for_each_cpu(cpu, mm_cpumask(mm)) { for (i = cpu_first_thread_sibling(cpu); - i <= cpu_last_thread_sibling(cpu); i++) - __set_bit(id, stale_map[i]); + i <= cpu_last_thread_sibling(cpu); i++) { + if (stale_map[i]) + __set_bit(id, stale_map[i]); + } cpu = i - 1; } return id; @@ -272,7 +274,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) /* XXX This clear should ultimately be part of local_flush_tlb_mm */ for (i = cpu_first_thread_sibling(cpu); i <= cpu_last_thread_sibling(cpu); i++) { - __clear_bit(id, stale_map[i]); + if (stale_map[i]) + __clear_bit(id, stale_map[i]); } } From 3139b0a797d6826519ed98a13623a92f12269613 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Wed, 17 Apr 2013 17:50:35 +0800 Subject: [PATCH 012/156] powerpc: Remove the unneeded trigger of decrementer interrupt in decrementer_check_overflow Previously in order to handle the edge sensitive decrementers, we choose to set the decrementer to 1 to trigger a decrementer interrupt when re-enabling interrupts. But with the rework of the lazy EE, we would replay the decrementer interrupt when re-enabling interrupts if a decrementer interrupt occurs with irq soft-disabled. So there is no need to trigger a decrementer interrupt in this case any more. Signed-off-by: Kevin Hao Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/irq.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 5cbcf4d5a808..32fa52e8163f 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -116,8 +116,6 @@ static inline notrace int decrementer_check_overflow(void) u64 now = get_tb_or_rtc(); u64 *next_tb = &__get_cpu_var(decrementers_next_tb); - if (now >= *next_tb) - set_dec(1); return now >= *next_tb; } From d5d8ec895ca599fbde43efe3a2f9714315e3d298 Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Tue, 23 Apr 2013 17:50:33 -0700 Subject: [PATCH 013/156] powerpc/mm: Make mmap_64.c compile on 32bit powerpc There appears to be no good reason to keep this as 64bit only. It works on 32bit also, and has checks so that it can work correctly with 32bit binaries on 64bit hardware which is why I think this works. I tested this on qemu using the virtex-ml507 machine type. Before, /bin2 # ./test & cat /proc/${!}/maps 00100000-00103000 r-xp 00000000 00:00 0 [vdso] 10000000-10007000 r-xp 00000000 00:01 454 /bin2/test 10017000-10018000 rw-p 00007000 00:01 454 /bin2/test 48000000-48020000 r-xp 00000000 00:01 224 /lib/ld-2.11.3.so 48021000-48023000 rw-p 00021000 00:01 224 /lib/ld-2.11.3.so bfd03000-bfd24000 rw-p 00000000 00:00 0 [stack] /bin2 # ./test & cat /proc/${!}/maps 00100000-00103000 r-xp 00000000 00:00 0 [vdso] 0fe6e000-0ffd8000 r-xp 00000000 00:01 214 /lib/libc-2.11.3.so 0ffd8000-0ffe8000 ---p 0016a000 00:01 214 /lib/libc-2.11.3.so 0ffe8000-0ffed000 rw-p 0016a000 00:01 214 /lib/libc-2.11.3.so 0ffed000-0fff0000 rw-p 00000000 00:00 0 10000000-10007000 r-xp 00000000 00:01 454 /bin2/test 10017000-10018000 rw-p 00007000 00:01 454 /bin2/test 48000000-48020000 r-xp 00000000 00:01 224 /lib/ld-2.11.3.so 48020000-48021000 rw-p 00000000 00:00 0 48021000-48023000 rw-p 00021000 00:01 224 /lib/ld-2.11.3.so bf98a000-bf9ab000 rw-p 00000000 00:00 0 [stack] /bin2 # ./test & cat /proc/${!}/maps 00100000-00103000 r-xp 00000000 00:00 0 [vdso] 0fe6e000-0ffd8000 r-xp 00000000 00:01 214 /lib/libc-2.11.3.so 0ffd8000-0ffe8000 ---p 0016a000 00:01 214 /lib/libc-2.11.3.so 0ffe8000-0ffed000 rw-p 0016a000 00:01 214 /lib/libc-2.11.3.so 0ffed000-0fff0000 rw-p 00000000 00:00 0 10000000-10007000 r-xp 00000000 00:01 454 /bin2/test 10017000-10018000 rw-p 00007000 00:01 454 /bin2/test 48000000-48020000 r-xp 00000000 00:01 224 /lib/ld-2.11.3.so 48020000-48021000 rw-p 00000000 00:00 0 48021000-48023000 rw-p 00021000 00:01 224 /lib/ld-2.11.3.so bfa54000-bfa75000 rw-p 00000000 00:00 0 [stack] After, bash-4.1# ./test & cat /proc/${!}/maps [7] 803 00100000-00103000 r-xp 00000000 00:00 0 [vdso] 10000000-10007000 r-xp 00000000 00:01 454 /bin2/test 10017000-10018000 rw-p 00007000 00:01 454 /bin2/test b7eb0000-b7ed0000 r-xp 00000000 00:01 224 /lib/ld-2.11.3.so b7ed1000-b7ed3000 rw-p 00021000 00:01 224 /lib/ld-2.11.3.so bfbc0000-bfbe1000 rw-p 00000000 00:00 0 [stack] bash-4.1# ./test & cat /proc/${!}/maps [8] 805 00100000-00103000 r-xp 00000000 00:00 0 [vdso] 10000000-10007000 r-xp 00000000 00:01 454 /bin2/test 10017000-10018000 rw-p 00007000 00:01 454 /bin2/test b7b03000-b7b23000 r-xp 00000000 00:01 224 /lib/ld-2.11.3.so b7b24000-b7b26000 rw-p 00021000 00:01 224 /lib/ld-2.11.3.so bfc27000-bfc48000 rw-p 00000000 00:00 0 [stack] bash-4.1# ./test & cat /proc/${!}/maps [9] 807 00100000-00103000 r-xp 00000000 00:00 0 [vdso] 10000000-10007000 r-xp 00000000 00:01 454 /bin2/test 10017000-10018000 rw-p 00007000 00:01 454 /bin2/test b7f37000-b7f57000 r-xp 00000000 00:01 224 /lib/ld-2.11.3.so b7f58000-b7f5a000 rw-p 00021000 00:01 224 /lib/ld-2.11.3.so bff96000-bffb7000 rw-p 00000000 00:00 0 [stack] Signed-off-by: Daniel Walker Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/processor.h | 2 -- arch/powerpc/mm/Makefile | 5 ++--- arch/powerpc/mm/{mmap_64.c => mmap.c} | 0 3 files changed, 2 insertions(+), 5 deletions(-) rename arch/powerpc/mm/{mmap_64.c => mmap.c} (100%) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 14a658363698..7135a257f7cb 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -404,9 +404,7 @@ static inline void prefetchw(const void *x) #define spin_lock_prefetch(x) prefetchw(x) -#ifdef CONFIG_PPC64 #define HAVE_ARCH_PICK_MMAP_LAYOUT -#endif #ifdef CONFIG_PPC64 static inline unsigned long get_clean_sp(unsigned long sp, int is_32) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index cf16b5733eaa..26f29a772414 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -6,17 +6,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -obj-y := fault.o mem.o pgtable.o gup.o \ +obj-y := fault.o mem.o pgtable.o gup.o mmap.o \ init_$(CONFIG_WORD_SIZE).o \ pgtable_$(CONFIG_WORD_SIZE).o obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o -obj-$(CONFIG_PPC64) += mmap_64.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \ slb_low.o slb.o stab.o \ - mmap_64.o $(hash64-y) + $(hash64-y) obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \ tlb_hash$(CONFIG_WORD_SIZE).o \ diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap.c similarity index 100% rename from arch/powerpc/mm/mmap_64.c rename to arch/powerpc/mm/mmap.c From 0962e8004e97409072bb6caee7b3ba948a5fb93a Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 24 Apr 2013 14:26:30 +0800 Subject: [PATCH 014/156] powerpc/prom: Scan reserved-ranges node for memory reservations Based on benh's proposal at https://lists.ozlabs.org/pipermail/linuxppc-dev/2012-September/101237.html, this change provides support for reserving memory from the reserved-ranges node at the root of the device tree. We just call memblock_reserve on these ranges for now. Signed-off-by: Jeremy Kerr Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/prom.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 8b6f7a99cce2..9c753bc9885d 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -559,6 +559,33 @@ void __init early_init_dt_setup_initrd_arch(unsigned long start, } #endif +static bool __init early_reserve_mem_dt(void) +{ + unsigned long i, len, dt_root; + const __be32 *prop; + + dt_root = of_get_flat_dt_root(); + + prop = of_get_flat_dt_prop(dt_root, "reserved-ranges", &len); + + if (!prop) + return false; + + /* Each reserved range is an (address,size) pair, 2 cells each, + * totalling 4 cells per range. */ + for (i = 0; i < len / (sizeof(*prop) * 4); i++) { + u64 base, size; + + base = of_read_number(prop + (i * 4) + 0, 2); + size = of_read_number(prop + (i * 4) + 2, 2); + + if (size) + memblock_reserve(base, size); + } + + return true; +} + static void __init early_reserve_mem(void) { u64 base, size; @@ -574,6 +601,14 @@ static void __init early_reserve_mem(void) self_size = initial_boot_params->totalsize; memblock_reserve(self_base, self_size); + /* + * Try looking for reserved-regions property in the DT first; if + * it's present, it'll contain all of the necessary reservation + * info + */ + if (early_reserve_mem_dt()) + return; + #ifdef CONFIG_BLK_DEV_INITRD /* then reserve the initrd, if any */ if (initrd_start && (initrd_end > initrd_start)) From 071df9422ac91c0d290e81f5ae2635c74cda6d00 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Mon, 29 Apr 2013 13:42:43 +1000 Subject: [PATCH 015/156] powerpc: Add a configuration option for early BootX/OpenFirmware debug Signed-off-by: Alistair Popple Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig.debug | 7 +++++++ arch/powerpc/kernel/udbg.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 863d877e0b5f..d86875f3e17e 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -147,6 +147,13 @@ choice enable debugging for the wrong type of machine your kernel _will not boot_. +config PPC_EARLY_DEBUG_BOOTX + bool "BootX or OpenFirmware" + depends on BOOTX_TEXT + help + Select this to enable early debugging for a machine using BootX + or OpenFirmware. + config PPC_EARLY_DEBUG_LPAR bool "LPAR HV Console" depends on PPC_PSERIES diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index 9d3fdcd66290..a15837519dca 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -50,7 +50,7 @@ void __init udbg_early_init(void) udbg_init_debug_beat(); #elif defined(CONFIG_PPC_EARLY_DEBUG_PAS_REALMODE) udbg_init_pas_realmode(); -#elif defined(CONFIG_BOOTX_TEXT) +#elif defined(CONFIG_PPC_EARLY_DEBUG_BOOTX) udbg_init_btext(); #elif defined(CONFIG_PPC_EARLY_DEBUG_44x) /* PPC44x debug */ From b9ef7d6b11c120cc402a76013062061bbe0fbaad Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Mon, 29 Apr 2013 13:42:44 +1000 Subject: [PATCH 016/156] powerpc: Update default configurations Update default configurations for systems with CONFIG_BOOTX_TEXT selected so that they continue to print early debug messages as is currently the case. Signed-off-by: Alistair Popple Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/configs/c2k_defconfig | 2 ++ arch/powerpc/configs/g5_defconfig | 2 ++ arch/powerpc/configs/maple_defconfig | 2 ++ arch/powerpc/configs/pmac32_defconfig | 2 ++ arch/powerpc/configs/ppc64_defconfig | 2 ++ arch/powerpc/configs/ppc6xx_defconfig | 2 ++ 6 files changed, 12 insertions(+) diff --git a/arch/powerpc/configs/c2k_defconfig b/arch/powerpc/configs/c2k_defconfig index 2a84fd7f631c..671a8f960afa 100644 --- a/arch/powerpc/configs/c2k_defconfig +++ b/arch/powerpc/configs/c2k_defconfig @@ -423,6 +423,8 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_BOOTX_TEXT=y +CONFIG_PPC_EARLY_DEBUG=y +CONFIG_PPC_EARLY_DEBUG_BOOTX=y CONFIG_KEYS=y CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y diff --git a/arch/powerpc/configs/g5_defconfig b/arch/powerpc/configs/g5_defconfig index 07b7f2af2dca..1ea22fc24ea8 100644 --- a/arch/powerpc/configs/g5_defconfig +++ b/arch/powerpc/configs/g5_defconfig @@ -284,6 +284,8 @@ CONFIG_DEBUG_MUTEXES=y CONFIG_LATENCYTOP=y CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_BOOTX_TEXT=y +CONFIG_PPC_EARLY_DEBUG=y +CONFIG_PPC_EARLY_DEBUG_BOOTX=y CONFIG_CRYPTO_NULL=m CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_ECB=m diff --git a/arch/powerpc/configs/maple_defconfig b/arch/powerpc/configs/maple_defconfig index 02ac96b679b8..2a5afac29861 100644 --- a/arch/powerpc/configs/maple_defconfig +++ b/arch/powerpc/configs/maple_defconfig @@ -138,6 +138,8 @@ CONFIG_DEBUG_STACK_USAGE=y CONFIG_XMON=y CONFIG_XMON_DEFAULT=y CONFIG_BOOTX_TEXT=y +CONFIG_PPC_EARLY_DEBUG=y +CONFIG_PPC_EARLY_DEBUG_BOOTX=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m # CONFIG_CRYPTO_ANSI_CPRNG is not set diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig index 29767a8dfea5..a73626b09051 100644 --- a/arch/powerpc/configs/pmac32_defconfig +++ b/arch/powerpc/configs/pmac32_defconfig @@ -350,6 +350,8 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_XMON=y CONFIG_XMON_DEFAULT=y CONFIG_BOOTX_TEXT=y +CONFIG_PPC_EARLY_DEBUG=y +CONFIG_PPC_EARLY_DEBUG_BOOTX=y CONFIG_CRYPTO_NULL=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_MD4=m diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index aef3f71de5ad..c86fcb92358e 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -398,6 +398,8 @@ CONFIG_FTR_FIXUP_SELFTEST=y CONFIG_MSI_BITMAP_SELFTEST=y CONFIG_XMON=y CONFIG_BOOTX_TEXT=y +CONFIG_PPC_EARLY_DEBUG=y +CONFIG_PPC_EARLY_DEBUG_BOOTX=y CONFIG_CRYPTO_NULL=m CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_PCBC=m diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index be1cb6ea3a36..20ebfaf7234b 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -1264,6 +1264,8 @@ CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_XMON=y CONFIG_BOOTX_TEXT=y +CONFIG_PPC_EARLY_DEBUG=y +CONFIG_PPC_EARLY_DEBUG_BOOTX=y CONFIG_KEYS=y CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y From 70a54a4faec72ee9d12b9c4dfa27bc241deb79a6 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Mon, 6 May 2013 21:32:40 +1000 Subject: [PATCH 017/156] powerpc: Fix single step emulation of 32bit overflowed branches Check truncate_if_32bit() on final write to nip. Signed-off-by: Michael Neuling Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/lib/sstep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index e15c521846ca..99c7fc16dc0d 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -580,7 +580,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) if (instr & 1) regs->link = regs->nip; if (branch_taken(instr, regs)) - regs->nip = imm; + regs->nip = truncate_if_32bit(regs->msr, imm); return 1; #ifdef CONFIG_PPC64 case 17: /* sc */ From ab9a4183fddf232a46b6255e0d3da5a09f85ecbd Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Thu, 9 May 2013 10:42:13 +1000 Subject: [PATCH 018/156] powerpc: Update currituck pci/usb fixup for new board revision The currituck board uses a different IRQ for the pci usb host controller depending on the board revision. This patch adds support for newer board revisions by retrieving the board revision from the FPGA and mapping the appropriate IRQ. Signed-off-by: Alistair Popple Acked-by: Tony Breeds Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/boot/dts/currituck.dts | 5 ++++ arch/powerpc/platforms/44x/currituck.c | 39 ++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/boot/dts/currituck.dts b/arch/powerpc/boot/dts/currituck.dts index b801dd06e573..d2c8a872308e 100644 --- a/arch/powerpc/boot/dts/currituck.dts +++ b/arch/powerpc/boot/dts/currituck.dts @@ -103,6 +103,11 @@ interrupts = <34 2>; }; + FPGA0: fpga@50000000 { + compatible = "ibm,currituck-fpga"; + reg = <0x50000000 0x4>; + }; + IIC0: i2c@00000000 { compatible = "ibm,iic-currituck", "ibm,iic"; reg = <0x0 0x00000014>; diff --git a/arch/powerpc/platforms/44x/currituck.c b/arch/powerpc/platforms/44x/currituck.c index ecd3890c40d7..c52e1b3c9be5 100644 --- a/arch/powerpc/platforms/44x/currituck.c +++ b/arch/powerpc/platforms/44x/currituck.c @@ -176,13 +176,48 @@ static int __init ppc47x_probe(void) return 1; } +static int board_rev = -1; +static int __init ppc47x_get_board_rev(void) +{ + u8 fpga_reg0; + void *fpga; + struct device_node *np; + + np = of_find_compatible_node(NULL, NULL, "ibm,currituck-fpga"); + if (!np) + goto fail; + + fpga = of_iomap(np, 0); + of_node_put(np); + if (!fpga) + goto fail; + + fpga_reg0 = ioread8(fpga); + board_rev = fpga_reg0 & 0x03; + pr_info("%s: Found board revision %d\n", __func__, board_rev); + iounmap(fpga); + return 0; + +fail: + pr_info("%s: Unable to find board revision\n", __func__); + return 0; +} +machine_arch_initcall(ppc47x, ppc47x_get_board_rev); + /* Use USB controller should have been hardware swizzled but it wasn't :( */ static void ppc47x_pci_irq_fixup(struct pci_dev *dev) { if (dev->vendor == 0x1033 && (dev->device == 0x0035 || dev->device == 0x00e0)) { - dev->irq = irq_create_mapping(NULL, 47); - pr_info("%s: Mapping irq 47 %d\n", __func__, dev->irq); + if (board_rev == 0) { + dev->irq = irq_create_mapping(NULL, 47); + pr_info("%s: Mapping irq %d\n", __func__, dev->irq); + } else if (board_rev == 2) { + dev->irq = irq_create_mapping(NULL, 49); + pr_info("%s: Mapping irq %d\n", __func__, dev->irq); + } else { + pr_alert("%s: Unknown board revision\n", __func__); + } } } From 4e13c1ac6baa1d6c2b650d66ca89e1e12727ec19 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 21 May 2013 13:33:09 +1000 Subject: [PATCH 019/156] powerpc/vfio: Enable on PowerNV platform This initializes IOMMU groups based on the IOMMU configuration discovered during the PCI scan on POWERNV (POWER non virtualized) platform. The IOMMU groups are to be used later by the VFIO driver, which is used for PCI pass through. It also implements an API for mapping/unmapping pages for guest PCI drivers and providing DMA window properties. This API is going to be used later by QEMU-VFIO to handle h_put_tce hypercalls from the KVM guest. The iommu_put_tce_user_mode() does only a single page mapping as an API for adding many mappings at once is going to be added later. Although this driver has been tested only on the POWERNV platform, it should work on any platform which supports TCE tables. As h_put_tce hypercall is received by the host kernel and processed by the QEMU (what involves calling the host kernel again), performance is not the best - circa 220MB/s on 10Gb ethernet network. To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option and configure VFIO as required. Cc: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/iommu.h | 26 ++ arch/powerpc/kernel/iommu.c | 323 ++++++++++++++++++++ arch/powerpc/platforms/powernv/pci-ioda.c | 1 + arch/powerpc/platforms/powernv/pci-p5ioc2.c | 5 +- arch/powerpc/platforms/powernv/pci.c | 2 + drivers/iommu/Kconfig | 8 + 6 files changed, 364 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index cbfe678e3dbe..98d14229f893 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -76,6 +76,9 @@ struct iommu_table { struct iommu_pool large_pool; struct iommu_pool pools[IOMMU_NR_POOLS]; unsigned long *it_map; /* A simple allocation bitmap for now */ +#ifdef CONFIG_IOMMU_API + struct iommu_group *it_group; +#endif }; struct scatterlist; @@ -98,6 +101,8 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); +extern void iommu_register_group(struct iommu_table *tbl, + int pci_domain_number, unsigned long pe_num); extern int iommu_map_sg(struct device *dev, struct iommu_table *tbl, struct scatterlist *sglist, int nelems, @@ -147,5 +152,26 @@ static inline void iommu_restore(void) } #endif +/* The API to support IOMMU operations for VFIO */ +extern int iommu_tce_clear_param_check(struct iommu_table *tbl, + unsigned long ioba, unsigned long tce_value, + unsigned long npages); +extern int iommu_tce_put_param_check(struct iommu_table *tbl, + unsigned long ioba, unsigned long tce); +extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, + unsigned long hwaddr, enum dma_data_direction direction); +extern unsigned long iommu_clear_tce(struct iommu_table *tbl, + unsigned long entry); +extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, + unsigned long entry, unsigned long pages); +extern int iommu_put_tce_user_mode(struct iommu_table *tbl, + unsigned long entry, unsigned long tce); + +extern void iommu_flush_tce(struct iommu_table *tbl); +extern int iommu_take_ownership(struct iommu_table *tbl); +extern void iommu_release_ownership(struct iommu_table *tbl); + +extern enum dma_data_direction iommu_tce_direction(unsigned long tce); + #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index c0d0dbddfba1..b20ff173a671 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include #include #include @@ -44,6 +46,7 @@ #include #include #include +#include #define DBG(...) @@ -724,6 +727,13 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) if (tbl->it_offset == 0) clear_bit(0, tbl->it_map); +#ifdef CONFIG_IOMMU_API + if (tbl->it_group) { + iommu_group_put(tbl->it_group); + BUG_ON(tbl->it_group); + } +#endif + /* verify that table contains no entries */ if (!bitmap_empty(tbl->it_map, tbl->it_size)) pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name); @@ -860,3 +870,316 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, free_pages((unsigned long)vaddr, get_order(size)); } } + +#ifdef CONFIG_IOMMU_API +/* + * SPAPR TCE API + */ +static void group_release(void *iommu_data) +{ + struct iommu_table *tbl = iommu_data; + tbl->it_group = NULL; +} + +void iommu_register_group(struct iommu_table *tbl, + int pci_domain_number, unsigned long pe_num) +{ + struct iommu_group *grp; + char *name; + + grp = iommu_group_alloc(); + if (IS_ERR(grp)) { + pr_warn("powerpc iommu api: cannot create new group, err=%ld\n", + PTR_ERR(grp)); + return; + } + tbl->it_group = grp; + iommu_group_set_iommudata(grp, tbl, group_release); + name = kasprintf(GFP_KERNEL, "domain%d-pe%lx", + pci_domain_number, pe_num); + if (!name) + return; + iommu_group_set_name(grp, name); + kfree(name); +} + +enum dma_data_direction iommu_tce_direction(unsigned long tce) +{ + if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE)) + return DMA_BIDIRECTIONAL; + else if (tce & TCE_PCI_READ) + return DMA_TO_DEVICE; + else if (tce & TCE_PCI_WRITE) + return DMA_FROM_DEVICE; + else + return DMA_NONE; +} +EXPORT_SYMBOL_GPL(iommu_tce_direction); + +void iommu_flush_tce(struct iommu_table *tbl) +{ + /* Flush/invalidate TLB caches if necessary */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + /* Make sure updates are seen by hardware */ + mb(); +} +EXPORT_SYMBOL_GPL(iommu_flush_tce); + +int iommu_tce_clear_param_check(struct iommu_table *tbl, + unsigned long ioba, unsigned long tce_value, + unsigned long npages) +{ + /* ppc_md.tce_free() does not support any value but 0 */ + if (tce_value) + return -EINVAL; + + if (ioba & ~IOMMU_PAGE_MASK) + return -EINVAL; + + ioba >>= IOMMU_PAGE_SHIFT; + if (ioba < tbl->it_offset) + return -EINVAL; + + if ((ioba + npages) > (tbl->it_offset + tbl->it_size)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check); + +int iommu_tce_put_param_check(struct iommu_table *tbl, + unsigned long ioba, unsigned long tce) +{ + if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ))) + return -EINVAL; + + if (tce & ~(IOMMU_PAGE_MASK | TCE_PCI_WRITE | TCE_PCI_READ)) + return -EINVAL; + + if (ioba & ~IOMMU_PAGE_MASK) + return -EINVAL; + + ioba >>= IOMMU_PAGE_SHIFT; + if (ioba < tbl->it_offset) + return -EINVAL; + + if ((ioba + 1) > (tbl->it_offset + tbl->it_size)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); + +unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) +{ + unsigned long oldtce; + struct iommu_pool *pool = get_pool(tbl, entry); + + spin_lock(&(pool->lock)); + + oldtce = ppc_md.tce_get(tbl, entry); + if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) + ppc_md.tce_free(tbl, entry, 1); + else + oldtce = 0; + + spin_unlock(&(pool->lock)); + + return oldtce; +} +EXPORT_SYMBOL_GPL(iommu_clear_tce); + +int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, + unsigned long entry, unsigned long pages) +{ + unsigned long oldtce; + struct page *page; + + for ( ; pages; --pages, ++entry) { + oldtce = iommu_clear_tce(tbl, entry); + if (!oldtce) + continue; + + page = pfn_to_page(oldtce >> PAGE_SHIFT); + WARN_ON(!page); + if (page) { + if (oldtce & TCE_PCI_WRITE) + SetPageDirty(page); + put_page(page); + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages); + +/* + * hwaddr is a kernel virtual address here (0xc... bazillion), + * tce_build converts it to a physical address. + */ +int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, + unsigned long hwaddr, enum dma_data_direction direction) +{ + int ret = -EBUSY; + unsigned long oldtce; + struct iommu_pool *pool = get_pool(tbl, entry); + + spin_lock(&(pool->lock)); + + oldtce = ppc_md.tce_get(tbl, entry); + /* Add new entry if it is not busy */ + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))) + ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL); + + spin_unlock(&(pool->lock)); + + /* if (unlikely(ret)) + pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n", + __func__, hwaddr, entry << IOMMU_PAGE_SHIFT, + hwaddr, ret); */ + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_tce_build); + +int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, + unsigned long tce) +{ + int ret; + struct page *page = NULL; + unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK & ~PAGE_MASK; + enum dma_data_direction direction = iommu_tce_direction(tce); + + ret = get_user_pages_fast(tce & PAGE_MASK, 1, + direction != DMA_TO_DEVICE, &page); + if (unlikely(ret != 1)) { + /* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n", + tce, entry << IOMMU_PAGE_SHIFT, ret); */ + return -EFAULT; + } + hwaddr = (unsigned long) page_address(page) + offset; + + ret = iommu_tce_build(tbl, entry, hwaddr, direction); + if (ret) + put_page(page); + + if (ret < 0) + pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n", + __func__, entry << IOMMU_PAGE_SHIFT, tce, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); + +int iommu_take_ownership(struct iommu_table *tbl) +{ + unsigned long sz = (tbl->it_size + 7) >> 3; + + if (tbl->it_offset == 0) + clear_bit(0, tbl->it_map); + + if (!bitmap_empty(tbl->it_map, tbl->it_size)) { + pr_err("iommu_tce: it_map is not empty"); + return -EBUSY; + } + + memset(tbl->it_map, 0xff, sz); + iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_take_ownership); + +void iommu_release_ownership(struct iommu_table *tbl) +{ + unsigned long sz = (tbl->it_size + 7) >> 3; + + iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); + memset(tbl->it_map, 0, sz); + + /* Restore bit#0 set by iommu_init_table() */ + if (tbl->it_offset == 0) + set_bit(0, tbl->it_map); +} +EXPORT_SYMBOL_GPL(iommu_release_ownership); + +static int iommu_add_device(struct device *dev) +{ + struct iommu_table *tbl; + int ret = 0; + + if (WARN_ON(dev->iommu_group)) { + pr_warn("iommu_tce: device %s is already in iommu group %d, skipping\n", + dev_name(dev), + iommu_group_id(dev->iommu_group)); + return -EBUSY; + } + + tbl = get_iommu_table_base(dev); + if (!tbl || !tbl->it_group) { + pr_debug("iommu_tce: skipping device %s with no tbl\n", + dev_name(dev)); + return 0; + } + + pr_debug("iommu_tce: adding %s to iommu group %d\n", + dev_name(dev), iommu_group_id(tbl->it_group)); + + ret = iommu_group_add_device(tbl->it_group, dev); + if (ret < 0) + pr_err("iommu_tce: %s has not been added, ret=%d\n", + dev_name(dev), ret); + + return ret; +} + +static void iommu_del_device(struct device *dev) +{ + iommu_group_remove_device(dev); +} + +static int iommu_bus_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + return iommu_add_device(dev); + case BUS_NOTIFY_DEL_DEVICE: + iommu_del_device(dev); + return 0; + default: + return 0; + } +} + +static struct notifier_block tce_iommu_bus_nb = { + .notifier_call = iommu_bus_notifier, +}; + +static int __init tce_iommu_init(void) +{ + struct pci_dev *pdev = NULL; + + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE); + + for_each_pci_dev(pdev) + iommu_add_device(&pdev->dev); + + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb); + return 0; +} + +subsys_initcall_sync(tce_iommu_init); + +#else + +void iommu_register_group(struct iommu_table *tbl, + int pci_domain_number, unsigned long pe_num) +{ +} + +#endif /* CONFIG_IOMMU_API */ diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 9c9d15e4cdf2..2931d97baa56 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -595,6 +595,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, TCE_PCI_SWINV_PAIR; } iommu_init_table(tbl, phb->hose->node); + iommu_register_group(tbl, pci_domain_nr(pe->pbus), pe->pe_number); return; fail: diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index 92b37a0186c9..5d378f2d9e26 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -86,8 +86,11 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { } static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) { - if (phb->p5ioc2.iommu_table.it_map == NULL) + if (phb->p5ioc2.iommu_table.it_map == NULL) { iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node); + iommu_register_group(&phb->p5ioc2.iommu_table, + pci_domain_nr(phb->hose->bus), phb->opal_id); + } set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table); } diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 277343cc6a3d..e16b729f46f9 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -412,6 +413,7 @@ static struct iommu_table *pnv_pci_setup_bml_iommu(struct pci_controller *hose) pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)), be32_to_cpup(sizep), 0); iommu_init_table(tbl, hose->node); + iommu_register_group(tbl, pci_domain_nr(hose->bus), 0); /* Deal with SW invalidated TCEs when needed (BML way) */ swinvp = of_get_property(hose->dn, "linux,tce-sw-invalidate-info", diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index c332fb98480d..3f3abde8a7f9 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -261,4 +261,12 @@ config SHMOBILE_IOMMU_L1SIZE default 256 if SHMOBILE_IOMMU_ADDRSIZE_64MB default 128 if SHMOBILE_IOMMU_ADDRSIZE_32MB +config SPAPR_TCE_IOMMU + bool "sPAPR TCE IOMMU Support" + depends on PPC_POWERNV + select IOMMU_API + help + Enables bits of IOMMU API required by VFIO. The iommu_ops + is not implemented as it is not necessary for VFIO. + endif # IOMMU_SUPPORT From 5ffd229c02731a91d08ca21e76b503c5bbb5c095 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 21 May 2013 13:33:10 +1000 Subject: [PATCH 020/156] powerpc/vfio: Implement IOMMU driver for VFIO VFIO implements platform independent stuff such as a PCI driver, BAR access (via read/write on a file descriptor or direct mapping when possible) and IRQ signaling. The platform dependent part includes IOMMU initialization and handling. This implements an IOMMU driver for VFIO which does mapping/unmapping pages for the guest IO and provides information about DMA window (required by a POWER guest). Cc: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras Acked-by: Alex Williamson Signed-off-by: Benjamin Herrenschmidt --- Documentation/vfio.txt | 63 +++++ drivers/vfio/Kconfig | 6 + drivers/vfio/Makefile | 1 + drivers/vfio/vfio.c | 1 + drivers/vfio/vfio_iommu_spapr_tce.c | 377 ++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 34 +++ 6 files changed, 482 insertions(+) create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt index 8eda3635a17d..c55533c0adb3 100644 --- a/Documentation/vfio.txt +++ b/Documentation/vfio.txt @@ -283,6 +283,69 @@ a direct pass through for VFIO_DEVICE_* ioctls. The read/write/mmap interfaces implement the device region access defined by the device's own VFIO_DEVICE_GET_REGION_INFO ioctl. + +PPC64 sPAPR implementation note +------------------------------------------------------------------------------- + +This implementation has some specifics: + +1) Only one IOMMU group per container is supported as an IOMMU group +represents the minimal entity which isolation can be guaranteed for and +groups are allocated statically, one per a Partitionable Endpoint (PE) +(PE is often a PCI domain but not always). + +2) The hardware supports so called DMA windows - the PCI address range +within which DMA transfer is allowed, any attempt to access address space +out of the window leads to the whole PE isolation. + +3) PPC64 guests are paravirtualized but not fully emulated. There is an API +to map/unmap pages for DMA, and it normally maps 1..32 pages per call and +currently there is no way to reduce the number of calls. In order to make things +faster, the map/unmap handling has been implemented in real mode which provides +an excellent performance which has limitations such as inability to do +locked pages accounting in real time. + +So 3 additional ioctls have been added: + + VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start + of the DMA window on the PCI bus. + + VFIO_IOMMU_ENABLE - enables the container. The locked pages accounting + is done at this point. This lets user first to know what + the DMA window is and adjust rlimit before doing any real job. + + VFIO_IOMMU_DISABLE - disables the container. + + +The code flow from the example above should be slightly changed: + + ..... + /* Add the group to the container */ + ioctl(group, VFIO_GROUP_SET_CONTAINER, &container); + + /* Enable the IOMMU model we want */ + ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU) + + /* Get addition sPAPR IOMMU info */ + vfio_iommu_spapr_tce_info spapr_iommu_info; + ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &spapr_iommu_info); + + if (ioctl(container, VFIO_IOMMU_ENABLE)) + /* Cannot enable container, may be low rlimit */ + + /* Allocate some space and setup a DMA mapping */ + dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + + dma_map.size = 1024 * 1024; + dma_map.iova = 0; /* 1MB starting at 0x0 from device view */ + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + + /* Check here is .iova/.size are within DMA window from spapr_iommu_info */ + + ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map); + ..... + ------------------------------------------------------------------------------- [1] VFIO was originally an acronym for "Virtual Function I/O" in its diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 7cd5dec0abd1..b464687f6e14 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1 depends on VFIO default n +config VFIO_IOMMU_SPAPR_TCE + tristate + depends on VFIO && SPAPR_TCE_IOMMU + default n + menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" depends on IOMMU_API select VFIO_IOMMU_TYPE1 if X86 + select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV help VFIO provides a framework for secure userspace device drivers. See Documentation/vfio.txt for more details. diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 2398d4a0e38b..72bfabc8629e 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_VFIO) += vfio.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o obj-$(CONFIG_VFIO_PCI) += pci/ diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 6d78736563de..259ad282ae5d 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1415,6 +1415,7 @@ static int __init vfio_init(void) * drivers. */ request_module_nowait("vfio_iommu_type1"); + request_module_nowait("vfio_iommu_spapr_tce"); return 0; diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c new file mode 100644 index 000000000000..bdae7a04af75 --- /dev/null +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -0,0 +1,377 @@ +/* + * VFIO: IOMMU DMA mapping support for TCE on POWER + * + * Copyright (C) 2013 IBM Corp. All rights reserved. + * Author: Alexey Kardashevskiy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Derived from original vfio_iommu_type1.c: + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "aik@ozlabs.ru" +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE" + +static void tce_iommu_detach_group(void *iommu_data, + struct iommu_group *iommu_group); + +/* + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation + * + * This code handles mapping and unmapping of user data buffers + * into DMA'ble space using the IOMMU + */ + +/* + * The container descriptor supports only a single group per container. + * Required by the API as the container is not supplied with the IOMMU group + * at the moment of initialization. + */ +struct tce_container { + struct mutex lock; + struct iommu_table *tbl; + bool enabled; +}; + +static int tce_iommu_enable(struct tce_container *container) +{ + int ret = 0; + unsigned long locked, lock_limit, npages; + struct iommu_table *tbl = container->tbl; + + if (!container->tbl) + return -ENXIO; + + if (!current->mm) + return -ESRCH; /* process exited */ + + if (container->enabled) + return -EBUSY; + + /* + * When userspace pages are mapped into the IOMMU, they are effectively + * locked memory, so, theoretically, we need to update the accounting + * of locked pages on each map and unmap. For powerpc, the map unmap + * paths can be very hot, though, and the accounting would kill + * performance, especially since it would be difficult to impossible + * to handle the accounting in real mode only. + * + * To address that, rather than precisely accounting every page, we + * instead account for a worst case on locked memory when the iommu is + * enabled and disabled. The worst case upper bound on locked memory + * is the size of the whole iommu window, which is usually relatively + * small (compared to total memory sizes) on POWER hardware. + * + * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits, + * that would effectively kill the guest at random points, much better + * enforcing the limit based on the max that the guest can map. + */ + down_write(¤t->mm->mmap_sem); + npages = (tbl->it_size << IOMMU_PAGE_SHIFT) >> PAGE_SHIFT; + locked = current->mm->locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { + pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n", + rlimit(RLIMIT_MEMLOCK)); + ret = -ENOMEM; + } else { + + current->mm->locked_vm += npages; + container->enabled = true; + } + up_write(¤t->mm->mmap_sem); + + return ret; +} + +static void tce_iommu_disable(struct tce_container *container) +{ + if (!container->enabled) + return; + + container->enabled = false; + + if (!container->tbl || !current->mm) + return; + + down_write(¤t->mm->mmap_sem); + current->mm->locked_vm -= (container->tbl->it_size << + IOMMU_PAGE_SHIFT) >> PAGE_SHIFT; + up_write(¤t->mm->mmap_sem); +} + +static void *tce_iommu_open(unsigned long arg) +{ + struct tce_container *container; + + if (arg != VFIO_SPAPR_TCE_IOMMU) { + pr_err("tce_vfio: Wrong IOMMU type\n"); + return ERR_PTR(-EINVAL); + } + + container = kzalloc(sizeof(*container), GFP_KERNEL); + if (!container) + return ERR_PTR(-ENOMEM); + + mutex_init(&container->lock); + + return container; +} + +static void tce_iommu_release(void *iommu_data) +{ + struct tce_container *container = iommu_data; + + WARN_ON(container->tbl && !container->tbl->it_group); + tce_iommu_disable(container); + + if (container->tbl && container->tbl->it_group) + tce_iommu_detach_group(iommu_data, container->tbl->it_group); + + mutex_destroy(&container->lock); + + kfree(container); +} + +static long tce_iommu_ioctl(void *iommu_data, + unsigned int cmd, unsigned long arg) +{ + struct tce_container *container = iommu_data; + unsigned long minsz; + long ret; + + switch (cmd) { + case VFIO_CHECK_EXTENSION: + return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0; + + case VFIO_IOMMU_SPAPR_TCE_GET_INFO: { + struct vfio_iommu_spapr_tce_info info; + struct iommu_table *tbl = container->tbl; + + if (WARN_ON(!tbl)) + return -ENXIO; + + minsz = offsetofend(struct vfio_iommu_spapr_tce_info, + dma32_window_size); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT; + info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT; + info.flags = 0; + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + case VFIO_IOMMU_MAP_DMA: { + struct vfio_iommu_type1_dma_map param; + struct iommu_table *tbl = container->tbl; + unsigned long tce, i; + + if (!tbl) + return -ENXIO; + + BUG_ON(!tbl->it_group); + + minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); + + if (copy_from_user(¶m, (void __user *)arg, minsz)) + return -EFAULT; + + if (param.argsz < minsz) + return -EINVAL; + + if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE)) + return -EINVAL; + + if ((param.size & ~IOMMU_PAGE_MASK) || + (param.vaddr & ~IOMMU_PAGE_MASK)) + return -EINVAL; + + /* iova is checked by the IOMMU API */ + tce = param.vaddr; + if (param.flags & VFIO_DMA_MAP_FLAG_READ) + tce |= TCE_PCI_READ; + if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) + tce |= TCE_PCI_WRITE; + + ret = iommu_tce_put_param_check(tbl, param.iova, tce); + if (ret) + return ret; + + for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT); ++i) { + ret = iommu_put_tce_user_mode(tbl, + (param.iova >> IOMMU_PAGE_SHIFT) + i, + tce); + if (ret) + break; + tce += IOMMU_PAGE_SIZE; + } + if (ret) + iommu_clear_tces_and_put_pages(tbl, + param.iova >> IOMMU_PAGE_SHIFT, i); + + iommu_flush_tce(tbl); + + return ret; + } + case VFIO_IOMMU_UNMAP_DMA: { + struct vfio_iommu_type1_dma_unmap param; + struct iommu_table *tbl = container->tbl; + + if (WARN_ON(!tbl)) + return -ENXIO; + + minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, + size); + + if (copy_from_user(¶m, (void __user *)arg, minsz)) + return -EFAULT; + + if (param.argsz < minsz) + return -EINVAL; + + /* No flag is supported now */ + if (param.flags) + return -EINVAL; + + if (param.size & ~IOMMU_PAGE_MASK) + return -EINVAL; + + ret = iommu_tce_clear_param_check(tbl, param.iova, 0, + param.size >> IOMMU_PAGE_SHIFT); + if (ret) + return ret; + + ret = iommu_clear_tces_and_put_pages(tbl, + param.iova >> IOMMU_PAGE_SHIFT, + param.size >> IOMMU_PAGE_SHIFT); + iommu_flush_tce(tbl); + + return ret; + } + case VFIO_IOMMU_ENABLE: + mutex_lock(&container->lock); + ret = tce_iommu_enable(container); + mutex_unlock(&container->lock); + return ret; + + + case VFIO_IOMMU_DISABLE: + mutex_lock(&container->lock); + tce_iommu_disable(container); + mutex_unlock(&container->lock); + return 0; + } + + return -ENOTTY; +} + +static int tce_iommu_attach_group(void *iommu_data, + struct iommu_group *iommu_group) +{ + int ret; + struct tce_container *container = iommu_data; + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group); + + BUG_ON(!tbl); + mutex_lock(&container->lock); + + /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n", + iommu_group_id(iommu_group), iommu_group); */ + if (container->tbl) { + pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n", + iommu_group_id(container->tbl->it_group), + iommu_group_id(iommu_group)); + ret = -EBUSY; + } else if (container->enabled) { + pr_err("tce_vfio: attaching group #%u to enabled container\n", + iommu_group_id(iommu_group)); + ret = -EBUSY; + } else { + ret = iommu_take_ownership(tbl); + if (!ret) + container->tbl = tbl; + } + + mutex_unlock(&container->lock); + + return ret; +} + +static void tce_iommu_detach_group(void *iommu_data, + struct iommu_group *iommu_group) +{ + struct tce_container *container = iommu_data; + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group); + + BUG_ON(!tbl); + mutex_lock(&container->lock); + if (tbl != container->tbl) { + pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n", + iommu_group_id(iommu_group), + iommu_group_id(tbl->it_group)); + } else { + if (container->enabled) { + pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n", + iommu_group_id(tbl->it_group)); + tce_iommu_disable(container); + } + + /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n", + iommu_group_id(iommu_group), iommu_group); */ + container->tbl = NULL; + iommu_release_ownership(tbl); + } + mutex_unlock(&container->lock); +} + +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = { + .name = "iommu-vfio-powerpc", + .owner = THIS_MODULE, + .open = tce_iommu_open, + .release = tce_iommu_release, + .ioctl = tce_iommu_ioctl, + .attach_group = tce_iommu_attach_group, + .detach_group = tce_iommu_detach_group, +}; + +static int __init tce_iommu_init(void) +{ + return vfio_register_iommu_driver(&tce_iommu_driver_ops); +} + +static void __exit tce_iommu_cleanup(void) +{ + vfio_unregister_iommu_driver(&tce_iommu_driver_ops); +} + +module_init(tce_iommu_init); +module_exit(tce_iommu_cleanup); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); + diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 284ff2436829..87ee4f4cff25 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -22,6 +22,7 @@ /* Extensions */ #define VFIO_TYPE1_IOMMU 1 +#define VFIO_SPAPR_TCE_IOMMU 2 /* * The IOCTL interface is designed for extensibility by embedding the @@ -375,4 +376,37 @@ struct vfio_iommu_type1_dma_unmap { #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) +/* + * IOCTLs to enable/disable IOMMU container usage. + * No parameters are supported. + */ +#define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) +#define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) + +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ + +/* + * The SPAPR TCE info struct provides the information about the PCI bus + * address ranges available for DMA, these values are programmed into + * the hardware so the guest has to know that information. + * + * The DMA 32 bit window start is an absolute PCI bus address. + * The IOVA address passed via map/unmap ioctls are absolute PCI bus + * addresses too so the window works as a filter rather than an offset + * for IOVA addresses. + * + * A flag will need to be added if other page sizes are supported, + * so as defined here, it is always 4k. + */ +struct vfio_iommu_spapr_tce_info { + __u32 argsz; + __u32 flags; /* reserved for future use */ + __u32 dma32_window_start; /* 32 bit window start (bytes) */ + __u32 dma32_window_size; /* 32 bit window size (bytes) */ +}; + +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + +/* ***************************************************************** */ + #endif /* _UAPIVFIO_H */ From 5b25199eff8e124297e6e95392f1719d20daca89 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 21 May 2013 13:33:11 +1000 Subject: [PATCH 021/156] powerpc/vfio: Enable on pSeries platform The enables VFIO on the pSeries platform, enabling user space programs to access PCI devices directly. Signed-off-by: Alexey Kardashevskiy Cc: David Gibson Signed-off-by: Paul Mackerras Acked-by: Alex Williamson Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/iommu.c | 4 ++++ drivers/iommu/Kconfig | 2 +- drivers/vfio/Kconfig | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 86ae364900d6..23fc1dcf4434 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -614,6 +614,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) iommu_table_setparms(pci->phb, dn, tbl); pci->iommu_table = iommu_init_table(tbl, pci->phb->node); + iommu_register_group(tbl, pci_domain_nr(bus), 0); /* Divide the rest (1.75GB) among the children */ pci->phb->dma_window_size = 0x80000000ul; @@ -658,6 +659,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) ppci->phb->node); iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window); ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node); + iommu_register_group(tbl, pci_domain_nr(bus), 0); pr_debug(" created table: %p\n", ppci->iommu_table); } } @@ -684,6 +686,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) phb->node); iommu_table_setparms(phb, dn, tbl); PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node); + iommu_register_group(tbl, pci_domain_nr(phb->bus), 0); set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table); return; } @@ -1184,6 +1187,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) pci->phb->node); iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window); pci->iommu_table = iommu_init_table(tbl, pci->phb->node); + iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0); pr_debug(" created table: %p\n", pci->iommu_table); } else { pr_debug(" found DMA window, table: %p\n", pci->iommu_table); diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 3f3abde8a7f9..01730b2b9954 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -263,7 +263,7 @@ config SHMOBILE_IOMMU_L1SIZE config SPAPR_TCE_IOMMU bool "sPAPR TCE IOMMU Support" - depends on PPC_POWERNV + depends on PPC_POWERNV || PPC_PSERIES select IOMMU_API help Enables bits of IOMMU API required by VFIO. The iommu_ops diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index b464687f6e14..26b3d9d1409f 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -12,7 +12,7 @@ menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" depends on IOMMU_API select VFIO_IOMMU_TYPE1 if X86 - select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV + select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES) help VFIO provides a framework for secure userspace device drivers. See Documentation/vfio.txt for more details. From d8899bb2be91b3a19ebf82b138232919ffcf833a Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 22 May 2013 09:50:58 +0530 Subject: [PATCH 022/156] powerpc: Debug control and status registers are 32bit Signed-off-by: Bharat Bhushan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/processor.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 7135a257f7cb..2b5a39c67df4 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -168,10 +168,10 @@ struct thread_struct { * The following help to manage the use of Debug Control Registers * om the BookE platforms. */ - unsigned long dbcr0; - unsigned long dbcr1; + uint32_t dbcr0; + uint32_t dbcr1; #ifdef CONFIG_BOOKE - unsigned long dbcr2; + uint32_t dbcr2; #endif /* * The stored value of the DBSR register will be the value at the @@ -179,7 +179,7 @@ struct thread_struct { * user (will never be written to) and has value while helping to * describe the reason for the last debug trap. Torez */ - unsigned long dbsr; + uint32_t dbsr; /* * The following will contain addresses used by debug applications * to help trace and trap on particular address locations. From 13d543cd79963133cd26748547552c99df8d23a7 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 22 May 2013 09:50:59 +0530 Subject: [PATCH 023/156] powerpc: Restore dbcr0 on user space exit On BookE (Branch taken + Single Step) is as same as Branch Taken on BookS and in Linux we simulate BookS behavior for BookE as well. When doing so, in Branch taken handling we want to set DBCR0_IC but we update the current->thread->dbcr0 and not DBCR0. Now on 64bit the current->thread.dbcr0 (and other debug registers) is synchronized ONLY on context switch flow. But after handling Branch taken in debug exception if we return back to user space without context switch then single stepping change (DBCR0_ICMP) does not get written in h/w DBCR0 and Instruction Complete exception does not happen. This fixes using ptrace reliably on BookE-PowerPC lmbench latency test (lat_syscall) Results are (they varies a little on each run) 1) ./lat_syscall /dev/shm/uImage action: Open read write stat fstat null Before: 3.8618 0.2017 0.2851 1.6789 0.2256 0.0856 After: 3.8580 0.2017 0.2851 1.6955 0.2255 0.0856 1) ./lat_syscall -P 2 -N 10 /dev/shm/uImage action: Open read write stat fstat null Before: 4.1388 0.2238 0.3066 1.7106 0.2256 0.0856 After: 4.1413 0.2236 0.3062 1.7107 0.2256 0.0856 [ Slightly modified to avoid extra branch in the fast path on Book3S and fix build on all non-BookE 64-bit -- BenH ] Signed-off-by: Bharat Bhushan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/asm-offsets.c | 6 +++--- arch/powerpc/kernel/entry_64.S | 30 ++++++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6f16ffafa6f0..dc684b1af1c4 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -105,9 +105,6 @@ int main(void) DEFINE(KSP_VSID, offsetof(struct thread_struct, ksp_vsid)); #else /* CONFIG_PPC64 */ DEFINE(PGDIR, offsetof(struct thread_struct, pgdir)); -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0)); -#endif #ifdef CONFIG_SPE DEFINE(THREAD_EVR0, offsetof(struct thread_struct, evr[0])); DEFINE(THREAD_ACC, offsetof(struct thread_struct, acc)); @@ -115,6 +112,9 @@ int main(void) DEFINE(THREAD_USED_SPE, offsetof(struct thread_struct, used_spe)); #endif /* CONFIG_SPE */ #endif /* CONFIG_PPC64 */ +#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) + DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0)); +#endif #ifdef CONFIG_KVM_BOOK3S_32_HANDLER DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu)); #endif diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 8741c854e03d..ab15b8d057ad 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -629,21 +629,43 @@ _GLOBAL(ret_from_except_lite) CURRENT_THREAD_INFO(r9, r1) ld r3,_MSR(r1) +#ifdef CONFIG_PPC_BOOK3E + ld r10,PACACURRENT(r13) +#endif /* CONFIG_PPC_BOOK3E */ ld r4,TI_FLAGS(r9) andi. r3,r3,MSR_PR beq resume_kernel +#ifdef CONFIG_PPC_BOOK3E + lwz r3,(THREAD+THREAD_DBCR0)(r10) +#endif /* CONFIG_PPC_BOOK3E */ /* Check current_thread_info()->flags */ andi. r0,r4,_TIF_USER_WORK_MASK +#ifdef CONFIG_PPC_BOOK3E + bne 1f + /* + * Check to see if the dbcr0 register is set up to debug. + * Use the internal debug mode bit to do this. + */ + andis. r0,r3,DBCR0_IDM@h beq restore - - andi. r0,r4,_TIF_NEED_RESCHED - beq 1f + mfmsr r0 + rlwinm r0,r0,0,~MSR_DE /* Clear MSR.DE */ + mtmsr r0 + mtspr SPRN_DBCR0,r3 + li r10, -1 + mtspr SPRN_DBSR,r10 + b restore +#else + beq restore +#endif +1: andi. r0,r4,_TIF_NEED_RESCHED + beq 2f bl .restore_interrupts SCHEDULE_USER b .ret_from_except_lite -1: bl .save_nvgprs +2: bl .save_nvgprs bl .restore_interrupts addi r3,r1,STACK_FRAME_OVERHEAD bl .do_notify_resume From a5b45ded097908d40803b5c2770259398811b24e Mon Sep 17 00:00:00 2001 From: liguang Date: Thu, 30 May 2013 14:47:53 +0800 Subject: [PATCH 024/156] powerpc/smp: Use '==' instead of '<' for system_state 'system_state < SYSTEM_RUNNING' will have same effect with 'system_state == SYSTEM_BOOTING', but the later one is more clearer. Signed-off-by: liguang Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/cell/smp.c | 2 +- arch/powerpc/platforms/powernv/smp.c | 2 +- arch/powerpc/platforms/pseries/smp.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c index d35dbbc8ec79..f75f6fcac729 100644 --- a/arch/powerpc/platforms/cell/smp.c +++ b/arch/powerpc/platforms/cell/smp.c @@ -142,7 +142,7 @@ static int smp_cell_cpu_bootable(unsigned int nr) * during boot if the user requests it. Odd-numbered * cpus are assumed to be secondary threads. */ - if (system_state < SYSTEM_RUNNING && + if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT) && !smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) return 0; diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 88c9459c3e07..77784aead1c2 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -51,7 +51,7 @@ static int pnv_smp_cpu_bootable(unsigned int nr) /* Special case - we inhibit secondary thread startup * during boot if the user requests it. */ - if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) { + if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) { if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) return 0; if (smt_enabled_at_boot diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 12bc8c3663ad..306643cc9dbc 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -192,7 +192,7 @@ static int smp_pSeries_cpu_bootable(unsigned int nr) /* Special case - we inhibit secondary thread startup * during boot if the user requests it. */ - if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) { + if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) { if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) return 0; if (smt_enabled_at_boot From 1b7e0cbe4969dd4c2945b2fb8d6881a03b4d0ac7 Mon Sep 17 00:00:00 2001 From: liguang Date: Thu, 30 May 2013 15:20:33 +0800 Subject: [PATCH 025/156] powerpc/pseries: Use 'true' instead of '1' for orderly_poweroff orderly_poweroff is expecting a bool parameter, so use 'true' instead '1' Signed-off-by: liguang Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/ras.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index c4dfccd3a3d9..7b3cbde8c783 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -83,7 +83,7 @@ static void handle_system_shutdown(char event_modifier) switch (event_modifier) { case EPOW_SHUTDOWN_NORMAL: pr_emerg("Firmware initiated power off"); - orderly_poweroff(1); + orderly_poweroff(true); break; case EPOW_SHUTDOWN_ON_UPS: @@ -95,13 +95,13 @@ static void handle_system_shutdown(char event_modifier) pr_emerg("Loss of system critical functions reported by " "firmware"); pr_emerg("Check RTAS error log for details"); - orderly_poweroff(1); + orderly_poweroff(true); break; case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: pr_emerg("Ambient temperature too high reported by firmware"); pr_emerg("Check RTAS error log for details"); - orderly_poweroff(1); + orderly_poweroff(true); break; default: @@ -162,7 +162,7 @@ void rtas_parse_epow_errlog(struct rtas_error_log *log) case EPOW_SYSTEM_HALT: pr_emerg("Firmware initiated power off"); - orderly_poweroff(1); + orderly_poweroff(true); break; case EPOW_MAIN_ENCLOSURE: From 475e68cfdde39e6d95055999b0cb42fdb2bea0ca Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 5 Jun 2013 13:02:26 +1000 Subject: [PATCH 026/156] powerpc: Align thread->fpr to 16 bytes On newer CPUs we use VSX loads and stores to the thread->fpr array. For best performance we need to ensure 16 byte alignment. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 2b5a39c67df4..9fe1129efa02 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -200,7 +200,7 @@ struct thread_struct { #endif #endif /* FP and VSX 0-31 register set */ - double fpr[32][TS_FPRWIDTH]; + double fpr[32][TS_FPRWIDTH] __attribute__((aligned(16))); struct { unsigned int pad; From 5fb621698e94e3af8b413d9439041fde48e2784d Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 5 Jun 2013 15:34:02 +0800 Subject: [PATCH 027/156] powerpc/eeh: Fix fetching bus for single-dev-PE While running Linux as guest on top of phyp, we possiblly have PE that includes single PCI device. However, we didn't return its PCI bus correctly and it leads to failure on recovery from EEH errors for single-dev-PE. The patch fixes the issue. Cc: # v3.7+ Cc: Steve Best Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/eeh_pe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/eeh_pe.c b/arch/powerpc/platforms/pseries/eeh_pe.c index fe43d1aa2cf1..9d4a9e8562b2 100644 --- a/arch/powerpc/platforms/pseries/eeh_pe.c +++ b/arch/powerpc/platforms/pseries/eeh_pe.c @@ -639,7 +639,8 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) if (pe->type & EEH_PE_PHB) { bus = pe->phb->bus; - } else if (pe->type & EEH_PE_BUS) { + } else if (pe->type & EEH_PE_BUS || + pe->type & EEH_PE_DEVICE) { edev = list_first_entry(&pe->edevs, struct eeh_dev, list); pdev = eeh_dev_to_pci_dev(edev); if (pdev) From 2d5c121678ce2c7e12ecc174121badeea0b98fe0 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 5 Jun 2013 15:34:03 +0800 Subject: [PATCH 028/156] powerpc/eeh: Enhance converting EEH dev Under some special circumstances, the EEH device doesn't have the associated device tree node or PCI device. The patch enhances those functions converting EEH device to device tree node or PCI device accordingly to avoid unnecessary system crash. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/eeh.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index a80e32b46c11..e32c3c53eb8b 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -95,12 +95,12 @@ struct eeh_dev { static inline struct device_node *eeh_dev_to_of_node(struct eeh_dev *edev) { - return edev->dn; + return edev ? edev->dn : NULL; } static inline struct pci_dev *eeh_dev_to_pci_dev(struct eeh_dev *edev) { - return edev->pdev; + return edev ? edev->pdev : NULL; } /* From 1bf247f8df2e37b53d0415015e56910677626108 Mon Sep 17 00:00:00 2001 From: Aruna Balakrishnaiah Date: Thu, 6 Jun 2013 00:20:55 +0530 Subject: [PATCH 029/156] powerpc/pseries: Remove syslog prefix in uncompressed oops text Removal of syslog prefix in the uncompressed oops text will help in capturing more oops data. Signed-off-by: Aruna Balakrishnaiah Reviewed-by: Jim Keniston Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/nvram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 8733a86ad52e..e54a8b7f5c64 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -619,7 +619,7 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, } if (rc != 0) { kmsg_dump_rewind(dumper); - kmsg_dump_get_buffer(dumper, true, + kmsg_dump_get_buffer(dumper, false, oops_data, oops_data_sz, &text_len); err_type = ERR_TYPE_KERNEL_PANIC; *oops_len = (u16) text_len; From b1f70e1f72179f7afe02f4131ed15da406f93e0d Mon Sep 17 00:00:00 2001 From: Aruna Balakrishnaiah Date: Thu, 6 Jun 2013 00:21:05 +0530 Subject: [PATCH 030/156] powerpc/pseries: Add version and timestamp to oops header Introduce version and timestamp information in the oops header. oops_log_info (oops header) holds version (to distinguish between old and new format oops header), length of the oops text (compressed or uncompressed) and timestamp. The version field will sit in the same place as the length in old headers. version is assigned 5000 (greater than oops partition size) so that existing tools will refuse to dump new style partitions as the length is too large. The updated tools will work with both old and new format headers. Signed-off-by: Aruna Balakrishnaiah Reviewed-by: Jim Keniston Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/nvram.c | 57 +++++++++++++++++--------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index e54a8b7f5c64..742735acd8b6 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -29,6 +29,13 @@ /* Max bytes to read/write in one go */ #define NVRW_CNT 0x20 +/* + * Set oops header version to distingush between old and new format header. + * lnx,oops-log partition max size is 4000, header version > 4000 will + * help in identifying new header. + */ +#define OOPS_HDR_VERSION 5000 + static unsigned int nvram_size; static int nvram_fetch, nvram_store; static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */ @@ -67,6 +74,12 @@ static const char *pseries_nvram_os_partitions[] = { NULL }; +struct oops_log_info { + u16 version; + u16 report_length; + u64 timestamp; +} __attribute__((packed)); + static void oops_to_nvram(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason); @@ -83,28 +96,28 @@ static unsigned long last_unread_rtas_event; /* timestamp */ * big_oops_buf[] holds the uncompressed text we're capturing. * - * oops_buf[] holds the compressed text, preceded by a prefix. - * The prefix is just a u16 holding the length of the compressed* text. - * (*Or uncompressed, if compression fails.) oops_buf[] gets written - * to NVRAM. + * oops_buf[] holds the compressed text, preceded by a oops header. + * oops header has u16 holding the version of oops header (to differentiate + * between old and new format header) followed by u16 holding the length of + * the compressed* text (*Or uncompressed, if compression fails.) and u64 + * holding the timestamp. oops_buf[] gets written to NVRAM. * - * oops_len points to the prefix. oops_data points to the compressed text. + * oops_log_info points to the header. oops_data points to the compressed text. * * +- oops_buf - * | +- oops_data - * v v - * +------------+-----------------------------------------------+ - * | length | text | - * | (2 bytes) | (oops_data_sz bytes) | - * +------------+-----------------------------------------------+ + * | +- oops_data + * v v + * +-----------+-----------+-----------+------------------------+ + * | version | length | timestamp | text | + * | (2 bytes) | (2 bytes) | (8 bytes) | (oops_data_sz bytes) | + * +-----------+-----------+-----------+------------------------+ * ^ - * +- oops_len + * +- oops_log_info * * We preallocate these buffers during init to avoid kmalloc during oops/panic. */ static size_t big_oops_buf_sz; static char *big_oops_buf, *oops_buf; -static u16 *oops_len; static char *oops_data; static size_t oops_data_sz; @@ -425,9 +438,8 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists) oops_log_partition.name); return; } - oops_len = (u16*) oops_buf; - oops_data = oops_buf + sizeof(u16); - oops_data_sz = oops_log_partition.size - sizeof(u16); + oops_data = oops_buf + sizeof(struct oops_log_info); + oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info); /* * Figure compression (preceded by elimination of each line's @@ -555,6 +567,7 @@ error: /* Compress the text from big_oops_buf into oops_buf. */ static int zip_oops(size_t text_len) { + struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len, oops_data_sz); if (zipped_len < 0) { @@ -562,7 +575,9 @@ static int zip_oops(size_t text_len) pr_err("nvram: logging uncompressed oops/panic report\n"); return -1; } - *oops_len = (u16) zipped_len; + oops_hdr->version = OOPS_HDR_VERSION; + oops_hdr->report_length = (u16) zipped_len; + oops_hdr->timestamp = get_seconds(); return 0; } @@ -576,6 +591,7 @@ static int zip_oops(size_t text_len) static void oops_to_nvram(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason) { + struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; static unsigned int oops_count = 0; static bool panicking = false; static DEFINE_SPINLOCK(lock); @@ -622,11 +638,14 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, kmsg_dump_get_buffer(dumper, false, oops_data, oops_data_sz, &text_len); err_type = ERR_TYPE_KERNEL_PANIC; - *oops_len = (u16) text_len; + oops_hdr->version = OOPS_HDR_VERSION; + oops_hdr->report_length = (u16) text_len; + oops_hdr->timestamp = get_seconds(); } (void) nvram_write_os_partition(&oops_log_partition, oops_buf, - (int) (sizeof(*oops_len) + *oops_len), err_type, ++oops_count); + (int) (sizeof(*oops_hdr) + oops_hdr->report_length), err_type, + ++oops_count); spin_unlock_irqrestore(&lock, flags); } From 126746101e89991f179209ef58ab5937bc5ea4a3 Mon Sep 17 00:00:00 2001 From: Aruna Balakrishnaiah Date: Thu, 6 Jun 2013 00:21:16 +0530 Subject: [PATCH 031/156] powerpc/pseries: Introduce generic read function to read nvram-partitions Introduce generic read function to read nvram partitions other than rtas. nvram_read_error_log will be retained which is used to read rtas partition from rtasd. nvram_read_partition is the generic read function to read from any nvram partition. Signed-off-by: Aruna Balakrishnaiah Reviewed-by: Jim Keniston Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/nvram.c | 32 ++++++++++++++++++-------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 742735acd8b6..088f023d8f25 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -293,34 +293,35 @@ int nvram_write_error_log(char * buff, int length, return rc; } -/* nvram_read_error_log +/* nvram_read_partition * - * Reads nvram for error log for at most 'length' + * Reads nvram partition for at most 'length' */ -int nvram_read_error_log(char * buff, int length, - unsigned int * err_type, unsigned int * error_log_cnt) +int nvram_read_partition(struct nvram_os_partition *part, char *buff, + int length, unsigned int *err_type, + unsigned int *error_log_cnt) { int rc; loff_t tmp_index; struct err_log_info info; - if (rtas_log_partition.index == -1) + if (part->index == -1) return -1; - if (length > rtas_log_partition.size) - length = rtas_log_partition.size; + if (length > part->size) + length = part->size; - tmp_index = rtas_log_partition.index; + tmp_index = part->index; rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index); if (rc <= 0) { - printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); + pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__, rc); return rc; } rc = ppc_md.nvram_read(buff, length, &tmp_index); if (rc <= 0) { - printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); + pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__, rc); return rc; } @@ -330,6 +331,17 @@ int nvram_read_error_log(char * buff, int length, return 0; } +/* nvram_read_error_log + * + * Reads nvram for error log for at most 'length' + */ +int nvram_read_error_log(char *buff, int length, + unsigned int *err_type, unsigned int *error_log_cnt) +{ + return nvram_read_partition(&rtas_log_partition, buff, length, + err_type, error_log_cnt); +} + /* This doesn't actually zero anything, but it sets the event_logged * word to tell that this event is safely in syslog. */ From d7563c94f728d8c9228cfddbb044c843b2e243e8 Mon Sep 17 00:00:00 2001 From: Aruna Balakrishnaiah Date: Thu, 6 Jun 2013 00:21:32 +0530 Subject: [PATCH 032/156] powerpc/pseries: Read/Write oops nvram partition via pstore IBM's p series machines provide persistent storage for LPARs through NVRAM. NVRAM's lnx,oops-log partition is used to log oops messages. Currently the kernel provides the contents of p-series NVRAM only as a simple stream of bytes via /dev/nvram, which must be interpreted in user space by the nvram command in the powerpc-utils package. This patch set exploits the pstore subsystem to expose oops partition in NVRAM as a separate file in /dev/pstore. For instance, Oops messages will be stored in a file named [dmesg-nvram-2]. In case pstore registration fails it will fall back to kmsg_dump mechanism. This patch will read/write the oops messages from/to this partition via pstore. Signed-off-by: Jim Keniston Signed-off-by: Aruna Balakrishnaiah Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/nvram.c | 172 ++++++++++++++++++++++--- 1 file changed, 157 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 088f023d8f25..9edec8ee02ef 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -127,6 +128,14 @@ static size_t oops_data_sz; #define MEM_LEVEL 4 static struct z_stream_s stream; +#ifdef CONFIG_PSTORE +static enum pstore_type_id nvram_type_ids[] = { + PSTORE_TYPE_DMESG, + -1 +}; +static int read_type; +#endif + static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index) { unsigned int i; @@ -430,6 +439,149 @@ static int __init pseries_nvram_init_os_partition(struct nvram_os_partition return 0; } +/* + * Are we using the ibm,rtas-log for oops/panic reports? And if so, + * would logging this oops/panic overwrite an RTAS event that rtas_errd + * hasn't had a chance to read and process? Return 1 if so, else 0. + * + * We assume that if rtas_errd hasn't read the RTAS event in + * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to. + */ +static int clobbering_unread_rtas_event(void) +{ + return (oops_log_partition.index == rtas_log_partition.index + && last_unread_rtas_event + && get_seconds() - last_unread_rtas_event <= + NVRAM_RTAS_READ_TIMEOUT); +} + +#ifdef CONFIG_PSTORE +static int nvram_pstore_open(struct pstore_info *psi) +{ + /* Reset the iterator to start reading partitions again */ + read_type = -1; + return 0; +} + +/** + * nvram_pstore_write - pstore write callback for nvram + * @type: Type of message logged + * @reason: reason behind dump (oops/panic) + * @id: identifier to indicate the write performed + * @part: pstore writes data to registered buffer in parts, + * part number will indicate the same. + * @count: Indicates oops count + * @size: number of bytes written to the registered buffer + * @psi: registered pstore_info structure + * + * Called by pstore_dump() when an oops or panic report is logged in the + * printk buffer. + * Returns 0 on successful write. + */ +static int nvram_pstore_write(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, int count, + size_t size, struct pstore_info *psi) +{ + int rc; + struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf; + + /* part 1 has the recent messages from printk buffer */ + if (part > 1 || type != PSTORE_TYPE_DMESG || + clobbering_unread_rtas_event()) + return -1; + + oops_hdr->version = OOPS_HDR_VERSION; + oops_hdr->report_length = (u16) size; + oops_hdr->timestamp = get_seconds(); + rc = nvram_write_os_partition(&oops_log_partition, oops_buf, + (int) (sizeof(*oops_hdr) + size), ERR_TYPE_KERNEL_PANIC, + count); + + if (rc != 0) + return rc; + + *id = part; + return 0; +} + +/* + * Reads the oops/panic report. + * Returns the length of the data we read from each partition. + * Returns 0 if we've been called before. + */ +static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, + int *count, struct timespec *time, char **buf, + struct pstore_info *psi) +{ + struct oops_log_info *oops_hdr; + unsigned int err_type, id_no; + struct nvram_os_partition *part = NULL; + char *buff = NULL; + + read_type++; + + switch (nvram_type_ids[read_type]) { + case PSTORE_TYPE_DMESG: + part = &oops_log_partition; + *type = PSTORE_TYPE_DMESG; + break; + default: + return 0; + } + + buff = kmalloc(part->size, GFP_KERNEL); + + if (!buff) + return -ENOMEM; + + if (nvram_read_partition(part, buff, part->size, &err_type, &id_no)) { + kfree(buff); + return 0; + } + + *count = 0; + *id = id_no; + oops_hdr = (struct oops_log_info *)buff; + *buf = buff + sizeof(*oops_hdr); + time->tv_sec = oops_hdr->timestamp; + time->tv_nsec = 0; + return oops_hdr->report_length; +} + +static struct pstore_info nvram_pstore_info = { + .owner = THIS_MODULE, + .name = "nvram", + .open = nvram_pstore_open, + .read = nvram_pstore_read, + .write = nvram_pstore_write, +}; + +static int nvram_pstore_init(void) +{ + int rc = 0; + + nvram_pstore_info.buf = oops_data; + nvram_pstore_info.bufsize = oops_data_sz; + + rc = pstore_register(&nvram_pstore_info); + if (rc != 0) + pr_err("nvram: pstore_register() failed, defaults to " + "kmsg_dump; returned %d\n", rc); + else + /*TODO: Support compression when pstore is configured */ + pr_info("nvram: Compression of oops text supported only when " + "pstore is not configured"); + + return rc; +} +#else +static int nvram_pstore_init(void) +{ + return -1; +} +#endif + static void __init nvram_init_oops_partition(int rtas_partition_exists) { int rc; @@ -453,6 +605,11 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists) oops_data = oops_buf + sizeof(struct oops_log_info); oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info); + rc = nvram_pstore_init(); + + if (!rc) + return; + /* * Figure compression (preceded by elimination of each line's * severity prefix) will reduce the oops/panic report to at most @@ -525,21 +682,6 @@ int __init pSeries_nvram_init(void) return 0; } -/* - * Are we using the ibm,rtas-log for oops/panic reports? And if so, - * would logging this oops/panic overwrite an RTAS event that rtas_errd - * hasn't had a chance to read and process? Return 1 if so, else 0. - * - * We assume that if rtas_errd hasn't read the RTAS event in - * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to. - */ -static int clobbering_unread_rtas_event(void) -{ - return (oops_log_partition.index == rtas_log_partition.index - && last_unread_rtas_event - && get_seconds() - last_unread_rtas_event <= - NVRAM_RTAS_READ_TIMEOUT); -} /* Derived from logfs_compress() */ static int nvram_compress(const void *in, void *out, size_t inlen, From 69020eea973d95766e905ee0ce7773e0027377a3 Mon Sep 17 00:00:00 2001 From: Aruna Balakrishnaiah Date: Thu, 6 Jun 2013 00:21:44 +0530 Subject: [PATCH 033/156] powerpc/pseries: Read rtas partition via pstore This patch set exploits the pstore subsystem to read details of rtas partition in NVRAM to a separate file in /dev/pstore. For instance, rtas details will be stored in a file named [rtas-nvram-4]. Signed-off-by: Aruna Balakrishnaiah Reviewed-by: Jim Keniston Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/nvram.c | 33 ++++++++++++++++++++------ fs/pstore/inode.c | 3 +++ include/linux/pstore.h | 2 ++ 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 9edec8ee02ef..78d72f0175ad 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -131,9 +131,11 @@ static struct z_stream_s stream; #ifdef CONFIG_PSTORE static enum pstore_type_id nvram_type_ids[] = { PSTORE_TYPE_DMESG, + PSTORE_TYPE_PPC_RTAS, -1 }; static int read_type; +static unsigned long last_rtas_event; #endif static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index) @@ -297,8 +299,13 @@ int nvram_write_error_log(char * buff, int length, { int rc = nvram_write_os_partition(&rtas_log_partition, buff, length, err_type, error_log_cnt); - if (!rc) + if (!rc) { last_unread_rtas_event = get_seconds(); +#ifdef CONFIG_PSTORE + last_rtas_event = get_seconds(); +#endif + } + return rc; } @@ -506,7 +513,7 @@ static int nvram_pstore_write(enum pstore_type_id type, } /* - * Reads the oops/panic report. + * Reads the oops/panic report and ibm,rtas-log partition. * Returns the length of the data we read from each partition. * Returns 0 if we've been called before. */ @@ -526,6 +533,12 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, part = &oops_log_partition; *type = PSTORE_TYPE_DMESG; break; + case PSTORE_TYPE_PPC_RTAS: + part = &rtas_log_partition; + *type = PSTORE_TYPE_PPC_RTAS; + time->tv_sec = last_rtas_event; + time->tv_nsec = 0; + break; default: return 0; } @@ -542,11 +555,17 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, *count = 0; *id = id_no; - oops_hdr = (struct oops_log_info *)buff; - *buf = buff + sizeof(*oops_hdr); - time->tv_sec = oops_hdr->timestamp; - time->tv_nsec = 0; - return oops_hdr->report_length; + + if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) { + oops_hdr = (struct oops_log_info *)buff; + *buf = buff + sizeof(*oops_hdr); + time->tv_sec = oops_hdr->timestamp; + time->tv_nsec = 0; + return oops_hdr->report_length; + } + + *buf = buff; + return part->size; } static struct pstore_info nvram_pstore_info = { diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index e4bcb2cf055a..ec24f9ceb5ed 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -324,6 +324,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_MCE: sprintf(name, "mce-%s-%lld", psname, id); break; + case PSTORE_TYPE_PPC_RTAS: + sprintf(name, "rtas-%s-%lld", psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, "unknown-%s-%lld", psname, id); break; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 75d01760c911..d7a8fe938c0f 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -35,6 +35,8 @@ enum pstore_type_id { PSTORE_TYPE_MCE = 1, PSTORE_TYPE_CONSOLE = 2, PSTORE_TYPE_FTRACE = 3, + /* PPC64 partition types */ + PSTORE_TYPE_PPC_RTAS = 4, PSTORE_TYPE_UNKNOWN = 255 }; From edf38465a32c2820350da45a2231d2c53ad2d3c0 Mon Sep 17 00:00:00 2001 From: Aruna Balakrishnaiah Date: Thu, 6 Jun 2013 00:21:59 +0530 Subject: [PATCH 034/156] powerpc/pseries: Distinguish between a os-partition and non-os partition Introduce os_partition member in nvram_os_partition structure to identify if the partition is an os partition or not. This will be useful to handle non-os partitions of-config and common. Signed-off-by: Aruna Balakrishnaiah Reviewed-by: Jim Keniston Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/nvram.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 78d72f0175ad..714ed8ac7d59 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -53,20 +53,23 @@ struct nvram_os_partition { int min_size; /* minimum acceptable size (0 means req_size) */ long size; /* size of data portion (excluding err_log_info) */ long index; /* offset of data portion of partition */ + bool os_partition; /* partition initialized by OS, not FW */ }; static struct nvram_os_partition rtas_log_partition = { .name = "ibm,rtas-log", .req_size = 2079, .min_size = 1055, - .index = -1 + .index = -1, + .os_partition = true }; static struct nvram_os_partition oops_log_partition = { .name = "lnx,oops-log", .req_size = 4000, .min_size = 2000, - .index = -1 + .index = -1, + .os_partition = true }; static const char *pseries_nvram_os_partitions[] = { From f33f748c964f6a6ee272b1c794b52f54f4da1d04 Mon Sep 17 00:00:00 2001 From: Aruna Balakrishnaiah Date: Thu, 6 Jun 2013 00:22:10 +0530 Subject: [PATCH 035/156] powerpc/pseries: Read of-config partition via pstore This patch set exploits the pstore subsystem to read details of of-config partition in NVRAM to a separate file in /dev/pstore. For instance, of-config partition details will be stored in a file named [of-nvram-5]. Signed-off-by: Aruna Balakrishnaiah Reviewed-by: Jim Keniston Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/nvram.c | 55 +++++++++++++++++++++----- fs/pstore/inode.c | 3 ++ include/linux/pstore.h | 1 + 3 files changed, 50 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 714ed8ac7d59..f7392f6ea7b3 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -132,9 +132,16 @@ static size_t oops_data_sz; static struct z_stream_s stream; #ifdef CONFIG_PSTORE +static struct nvram_os_partition of_config_partition = { + .name = "of-config", + .index = -1, + .os_partition = false +}; + static enum pstore_type_id nvram_type_ids[] = { PSTORE_TYPE_DMESG, PSTORE_TYPE_PPC_RTAS, + PSTORE_TYPE_PPC_OF, -1 }; static int read_type; @@ -332,10 +339,15 @@ int nvram_read_partition(struct nvram_os_partition *part, char *buff, tmp_index = part->index; - rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index); - if (rc <= 0) { - pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__, rc); - return rc; + if (part->os_partition) { + rc = ppc_md.nvram_read((char *)&info, + sizeof(struct err_log_info), + &tmp_index); + if (rc <= 0) { + pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__, + rc); + return rc; + } } rc = ppc_md.nvram_read(buff, length, &tmp_index); @@ -344,8 +356,10 @@ int nvram_read_partition(struct nvram_os_partition *part, char *buff, return rc; } - *error_log_cnt = info.seq_num; - *err_type = info.error_type; + if (part->os_partition) { + *error_log_cnt = info.seq_num; + *err_type = info.error_type; + } return 0; } @@ -516,7 +530,7 @@ static int nvram_pstore_write(enum pstore_type_id type, } /* - * Reads the oops/panic report and ibm,rtas-log partition. + * Reads the oops/panic report, rtas and of-config partition. * Returns the length of the data we read from each partition. * Returns 0 if we've been called before. */ @@ -525,9 +539,11 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, struct pstore_info *psi) { struct oops_log_info *oops_hdr; - unsigned int err_type, id_no; + unsigned int err_type, id_no, size = 0; struct nvram_os_partition *part = NULL; char *buff = NULL; + int sig = 0; + loff_t p; read_type++; @@ -542,10 +558,29 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, time->tv_sec = last_rtas_event; time->tv_nsec = 0; break; + case PSTORE_TYPE_PPC_OF: + sig = NVRAM_SIG_OF; + part = &of_config_partition; + *type = PSTORE_TYPE_PPC_OF; + *id = PSTORE_TYPE_PPC_OF; + time->tv_sec = 0; + time->tv_nsec = 0; + break; default: return 0; } + if (!part->os_partition) { + p = nvram_find_partition(part->name, sig, &size); + if (p <= 0) { + pr_err("nvram: Failed to find partition %s, " + "err %d\n", part->name, (int)p); + return 0; + } + part->index = p; + part->size = size; + } + buff = kmalloc(part->size, GFP_KERNEL); if (!buff) @@ -557,7 +592,9 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, } *count = 0; - *id = id_no; + + if (part->os_partition) + *id = id_no; if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) { oops_hdr = (struct oops_log_info *)buff; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index ec24f9ceb5ed..73148aef9e31 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -327,6 +327,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_PPC_RTAS: sprintf(name, "rtas-%s-%lld", psname, id); break; + case PSTORE_TYPE_PPC_OF: + sprintf(name, "powerpc-ofw-%s-%lld", psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, "unknown-%s-%lld", psname, id); break; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index d7a8fe938c0f..615dc18638b8 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -37,6 +37,7 @@ enum pstore_type_id { PSTORE_TYPE_FTRACE = 3, /* PPC64 partition types */ PSTORE_TYPE_PPC_RTAS = 4, + PSTORE_TYPE_PPC_OF = 5, PSTORE_TYPE_UNKNOWN = 255 }; From a5e4797b0f46819a74a7233825137ed5d2f51b51 Mon Sep 17 00:00:00 2001 From: Aruna Balakrishnaiah Date: Thu, 6 Jun 2013 00:22:20 +0530 Subject: [PATCH 036/156] powerpc/pseries: Read common partition via pstore This patch exploits pstore subsystem to read details of common partition in NVRAM to a separate file in /dev/pstore. For instance, common partition details will be stored in a file named [common-nvram-6]. Signed-off-by: Aruna Balakrishnaiah Reviewed-by: Jim Keniston Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/nvram.c | 17 ++++++++++++++++- fs/pstore/inode.c | 3 +++ include/linux/pstore.h | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index f7392f6ea7b3..14cc486709f6 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -138,10 +138,17 @@ static struct nvram_os_partition of_config_partition = { .os_partition = false }; +static struct nvram_os_partition common_partition = { + .name = "common", + .index = -1, + .os_partition = false +}; + static enum pstore_type_id nvram_type_ids[] = { PSTORE_TYPE_DMESG, PSTORE_TYPE_PPC_RTAS, PSTORE_TYPE_PPC_OF, + PSTORE_TYPE_PPC_COMMON, -1 }; static int read_type; @@ -530,7 +537,7 @@ static int nvram_pstore_write(enum pstore_type_id type, } /* - * Reads the oops/panic report, rtas and of-config partition. + * Reads the oops/panic report, rtas, of-config and common partition. * Returns the length of the data we read from each partition. * Returns 0 if we've been called before. */ @@ -566,6 +573,14 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, time->tv_sec = 0; time->tv_nsec = 0; break; + case PSTORE_TYPE_PPC_COMMON: + sig = NVRAM_SIG_SYS; + part = &common_partition; + *type = PSTORE_TYPE_PPC_COMMON; + *id = PSTORE_TYPE_PPC_COMMON; + time->tv_sec = 0; + time->tv_nsec = 0; + break; default: return 0; } diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 73148aef9e31..08c3d76b24ca 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -330,6 +330,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_PPC_OF: sprintf(name, "powerpc-ofw-%s-%lld", psname, id); break; + case PSTORE_TYPE_PPC_COMMON: + sprintf(name, "powerpc-common-%s-%lld", psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, "unknown-%s-%lld", psname, id); break; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 615dc18638b8..656699fcc7d7 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -38,6 +38,7 @@ enum pstore_type_id { /* PPC64 partition types */ PSTORE_TYPE_PPC_RTAS = 4, PSTORE_TYPE_PPC_OF = 5, + PSTORE_TYPE_PPC_COMMON = 6, PSTORE_TYPE_UNKNOWN = 255 }; From 04ae9001719c3f0012d239a7c5aa4136f6b6541d Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sun, 9 Jun 2013 17:00:42 +1000 Subject: [PATCH 037/156] powerpc/math-emu: Fix decoding of some instructions The decoding of some instructions such as fsqrt{s} was incorrect, using the wrong registers, and thus could not work. This fixes it and also adds a couple of place holders for missing instructions. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/math-emu/Makefile | 3 ++- arch/powerpc/math-emu/fre.c | 11 +++++++++++ arch/powerpc/math-emu/frsqrtes.c | 11 +++++++++++ arch/powerpc/math-emu/math.c | 14 ++++++++++---- 4 files changed, 34 insertions(+), 5 deletions(-) create mode 100644 arch/powerpc/math-emu/fre.c create mode 100644 arch/powerpc/math-emu/frsqrtes.c diff --git a/arch/powerpc/math-emu/Makefile b/arch/powerpc/math-emu/Makefile index 7d1dba0d57f9..8d035d2d42a6 100644 --- a/arch/powerpc/math-emu/Makefile +++ b/arch/powerpc/math-emu/Makefile @@ -4,7 +4,8 @@ obj-$(CONFIG_MATH_EMULATION) += fabs.o fadd.o fadds.o fcmpo.o fcmpu.o \ fmadd.o fmadds.o fmsub.o fmsubs.o \ fmul.o fmuls.o fnabs.o fneg.o \ fnmadd.o fnmadds.o fnmsub.o fnmsubs.o \ - fres.o frsp.o frsqrte.o fsel.o lfs.o \ + fres.o fre.o frsp.o fsel.o lfs.o \ + frsqrte.o frsqrtes.o \ fsqrt.o fsqrts.o fsub.o fsubs.o \ mcrfs.o mffs.o mtfsb0.o mtfsb1.o \ mtfsf.o mtfsfi.o stfiwx.o stfs.o \ diff --git a/arch/powerpc/math-emu/fre.c b/arch/powerpc/math-emu/fre.c new file mode 100644 index 000000000000..49ccf2cc6a5a --- /dev/null +++ b/arch/powerpc/math-emu/fre.c @@ -0,0 +1,11 @@ +#include +#include +#include + +int fre(void *frD, void *frB) +{ +#ifdef DEBUG + printk("%s: %p %p\n", __func__, frD, frB); +#endif + return -ENOSYS; +} diff --git a/arch/powerpc/math-emu/frsqrtes.c b/arch/powerpc/math-emu/frsqrtes.c new file mode 100644 index 000000000000..7e838e380314 --- /dev/null +++ b/arch/powerpc/math-emu/frsqrtes.c @@ -0,0 +1,11 @@ +#include +#include +#include + +int frsqrtes(void *frD, void *frB) +{ +#ifdef DEBUG + printk("%s: %p %p\n", __func__, frD, frB); +#endif + return 0; +} diff --git a/arch/powerpc/math-emu/math.c b/arch/powerpc/math-emu/math.c index 164d55935bd8..0328e66e0799 100644 --- a/arch/powerpc/math-emu/math.c +++ b/arch/powerpc/math-emu/math.c @@ -58,8 +58,10 @@ FLOATFUNC(fnabs); FLOATFUNC(fneg); /* Optional */ +FLOATFUNC(fre); FLOATFUNC(fres); FLOATFUNC(frsqrte); +FLOATFUNC(frsqrtes); FLOATFUNC(fsel); FLOATFUNC(fsqrt); FLOATFUNC(fsqrts); @@ -97,6 +99,7 @@ FLOATFUNC(fsqrts); #define FSQRTS 0x016 /* 22 */ #define FRES 0x018 /* 24 */ #define FMULS 0x019 /* 25 */ +#define FRSQRTES 0x01a /* 26 */ #define FMSUBS 0x01c /* 28 */ #define FMADDS 0x01d /* 29 */ #define FNMSUBS 0x01e /* 30 */ @@ -109,6 +112,7 @@ FLOATFUNC(fsqrts); #define FADD 0x015 /* 21 */ #define FSQRT 0x016 /* 22 */ #define FSEL 0x017 /* 23 */ +#define FRE 0x018 /* 24 */ #define FMUL 0x019 /* 25 */ #define FRSQRTE 0x01a /* 26 */ #define FMSUB 0x01c /* 28 */ @@ -299,9 +303,10 @@ do_mathemu(struct pt_regs *regs) case FDIVS: func = fdivs; type = AB; break; case FSUBS: func = fsubs; type = AB; break; case FADDS: func = fadds; type = AB; break; - case FSQRTS: func = fsqrts; type = AB; break; - case FRES: func = fres; type = AB; break; + case FSQRTS: func = fsqrts; type = XB; break; + case FRES: func = fres; type = XB; break; case FMULS: func = fmuls; type = AC; break; + case FRSQRTES: func = frsqrtes;type = XB; break; case FMSUBS: func = fmsubs; type = ABC; break; case FMADDS: func = fmadds; type = ABC; break; case FNMSUBS: func = fnmsubs; type = ABC; break; @@ -317,10 +322,11 @@ do_mathemu(struct pt_regs *regs) case FDIV: func = fdiv; type = AB; break; case FSUB: func = fsub; type = AB; break; case FADD: func = fadd; type = AB; break; - case FSQRT: func = fsqrt; type = AB; break; + case FSQRT: func = fsqrt; type = XB; break; + case FRE: func = fre; type = XB; break; case FSEL: func = fsel; type = ABC; break; case FMUL: func = fmul; type = AC; break; - case FRSQRTE: func = frsqrte; type = AB; break; + case FRSQRTE: func = frsqrte; type = XB; break; case FMSUB: func = fmsub; type = ABC; break; case FMADD: func = fmadd; type = ABC; break; case FNMSUB: func = fnmsub; type = ABC; break; From 4e63f8edfe4d6f20b1af176efc022c2b2f5e7aeb Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sun, 9 Jun 2013 17:01:24 +1000 Subject: [PATCH 038/156] powerpc/math-emu: Allow math-emu to be used for HW FPU (Including 64-bit ones) This allow SW emulation by the kernel of optional instructions such as fsqrt which aren't implemented on some processors, and thus fixes some Fedora 19 issues such as Anaconda since the compiler is set to generate those by default on 64-bit. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig | 6 +++++- arch/powerpc/kernel/traps.c | 12 +++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c33e3ad2c8fd..e3009abc7f75 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -298,7 +298,7 @@ config HUGETLB_PAGE_SIZE_VARIABLE config MATH_EMULATION bool "Math emulation" - depends on 4xx || 8xx || E200 || PPC_MPC832x || E500 + depends on 4xx || 8xx || PPC_MPC832x || BOOKE ---help--- Some PowerPC chips designed for embedded applications do not have a floating-point unit and therefore do not implement the @@ -307,6 +307,10 @@ config MATH_EMULATION unit, which will allow programs that use floating-point instructions to run. + This is also useful to emulate missing (optional) instructions + such as fsqrt on cores that do have an FPU but do not implement + them (such as Freescale BookE). + config PPC_TRANSACTIONAL_MEM bool "Transactional Memory support for POWERPC" depends on PPC_BOOK3S_64 diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index f18c79c324ef..f4b5687b0c66 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1125,7 +1125,17 @@ void __kprobes program_check_exception(struct pt_regs *regs) * ESR_DST (!?) or 0. In the process of chasing this with the * hardware people - not sure if it can happen on any illegal * instruction or only on FP instructions, whether there is a - * pattern to occurrences etc. -dgibson 31/Mar/2003 */ + * pattern to occurrences etc. -dgibson 31/Mar/2003 + */ + + /* + * If we support a HW FPU, we need to ensure the FP state + * if flushed into the thread_struct before attempting + * emulation + */ +#ifdef CONFIG_PPC_FPU + flush_fp_to_thread(current); +#endif switch (do_mathemu(regs)) { case 0: emulate_single_step(regs); From 968219fa334ce8fed3421dd12ea12e9f562c95cb Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sun, 9 Jun 2013 17:04:58 +1000 Subject: [PATCH 039/156] powerpc/8xx: Remove 8xx specific "minimal FPU emulation" This is duplicated code from math-emu and implements such a small subset of the FPU (load/stores/fmr) that it's essentially pointless nowdays. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig | 11 ----------- arch/powerpc/kernel/traps.c | 22 +--------------------- 2 files changed, 1 insertion(+), 32 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e3009abc7f75..5374776b4c7c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -319,17 +319,6 @@ config PPC_TRANSACTIONAL_MEM ---help--- Support user-mode Transactional Memory on POWERPC. -config 8XX_MINIMAL_FPEMU - bool "Minimal math emulation for 8xx" - depends on 8xx && !MATH_EMULATION - help - Older arch/ppc kernels still emulated a few floating point - instructions such as load and store, even when full math - emulation is disabled. Say "Y" here if you want to preserve - this behavior. - - It is recommended that you build a soft-float userspace instead. - config IOMMU_HELPER def_bool PPC64 diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index f4b5687b0c66..071f6e040eb2 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1396,8 +1396,7 @@ void performance_monitor_exception(struct pt_regs *regs) void SoftwareEmulation(struct pt_regs *regs) { extern int do_mathemu(struct pt_regs *); - extern int Soft_emulate_8xx(struct pt_regs *); -#if defined(CONFIG_MATH_EMULATION) || defined(CONFIG_8XX_MINIMAL_FPEMU) +#if defined(CONFIG_MATH_EMULATION) int errcode; #endif @@ -1430,23 +1429,6 @@ void SoftwareEmulation(struct pt_regs *regs) _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); return; } - -#elif defined(CONFIG_8XX_MINIMAL_FPEMU) - errcode = Soft_emulate_8xx(regs); - if (errcode >= 0) - PPC_WARN_EMULATED(8xx, regs); - - switch (errcode) { - case 0: - emulate_single_step(regs); - return; - case 1: - _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); - return; - case -EFAULT: - _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip); - return; - } #else _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); #endif @@ -1796,8 +1778,6 @@ struct ppc_emulated ppc_emulated = { WARN_EMULATED_SETUP(unaligned), #ifdef CONFIG_MATH_EMULATION WARN_EMULATED_SETUP(math), -#elif defined(CONFIG_8XX_MINIMAL_FPEMU) - WARN_EMULATED_SETUP(8xx), #endif #ifdef CONFIG_VSX WARN_EMULATED_SETUP(vsx), From 1d25f11fdbcc5390d68efd98c28900bfd29b264c Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Sun, 9 Jun 2013 21:23:15 +1000 Subject: [PATCH 040/156] powerpc/tm: Fix writing top half of MSR on 32 bit signals The MSR TM controls are in the top 32 bits of the MSR hence on 32 bit signals, we stick the top half of the MSR in the checkpointed signal context so that the user can access it. Unfortunately, we don't currently write anything to the checkpointed signal context when coming in a from a non transactional process and hence the top MSR bits can contain junk. This updates the 32 bit signal handling code to always write something to the top MSR bits so that users know if the process is transactional or not and the kernel can use it on signal return. Signed-off-by: Michael Neuling cc: stable@vger.kernel.org (v3.9+) Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/signal_32.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 201385c3a1ae..5bc819f50af6 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -407,7 +407,8 @@ inline unsigned long copy_transact_fpr_from_user(struct task_struct *task, * altivec/spe instructions at some point. */ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, - int sigret, int ctx_has_vsx_region) + struct mcontext __user *tm_frame, int sigret, + int ctx_has_vsx_region) { unsigned long msr = regs->msr; @@ -475,6 +476,12 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, if (__put_user(msr, &frame->mc_gregs[PT_MSR])) return 1; + /* We need to write 0 the MSR top 32 bits in the tm frame so that we + * can check it on the restore to see if TM is active + */ + if (tm_frame && __put_user(0, &tm_frame->mc_gregs[PT_MSR])) + return 1; + if (sigret) { /* Set up the sigreturn trampoline: li r0,sigret; sc */ if (__put_user(0x38000000UL + sigret, &frame->tramp[0]) @@ -952,6 +959,7 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, { struct rt_sigframe __user *rt_sf; struct mcontext __user *frame; + struct mcontext __user *tm_frame = NULL; void __user *addr; unsigned long newsp = 0; int sigret; @@ -985,23 +993,24 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM + tm_frame = &rt_sf->uc_transact.uc_mcontext; if (MSR_TM_ACTIVE(regs->msr)) { - if (save_tm_user_regs(regs, &rt_sf->uc.uc_mcontext, - &rt_sf->uc_transact.uc_mcontext, sigret)) + if (save_tm_user_regs(regs, frame, tm_frame, sigret)) goto badframe; } else #endif - if (save_user_regs(regs, frame, sigret, 1)) + { + if (save_user_regs(regs, frame, tm_frame, sigret, 1)) goto badframe; + } regs->link = tramp; #ifdef CONFIG_PPC_TRANSACTIONAL_MEM if (MSR_TM_ACTIVE(regs->msr)) { if (__put_user((unsigned long)&rt_sf->uc_transact, &rt_sf->uc.uc_link) - || __put_user(to_user_ptr(&rt_sf->uc_transact.uc_mcontext), - &rt_sf->uc_transact.uc_regs)) + || __put_user((unsigned long)tm_frame, &rt_sf->uc_transact.uc_regs)) goto badframe; } else @@ -1170,7 +1179,7 @@ long sys_swapcontext(struct ucontext __user *old_ctx, mctx = (struct mcontext __user *) ((unsigned long) &old_ctx->uc_mcontext & ~0xfUL); if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size) - || save_user_regs(regs, mctx, 0, ctx_has_vsx_region) + || save_user_regs(regs, mctx, NULL, 0, ctx_has_vsx_region) || put_sigset_t(&old_ctx->uc_sigmask, ¤t->blocked) || __put_user(to_user_ptr(mctx), &old_ctx->uc_regs)) return -EFAULT; @@ -1392,6 +1401,7 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka, { struct sigcontext __user *sc; struct sigframe __user *frame; + struct mcontext __user *tm_mctx = NULL; unsigned long newsp = 0; int sigret; unsigned long tramp; @@ -1425,6 +1435,7 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka, } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM + tm_mctx = &frame->mctx_transact; if (MSR_TM_ACTIVE(regs->msr)) { if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact, sigret)) @@ -1432,8 +1443,10 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka, } else #endif - if (save_user_regs(regs, &frame->mctx, sigret, 1)) + { + if (save_user_regs(regs, &frame->mctx, tm_mctx, sigret, 1)) goto badframe; + } regs->link = tramp; From fee55450710dff32a13ae30b4129ec7b5a4b44d0 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Sun, 9 Jun 2013 21:23:16 +1000 Subject: [PATCH 041/156] powerpc/tm: Fix 32 bit non-rt signals Currently sys_sigreturn() is TM unaware. Therefore, if we take a 32 bit signal without SIGINFO (non RT) inside a transaction, on signal return we don't restore the signal frame correctly. This checks if the signal frame being restoring is an active transaction, and if so, it copies the additional state to ptregs so it can be restored. Signed-off-by: Michael Neuling cc: stable@vger.kernel.org (v3.9+) Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/signal_32.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 5bc819f50af6..fa81462f6987 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -1494,16 +1494,22 @@ badframe: long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, struct pt_regs *regs) { + struct sigframe __user *sf; struct sigcontext __user *sc; struct sigcontext sigctx; struct mcontext __user *sr; void __user *addr; sigset_t set; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + struct mcontext __user *mcp, *tm_mcp; + unsigned long msr_hi; +#endif /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; - sc = (struct sigcontext __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); + sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); + sc = &sf->sctx; addr = sc; if (copy_from_user(&sigctx, sc, sizeof(sigctx))) goto badframe; @@ -1520,11 +1526,25 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, #endif set_current_blocked(&set); - sr = (struct mcontext __user *)from_user_ptr(sigctx.regs); - addr = sr; - if (!access_ok(VERIFY_READ, sr, sizeof(*sr)) - || restore_user_regs(regs, sr, 1)) +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + mcp = (struct mcontext __user *)&sf->mctx; + tm_mcp = (struct mcontext __user *)&sf->mctx_transact; + if (__get_user(msr_hi, &tm_mcp->mc_gregs[PT_MSR])) goto badframe; + if (MSR_TM_ACTIVE(msr_hi<<32)) { + if (!cpu_has_feature(CPU_FTR_TM)) + goto badframe; + if (restore_tm_user_regs(regs, mcp, tm_mcp)) + goto badframe; + } else +#endif + { + sr = (struct mcontext __user *)from_user_ptr(sigctx.regs); + addr = sr; + if (!access_ok(VERIFY_READ, sr, sizeof(*sr)) + || restore_user_regs(regs, sr, 1)) + goto badframe; + } set_thread_flag(TIF_RESTOREALL); return 0; From 2c27a18f8736da047bef2b997bdd48efc667e3c9 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Sun, 9 Jun 2013 21:23:17 +1000 Subject: [PATCH 042/156] powerpc/tm: Fix restoration of MSR on 32bit signal return Currently we clear out the MSR TM bits on signal return assuming that the signal should never return to an active transaction. This is bogus as the user may do this. It's most likely the transaction will be doomed due to a treclaim but that's a problem for the HW not the kernel. The current code is a legacy of earlier kernel implementations which did software rollback of active transactions in the kernel. That code has now gone but we didn't correctly fix up this part of the signals code which still makes the assumption that it must be returning to a suspended transaction. This pulls out both MSR TM bits from the user supplied context rather than just setting TM suspend. We pull out only the bits needed to ensure the user can't do anything dangerous to the MSR. Signed-off-by: Michael Neuling cc: stable@vger.kernel.org (v3.9+) Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/signal_32.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index fa81462f6987..364cb1e7300e 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -754,7 +754,7 @@ static long restore_tm_user_regs(struct pt_regs *regs, struct mcontext __user *tm_sr) { long err; - unsigned long msr; + unsigned long msr, msr_hi; #ifdef CONFIG_VSX int i; #endif @@ -859,8 +859,11 @@ static long restore_tm_user_regs(struct pt_regs *regs, tm_enable(); /* This loads the checkpointed FP/VEC state, if used */ tm_recheckpoint(¤t->thread, msr); - /* The task has moved into TM state S, so ensure MSR reflects this */ - regs->msr = (regs->msr & ~MSR_TS_MASK) | MSR_TS_S; + /* Get the top half of the MSR */ + if (__get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR])) + return 1; + /* Pull in MSR TM from user context */ + regs->msr = (regs->msr & ~MSR_TS_MASK) | ((msr_hi<<32) & MSR_TS_MASK); /* This loads the speculative FP/VEC state, if used */ if (msr & MSR_FP) { From 55e4341850ac56e63a3eefe9583a9000042164fa Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Sun, 9 Jun 2013 21:23:18 +1000 Subject: [PATCH 043/156] powerpc/tm: Fix return of 32bit rt signals to active transactions Currently we only restore signals which are transactionally suspended but it's possible that the transaction can be restored even when it's active. Most likely this will result in a transactional rollback by the hardware as the transaction will have been doomed by an earlier treclaim. The current code is a legacy of earlier kernel implementations which did software rollback of active transactions in the kernel. That code has now gone but we didn't correctly fix up this part of the signals code which still makes assumptions based on having software rollback. This changes the signal return code to always restore both contexts on 32 bit rt signal return. Signed-off-by: Michael Neuling cc: stable@vger.kernel.org (v3.9+) Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/signal_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 364cb1e7300e..0f83122e6676 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -1245,7 +1245,7 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, if (__get_user(msr_hi, &mcp->mc_gregs[PT_MSR])) goto bad; - if (MSR_TM_SUSPENDED(msr_hi<<32)) { + if (MSR_TM_ACTIVE(msr_hi<<32)) { /* We only recheckpoint on return if we're * transaction. */ From 87b4e5393af77f5cba124638f19f6c426e210aec Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Sun, 9 Jun 2013 21:23:19 +1000 Subject: [PATCH 044/156] powerpc/tm: Fix return of active 64bit signals Currently we only restore signals which are transactionally suspended but it's possible that the transaction can be restored even when it's active. Most likely this will result in a transactional rollback by the hardware as the transaction will have been doomed by an earlier treclaim. The current code is a legacy of earlier kernel implementations which did software rollback of active transactions in the kernel. That code has now gone but we didn't correctly fix up this part of the signals code which still makes assumptions based on having software rollback. This changes the signal return code to always restore both contexts on 64 bit signal return. It also ensures that the MSR TM bits are properly restored from the signal context which they are not currently. Signed-off-by: Michael Neuling cc: stable@vger.kernel.org (v3.9+) Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/signal_64.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 345947367ec0..887e99d85bc2 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -410,6 +410,10 @@ static long restore_tm_sigcontexts(struct pt_regs *regs, /* get MSR separately, transfer the LE bit if doing signal return */ err |= __get_user(msr, &sc->gp_regs[PT_MSR]); + /* pull in MSR TM from user context */ + regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK); + + /* pull in MSR LE from user context */ regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); /* The following non-GPR non-FPR non-VR state is also checkpointed: */ @@ -505,8 +509,6 @@ static long restore_tm_sigcontexts(struct pt_regs *regs, tm_enable(); /* This loads the checkpointed FP/VEC state, if used */ tm_recheckpoint(¤t->thread, msr); - /* The task has moved into TM state S, so ensure MSR reflects this: */ - regs->msr = (regs->msr & ~MSR_TS_MASK) | __MASK(33); /* This loads the speculative FP/VEC state, if used */ if (msr & MSR_FP) { @@ -654,7 +656,7 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5, #ifdef CONFIG_PPC_TRANSACTIONAL_MEM if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR])) goto badframe; - if (MSR_TM_SUSPENDED(msr)) { + if (MSR_TM_ACTIVE(msr)) { /* We recheckpoint on return. */ struct ucontext __user *uc_transact; if (__get_user(uc_transact, &uc->uc_link)) From a84f273c30500c0d5816e07232e3326a61956016 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:20:51 +0800 Subject: [PATCH 045/156] powerpc/eeh: Cleanup for EEH core Cleanup on EEH core to remove unnecessary whitespaces. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/eeh.c | 22 ++++++++++----------- arch/powerpc/platforms/pseries/eeh_driver.c | 14 ++++++------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index 6b73d6c44f51..8a8345193083 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -368,7 +368,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) } eeh_stats.slot_resets++; - + /* Avoid repeated reports of this failure, including problems * with other functions on this device, and functions under * bridges. @@ -525,7 +525,7 @@ static void eeh_reset_pe_once(struct eeh_pe *pe) * or a fundamental reset (3). * A fundamental reset required by any device under * Partitionable Endpoint trumps hot-reset. - */ + */ eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset); if (freset) @@ -538,8 +538,8 @@ static void eeh_reset_pe_once(struct eeh_pe *pe) */ #define PCI_BUS_RST_HOLD_TIME_MSEC 250 msleep(PCI_BUS_RST_HOLD_TIME_MSEC); - - /* We might get hit with another EEH freeze as soon as the + + /* We might get hit with another EEH freeze as soon as the * pci slot reset line is dropped. Make sure we don't miss * these, and clear the flag now. */ @@ -604,7 +604,7 @@ void eeh_save_bars(struct eeh_dev *edev) if (!edev) return; dn = eeh_dev_to_of_node(edev); - + for (i = 0; i < 16; i++) eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]); } @@ -803,12 +803,12 @@ void eeh_add_device_tree_late(struct pci_bus *bus) struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { - eeh_add_device_late(dev); - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { - struct pci_bus *subbus = dev->subordinate; - if (subbus) - eeh_add_device_tree_late(subbus); - } + eeh_add_device_late(dev); + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { + struct pci_bus *subbus = dev->subordinate; + if (subbus) + eeh_add_device_tree_late(subbus); + } } } EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c index a3fefb61097c..0acc5a2bdfc7 100644 --- a/arch/powerpc/platforms/pseries/eeh_driver.c +++ b/arch/powerpc/platforms/pseries/eeh_driver.c @@ -154,9 +154,9 @@ static void eeh_enable_irq(struct pci_dev *dev) * eeh_report_error - Report pci error to each device driver * @data: eeh device * @userdata: return value - * - * Report an EEH error to each device driver, collect up and - * merge the device driver responses. Cumulative response + * + * Report an EEH error to each device driver, collect up and + * merge the device driver responses. Cumulative response * passed back in "userdata". */ static void *eeh_report_error(void *data, void *userdata) @@ -376,9 +376,9 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) eeh_pe_restore_bars(pe); /* Give the system 5 seconds to finish running the user-space - * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, - * this is a hack, but if we don't do this, and try to bring - * the device up before the scripts have taken it down, + * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, + * this is a hack, but if we don't do this, and try to bring + * the device up before the scripts have taken it down, * potentially weird things happen. */ if (bus) { @@ -520,7 +520,7 @@ void eeh_handle_event(struct eeh_pe *pe) eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); return; - + excess_failures: /* * About 90% of all real-life EEH failures in the field From 317f06de78152e0eb0aab5881d69e4c5cdf9f1fe Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:20:52 +0800 Subject: [PATCH 046/156] powerpc/eeh: Move common part to kernel directory The patch moves the common part of EEH core into arch/powerpc/kernel directory so that we needn't PPC_PSERIES while compiling POWERNV platform: * Move the EEH common part into arch/powerpc/kernel * Move the functions for PCI hotplug from pSeries platform to arch/powerpc/kernel/pci-hotplug.c * Move CONFIG_EEH from arch/powerpc/platforms/pseries/Kconfig to arch/powerpc/platforms/Kconfig * Adjust makefile accordingly Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/Makefile | 4 +- .../{platforms/pseries => kernel}/eeh.c | 0 .../{platforms/pseries => kernel}/eeh_cache.c | 1 - .../{platforms/pseries => kernel}/eeh_dev.c | 0 .../pseries => kernel}/eeh_driver.c | 1 - .../{platforms/pseries => kernel}/eeh_event.c | 0 .../{platforms/pseries => kernel}/eeh_pe.c | 0 .../{platforms/pseries => kernel}/eeh_sysfs.c | 1 - arch/powerpc/kernel/pci-hotplug.c | 111 ++++++++++++++++++ arch/powerpc/platforms/Kconfig | 5 + arch/powerpc/platforms/pseries/Kconfig | 5 - arch/powerpc/platforms/pseries/Makefile | 4 +- arch/powerpc/platforms/pseries/pci_dlpar.c | 85 -------------- 13 files changed, 120 insertions(+), 97 deletions(-) rename arch/powerpc/{platforms/pseries => kernel}/eeh.c (100%) rename arch/powerpc/{platforms/pseries => kernel}/eeh_cache.c (99%) rename arch/powerpc/{platforms/pseries => kernel}/eeh_dev.c (100%) rename arch/powerpc/{platforms/pseries => kernel}/eeh_driver.c (99%) rename arch/powerpc/{platforms/pseries => kernel}/eeh_event.c (100%) rename arch/powerpc/{platforms/pseries => kernel}/eeh_pe.c (100%) rename arch/powerpc/{platforms/pseries => kernel}/eeh_sysfs.c (99%) create mode 100644 arch/powerpc/kernel/pci-hotplug.c diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index f960a7944553..a8619bfe879e 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -58,6 +58,8 @@ obj-$(CONFIG_RTAS_PROC) += rtas-proc.o obj-$(CONFIG_LPARCFG) += lparcfg.o obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o +obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \ + eeh_driver.o eeh_event.o eeh_sysfs.o obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_FA_DUMP) += fadump.o @@ -100,7 +102,7 @@ obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o -pci64-$(CONFIG_PPC64) += pci_dn.o isa-bridge.o +pci64-$(CONFIG_PPC64) += pci_dn.o pci-hotplug.o isa-bridge.o obj-$(CONFIG_PCI) += pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \ pci-common.o pci_of_scan.o obj-$(CONFIG_PCI_MSI) += msi.o diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/kernel/eeh.c similarity index 100% rename from arch/powerpc/platforms/pseries/eeh.c rename to arch/powerpc/kernel/eeh.c diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c similarity index 99% rename from arch/powerpc/platforms/pseries/eeh_cache.c rename to arch/powerpc/kernel/eeh_cache.c index 5a4c87903057..1d5d9a6ba740 100644 --- a/arch/powerpc/platforms/pseries/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -316,4 +316,3 @@ void __init eeh_addr_cache_build(void) eeh_addr_cache_print(&pci_io_addr_cache_root); #endif } - diff --git a/arch/powerpc/platforms/pseries/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c similarity index 100% rename from arch/powerpc/platforms/pseries/eeh_dev.c rename to arch/powerpc/kernel/eeh_dev.c diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c similarity index 99% rename from arch/powerpc/platforms/pseries/eeh_driver.c rename to arch/powerpc/kernel/eeh_driver.c index 0acc5a2bdfc7..fb927af9a9ef 100644 --- a/arch/powerpc/platforms/pseries/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -549,4 +549,3 @@ perm_error: if (frozen_bus) pcibios_remove_pci_devices(frozen_bus); } - diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/kernel/eeh_event.c similarity index 100% rename from arch/powerpc/platforms/pseries/eeh_event.c rename to arch/powerpc/kernel/eeh_event.c diff --git a/arch/powerpc/platforms/pseries/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c similarity index 100% rename from arch/powerpc/platforms/pseries/eeh_pe.c rename to arch/powerpc/kernel/eeh_pe.c diff --git a/arch/powerpc/platforms/pseries/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c similarity index 99% rename from arch/powerpc/platforms/pseries/eeh_sysfs.c rename to arch/powerpc/kernel/eeh_sysfs.c index d37708360f2e..e7ae3484918c 100644 --- a/arch/powerpc/platforms/pseries/eeh_sysfs.c +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -72,4 +72,3 @@ void eeh_sysfs_remove_device(struct pci_dev *pdev) device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr); device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); } - diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c new file mode 100644 index 000000000000..3f608800c06b --- /dev/null +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -0,0 +1,111 @@ +/* + * Derived from "arch/powerpc/platforms/pseries/pci_dlpar.c" + * + * Copyright (C) 2003 Linda Xie + * Copyright (C) 2005 International Business Machines + * + * Updates, 2005, John Rose + * Updates, 2005, Linas Vepstas + * Updates, 2013, Gavin Shan + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +/** + * __pcibios_remove_pci_devices - remove all devices under this bus + * @bus: the indicated PCI bus + * @purge_pe: destroy the PE on removal of PCI devices + * + * Remove all of the PCI devices under this bus both from the + * linux pci device tree, and from the powerpc EEH address cache. + * By default, the corresponding PE will be destroied during the + * normal PCI hotplug path. For PCI hotplug during EEH recovery, + * the corresponding PE won't be destroied and deallocated. + */ +void __pcibios_remove_pci_devices(struct pci_bus *bus, int purge_pe) +{ + struct pci_dev *dev, *tmp; + struct pci_bus *child_bus; + + /* First go down child busses */ + list_for_each_entry(child_bus, &bus->children, node) + __pcibios_remove_pci_devices(child_bus, purge_pe); + + pr_debug("PCI: Removing devices on bus %04x:%02x\n", + pci_domain_nr(bus), bus->number); + list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) { + pr_debug(" * Removing %s...\n", pci_name(dev)); + eeh_remove_bus_device(dev, purge_pe); + pci_stop_and_remove_bus_device(dev); + } +} + +/** + * pcibios_remove_pci_devices - remove all devices under this bus + * @bus: the indicated PCI bus + * + * Remove all of the PCI devices under this bus both from the + * linux pci device tree, and from the powerpc EEH address cache. + */ +void pcibios_remove_pci_devices(struct pci_bus *bus) +{ + __pcibios_remove_pci_devices(bus, 1); +} +EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); + +/** + * pcibios_add_pci_devices - adds new pci devices to bus + * @bus: the indicated PCI bus + * + * This routine will find and fixup new pci devices under + * the indicated bus. This routine presumes that there + * might already be some devices under this bridge, so + * it carefully tries to add only new devices. (And that + * is how this routine differs from other, similar pcibios + * routines.) + */ +void pcibios_add_pci_devices(struct pci_bus * bus) +{ + int slotno, num, mode, pass, max; + struct pci_dev *dev; + struct device_node *dn = pci_bus_to_OF_node(bus); + + eeh_add_device_tree_early(dn); + + mode = PCI_PROBE_NORMAL; + if (ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + + if (mode == PCI_PROBE_DEVTREE) { + /* use ofdt-based probe */ + of_rescan_bus(dn, bus); + } else if (mode == PCI_PROBE_NORMAL) { + /* use legacy probe */ + slotno = PCI_SLOT(PCI_DN(dn->child)->devfn); + num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0)); + if (!num) + return; + pcibios_setup_bus_devices(bus); + max = bus->busn_res.start; + for (pass = 0; pass < 2; pass++) { + list_for_each_entry(dev, &bus->devices, bus_list) { + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || + dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) + max = pci_scan_bridge(bus, dev, + max, pass); + } + } + } + pcibios_finish_adding_to_bus(bus); +} +EXPORT_SYMBOL_GPL(pcibios_add_pci_devices); diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index b62aab3e22ec..bed8c607588c 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -164,6 +164,11 @@ config IBMEBUS help Bus device driver for GX bus based adapters. +config EEH + bool + depends on (PPC_POWERNV || PPC_PSERIES) && PCI + default y + config PPC_MPC106 bool default n diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 4459eff7a75a..1bd3399146ed 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -33,11 +33,6 @@ config PPC_SPLPAR processors, that is, which share physical processors between two or more partitions. -config EEH - bool - depends on PPC_PSERIES && PCI - default y - config PSERIES_MSI bool depends on PCI_MSI && EEH diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 53866e537a92..8ae010381316 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -6,9 +6,7 @@ obj-y := lpar.o hvCall.o nvram.o reconfig.o \ firmware.o power.o dlpar.o mobility.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SCANLOG) += scanlog.o -obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \ - eeh_driver.o eeh_event.o eeh_sysfs.o \ - eeh_pseries.o +obj-$(CONFIG_EEH) += eeh_pseries.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_PCI) += pci.o pci_dlpar.o obj-$(CONFIG_PSERIES_MSI) += msi.o diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index c91b22be9288..efe61374f6ea 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -64,91 +64,6 @@ pcibios_find_pci_bus(struct device_node *dn) } EXPORT_SYMBOL_GPL(pcibios_find_pci_bus); -/** - * __pcibios_remove_pci_devices - remove all devices under this bus - * @bus: the indicated PCI bus - * @purge_pe: destroy the PE on removal of PCI devices - * - * Remove all of the PCI devices under this bus both from the - * linux pci device tree, and from the powerpc EEH address cache. - * By default, the corresponding PE will be destroied during the - * normal PCI hotplug path. For PCI hotplug during EEH recovery, - * the corresponding PE won't be destroied and deallocated. - */ -void __pcibios_remove_pci_devices(struct pci_bus *bus, int purge_pe) -{ - struct pci_dev *dev, *tmp; - struct pci_bus *child_bus; - - /* First go down child busses */ - list_for_each_entry(child_bus, &bus->children, node) - __pcibios_remove_pci_devices(child_bus, purge_pe); - - pr_debug("PCI: Removing devices on bus %04x:%02x\n", - pci_domain_nr(bus), bus->number); - list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) { - pr_debug(" * Removing %s...\n", pci_name(dev)); - eeh_remove_bus_device(dev, purge_pe); - pci_stop_and_remove_bus_device(dev); - } -} - -/** - * pcibios_remove_pci_devices - remove all devices under this bus - * - * Remove all of the PCI devices under this bus both from the - * linux pci device tree, and from the powerpc EEH address cache. - */ -void pcibios_remove_pci_devices(struct pci_bus *bus) -{ - __pcibios_remove_pci_devices(bus, 1); -} -EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); - -/** - * pcibios_add_pci_devices - adds new pci devices to bus - * - * This routine will find and fixup new pci devices under - * the indicated bus. This routine presumes that there - * might already be some devices under this bridge, so - * it carefully tries to add only new devices. (And that - * is how this routine differs from other, similar pcibios - * routines.) - */ -void pcibios_add_pci_devices(struct pci_bus * bus) -{ - int slotno, num, mode, pass, max; - struct pci_dev *dev; - struct device_node *dn = pci_bus_to_OF_node(bus); - - eeh_add_device_tree_early(dn); - - mode = PCI_PROBE_NORMAL; - if (ppc_md.pci_probe_mode) - mode = ppc_md.pci_probe_mode(bus); - - if (mode == PCI_PROBE_DEVTREE) { - /* use ofdt-based probe */ - of_rescan_bus(dn, bus); - } else if (mode == PCI_PROBE_NORMAL) { - /* use legacy probe */ - slotno = PCI_SLOT(PCI_DN(dn->child)->devfn); - num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0)); - if (!num) - return; - pcibios_setup_bus_devices(bus); - max = bus->busn_res.start; - for (pass=0; pass < 2; pass++) - list_for_each_entry(dev, &bus->devices, bus_list) { - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || - dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) - max = pci_scan_bridge(bus, dev, max, pass); - } - } - pcibios_finish_adding_to_bus(bus); -} -EXPORT_SYMBOL_GPL(pcibios_add_pci_devices); - struct pci_controller *init_phb_dynamic(struct device_node *dn) { struct pci_controller *phb; From 9ff67433cef319a02cb64dbe3f710a8566ebfcee Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:20:53 +0800 Subject: [PATCH 047/156] powerpc/eeh: Make eeh_phb_pe_get() public One of the possible cases indicated by P7IOC interrupt is fenced PHB. For that case, we need fetch the PE corresponding to the PHB and disable the PHB and all subordinate PCI buses/devices, recover from the fenced state and eventually enable the whole PHB. We need one function to fetch the PHB PE outside eeh_pe.c and the patch is going to make eeh_phb_pe_get() public for that purpose. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/eeh.h | 1 + arch/powerpc/kernel/eeh_pe.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index e32c3c53eb8b..4ac6f70025b4 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -184,6 +184,7 @@ static inline void eeh_unlock(void) typedef void *(*eeh_traverse_func)(void *data, void *flag); int eeh_phb_pe_create(struct pci_controller *phb); +struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); int eeh_add_to_parent_pe(struct eeh_dev *edev); int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe); void *eeh_pe_dev_traverse(struct eeh_pe *root, diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 9d4a9e8562b2..71c4544453cf 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -95,7 +95,7 @@ int eeh_phb_pe_create(struct pci_controller *phb) * hierarchy tree is composed of PHB PEs. The function is used * to retrieve the corresponding PHB PE according to the given PHB. */ -static struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb) +struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb) { struct eeh_pe *pe; From 01566808547b7ecc5017a741d50dead9e53f86fa Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:20:54 +0800 Subject: [PATCH 048/156] powerpc/eeh: Make eeh_pe_get() public While processing EEH event interrupt from P7IOC, we need function to retrieve the PE according to the indicated EEH device. The patch makes function eeh_pe_get() public so that other source files can call it for that purpose. Also, the patch fixes referring to wrong BDF (Bus/Device/Function) address while searching PE in function __eeh_pe_get(). Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/eeh.h | 1 + arch/powerpc/kernel/eeh_pe.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 4ac6f70025b4..acdfcaaf5c36 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -185,6 +185,7 @@ static inline void eeh_unlock(void) typedef void *(*eeh_traverse_func)(void *data, void *flag); int eeh_phb_pe_create(struct pci_controller *phb); struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); +struct eeh_pe *eeh_pe_get(struct eeh_dev *edev); int eeh_add_to_parent_pe(struct eeh_dev *edev); int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe); void *eeh_pe_dev_traverse(struct eeh_pe *root, diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 71c4544453cf..3d2dcf5afc29 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -228,7 +228,7 @@ static void *__eeh_pe_get(void *data, void *flag) return pe; /* Try BDF address */ - if (edev->pe_config_addr && + if (edev->config_addr && (edev->config_addr == pe->config_addr)) return pe; @@ -246,7 +246,7 @@ static void *__eeh_pe_get(void *data, void *flag) * which is composed of PCI bus/device/function number, or unified * PE address. */ -static struct eeh_pe *eeh_pe_get(struct eeh_dev *edev) +struct eeh_pe *eeh_pe_get(struct eeh_dev *edev) { struct eeh_pe *root = eeh_phb_pe_get(edev->phb); struct eeh_pe *pe; From 8cdb283371241d298332c44d7c722e26d9858ec1 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:20:55 +0800 Subject: [PATCH 049/156] powerpc/eeh: Trace PCI bus from PE There're several types of PEs can be supported for now: PHB, Bus and Device dependent PE. For PCI bus dependent PE, tracing the corresponding PCI bus from PE (struct eeh_pe) would make the code more efficient. The patch also enables the retrieval of PCI bus based on the PCI bus dependent PE. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/eeh.h | 1 + arch/powerpc/kernel/eeh_pe.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index acdfcaaf5c36..f3b49d6d5259 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -59,6 +59,7 @@ struct eeh_pe { int config_addr; /* Traditional PCI address */ int addr; /* PE configuration address */ struct pci_controller *phb; /* Associated PHB */ + struct pci_bus *bus; /* Top PCI bus for bus PE */ int check_count; /* Times of ignored error */ int freeze_count; /* Times of froze up */ int false_positives; /* Times of reported #ff's */ diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 3d2dcf5afc29..c96366758acf 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -364,6 +364,17 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) pe->addr = edev->pe_config_addr; pe->config_addr = edev->config_addr; + /* + * While doing PE reset, we probably hot-reset the + * upstream bridge. However, the PCI devices including + * the associated EEH devices might be removed when EEH + * core is doing recovery. So that won't safe to retrieve + * the bridge through downstream EEH device. We have to + * trace the parent PCI bus, then the upstream bridge. + */ + if (eeh_probe_mode_dev()) + pe->bus = eeh_dev_to_pci_dev(edev)->bus; + /* * Put the new EEH PE into hierarchy tree. If the parent * can't be found, the newly created PE will be attached @@ -641,12 +652,18 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) bus = pe->phb->bus; } else if (pe->type & EEH_PE_BUS || pe->type & EEH_PE_DEVICE) { + if (pe->bus) { + bus = pe->bus; + goto out; + } + edev = list_first_entry(&pe->edevs, struct eeh_dev, list); pdev = eeh_dev_to_pci_dev(edev); if (pdev) bus = pdev->bus; } +out: eeh_unlock(); return bus; From 51fb5f563274de01e47ad2cbdd48018557926fe3 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:20:56 +0800 Subject: [PATCH 050/156] powerpc/eeh: Make eeh_init() public For EEH on PowerNV platform, we will do EEH probe based on the real PCI devices. The PCI devices are available after PCI probe. So we have to call eeh_init() explicitly on PowerNV platform after PCI probe. The patch also does EEH probe for PowerNV platform in eeh_init(). Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/eeh.h | 8 +++++++- arch/powerpc/kernel/eeh.c | 22 ++++++++++++++++++++-- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index f3b49d6d5259..beb3cbcb76f8 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -132,7 +132,7 @@ struct eeh_ops { char *name; int (*init)(void); void* (*of_probe)(struct device_node *dn, void *flag); - void* (*dev_probe)(struct pci_dev *dev, void *flag); + int (*dev_probe)(struct pci_dev *dev, void *flag); int (*set_option)(struct eeh_pe *pe, int option); int (*get_pe_addr)(struct eeh_pe *pe); int (*get_state)(struct eeh_pe *pe, int *state); @@ -196,6 +196,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe); void *eeh_dev_init(struct device_node *dn, void *data); void eeh_dev_phb_init_dynamic(struct pci_controller *phb); +int __init eeh_init(void); int __init eeh_ops_register(struct eeh_ops *ops); int __exit eeh_ops_unregister(const char *name); unsigned long eeh_check_failure(const volatile void __iomem *token, @@ -224,6 +225,11 @@ void eeh_remove_bus_device(struct pci_dev *, int); #else /* !CONFIG_EEH */ +static inline int eeh_init(void) +{ + return 0; +} + static inline void *eeh_dev_init(struct device_node *dn, void *data) { return NULL; diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 8a8345193083..c865c5f54b18 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -674,11 +674,21 @@ int __exit eeh_ops_unregister(const char *name) * Even if force-off is set, the EEH hardware is still enabled, so that * newer systems can boot. */ -static int __init eeh_init(void) +int __init eeh_init(void) { struct pci_controller *hose, *tmp; struct device_node *phb; - int ret; + static int cnt = 0; + int ret = 0; + + /* + * We have to delay the initialization on PowerNV after + * the PCI hierarchy tree has been built because the PEs + * are figured out based on PCI devices instead of device + * tree nodes + */ + if (machine_is(powernv) && cnt++ <= 0) + return ret; /* call platform initialization function */ if (!eeh_ops) { @@ -700,6 +710,14 @@ static int __init eeh_init(void) phb = hose->dn; traverse_pci_devices(phb, eeh_ops->of_probe, NULL); } + } else if (eeh_probe_mode_dev()) { + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) + pci_walk_bus(hose->bus, eeh_ops->dev_probe, NULL); + } else { + pr_warning("%s: Invalid probe mode %d\n", + __func__, eeh_probe_mode); + return -EINVAL; } if (eeh_subsystem_enabled) From 21fd21f59082c2538883a280e7f0e9b374cf6cec Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:20:57 +0800 Subject: [PATCH 051/156] powerpc/eeh: EEH post initialization operation The patch adds new EEH operation post_init. It's used to notify the platform that EEH core has completed the EEH probe. By that, PowerNV platform starts to use the services supplied by EEH functionality. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/eeh.h | 1 + arch/powerpc/kernel/eeh.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index beb3cbcb76f8..beec7883d93e 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -131,6 +131,7 @@ static inline struct pci_dev *eeh_dev_to_pci_dev(struct eeh_dev *edev) struct eeh_ops { char *name; int (*init)(void); + int (*post_init)(void); void* (*of_probe)(struct device_node *dn, void *flag); int (*dev_probe)(struct pci_dev *dev, void *flag); int (*set_option)(struct eeh_pe *pe, int option); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index c865c5f54b18..a29cf473fadf 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -720,6 +720,17 @@ int __init eeh_init(void) return -EINVAL; } + /* + * Call platform post-initialization. Actually, It's good chance + * to inform platform that EEH is ready to supply service if the + * I/O cache stuff has been built up. + */ + if (eeh_ops->post_init) { + ret = eeh_ops->post_init(); + if (ret) + return ret; + } + if (eeh_subsystem_enabled) pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); else From 326a98ea93b5d2bbf53db5c25533ca3a7c4cce65 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:20:58 +0800 Subject: [PATCH 052/156] powerpc/eeh: Refactor eeh_reset_pe_once() We shouldn't check that the returned PE status is exactly equal to (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE) but instead only check that they are both set. [benh: changelog] Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/eeh.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index a29cf473fadf..cda0b62d99f5 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -565,6 +565,7 @@ static void eeh_reset_pe_once(struct eeh_pe *pe) */ int eeh_reset_pe(struct eeh_pe *pe) { + int flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); int i, rc; /* Take three shots at resetting the bus */ @@ -572,7 +573,7 @@ int eeh_reset_pe(struct eeh_pe *pe) eeh_reset_pe_once(pe); rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); - if (rc == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) + if ((rc & flags) == flags) return 0; if (rc < 0) { From 26a74850b35d85a81155aa0e51211fbd6eecad25 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:20:59 +0800 Subject: [PATCH 053/156] powerpc/eeh: Delay EEH probe during hotplug While doing EEH recovery, the PCI devices of the problematic PE should be removed and then added to the system again. During the so-called hotplug event, the PCI devices of the problematic PE will be probed through early/late phase. We would delay EEH probe on late point for PowerNV platform since the PCI device isn't available in early phase. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/eeh.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index cda0b62d99f5..7d169d35b5b0 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -758,6 +758,14 @@ static void eeh_add_device_early(struct device_node *dn) { struct pci_controller *phb; + /* + * If we're doing EEH probe based on PCI device, we + * would delay the probe until late stage because + * the PCI device isn't available this moment. + */ + if (!eeh_probe_mode_devtree()) + return; + if (!of_node_to_eeh_dev(dn)) return; phb = of_node_to_eeh_dev(dn)->phb; @@ -766,7 +774,6 @@ static void eeh_add_device_early(struct device_node *dn) if (NULL == phb || 0 == phb->buid) return; - /* FIXME: hotplug support on POWERNV */ eeh_ops->of_probe(dn, NULL); } @@ -817,6 +824,13 @@ static void eeh_add_device_late(struct pci_dev *dev) edev->pdev = dev; dev->dev.archdata.edev = edev; + /* + * We have to do the EEH probe here because the PCI device + * hasn't been created yet in the early stage. + */ + if (eeh_probe_mode_dev()) + eeh_ops->dev_probe(dev, NULL); + eeh_addr_cache_insert_dev(dev); } From c86085580d5f60d2d3cea9c60d50e284558d3de7 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:00 +0800 Subject: [PATCH 054/156] powerpc/eeh: Single kthread to handle events We possiblly have multiple kthreads running for multiple EEH errors (events) and use one spinlock to make the process of handling those EEH events serialized. That's unnecessary and the patch creates only one kthread, which is started during EEH core initialization time in eeh_init(). A new semaphore introduced to count the number of existing EEH events in the queue and the kthread waiting on the semaphore. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/eeh_event.h | 1 + arch/powerpc/kernel/eeh.c | 5 ++ arch/powerpc/kernel/eeh_event.c | 90 ++++++++++++++-------------- 3 files changed, 52 insertions(+), 44 deletions(-) diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index de67d830151b..de92c86221e7 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -31,6 +31,7 @@ struct eeh_event { struct eeh_pe *pe; /* EEH PE */ }; +int eeh_event_init(void); int eeh_send_failure_event(struct eeh_pe *pe); void eeh_handle_event(struct eeh_pe *pe); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 7d169d35b5b0..777ecc06af19 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -704,6 +704,11 @@ int __init eeh_init(void) raw_spin_lock_init(&confirm_error_lock); + /* Initialize EEH event */ + ret = eeh_event_init(); + if (ret) + return ret; + /* Enable EEH for all adapters */ if (eeh_probe_mode_devtree()) { list_for_each_entry_safe(hose, tmp, diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index 185bedd926df..62e532ddfb68 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -18,11 +18,10 @@ #include #include -#include #include +#include #include #include -#include #include #include #include @@ -35,14 +34,9 @@ * work-queue, where a worker thread can drive recovery. */ -/* EEH event workqueue setup. */ static DEFINE_SPINLOCK(eeh_eventlist_lock); +static struct semaphore eeh_eventlist_sem; LIST_HEAD(eeh_eventlist); -static void eeh_thread_launcher(struct work_struct *); -DECLARE_WORK(eeh_event_wq, eeh_thread_launcher); - -/* Serialize reset sequences for a given pci device */ -DEFINE_MUTEX(eeh_event_mutex); /** * eeh_event_handler - Dispatch EEH events. @@ -60,55 +54,62 @@ static int eeh_event_handler(void * dummy) struct eeh_event *event; struct eeh_pe *pe; - spin_lock_irqsave(&eeh_eventlist_lock, flags); - event = NULL; + while (!kthread_should_stop()) { + down(&eeh_eventlist_sem); - /* Unqueue the event, get ready to process. */ - if (!list_empty(&eeh_eventlist)) { - event = list_entry(eeh_eventlist.next, struct eeh_event, list); - list_del(&event->list); - } - spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + /* Fetch EEH event from the queue */ + spin_lock_irqsave(&eeh_eventlist_lock, flags); + event = NULL; + if (!list_empty(&eeh_eventlist)) { + event = list_entry(eeh_eventlist.next, + struct eeh_event, list); + list_del(&event->list); + } + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + if (!event) + continue; - if (event == NULL) - return 0; - - /* Serialize processing of EEH events */ - mutex_lock(&eeh_event_mutex); - pe = event->pe; - eeh_pe_state_mark(pe, EEH_PE_RECOVERING); - pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n", - pe->phb->global_number, pe->addr); - - set_current_state(TASK_INTERRUPTIBLE); /* Don't add to load average */ - eeh_handle_event(pe); - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); - - kfree(event); - mutex_unlock(&eeh_event_mutex); - - /* If there are no new errors after an hour, clear the counter. */ - if (pe && pe->freeze_count > 0) { - msleep_interruptible(3600*1000); - if (pe->freeze_count > 0) - pe->freeze_count--; + /* We might have event without binding PE */ + pe = event->pe; + if (pe) { + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n", + pe->phb->global_number, pe->addr); + eeh_handle_event(pe); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + } else { + eeh_handle_event(NULL); + } + kfree(event); } return 0; } /** - * eeh_thread_launcher - Start kernel thread to handle EEH events - * @dummy - unused + * eeh_event_init - Start kernel thread to handle EEH events * * This routine is called to start the kernel thread for processing * EEH event. */ -static void eeh_thread_launcher(struct work_struct *dummy) +int eeh_event_init(void) { - if (IS_ERR(kthread_run(eeh_event_handler, NULL, "eehd"))) - printk(KERN_ERR "Failed to start EEH daemon\n"); + struct task_struct *t; + int ret = 0; + + /* Initialize semaphore */ + sema_init(&eeh_eventlist_sem, 0); + + t = kthread_run(eeh_event_handler, NULL, "eehd"); + if (IS_ERR(t)) { + ret = PTR_ERR(t); + pr_err("%s: Failed to start EEH daemon (%d)\n", + __func__, ret); + return ret; + } + + return 0; } /** @@ -136,7 +137,8 @@ int eeh_send_failure_event(struct eeh_pe *pe) list_add(&event->list, &eeh_eventlist); spin_unlock_irqrestore(&eeh_eventlist_lock, flags); - schedule_work(&eeh_event_wq); + /* For EEH deamon to knick in */ + up(&eeh_eventlist_sem); return 0; } From 5a71978e4b6ee6a01bc6aab926a3571055123029 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:01 +0800 Subject: [PATCH 055/156] powerpc/eeh: Trace time on first error for PE We're not expecting that one specific PE got frozen for over 5 times in last hour. Otherwise, the PE will be removed from the system upon newly coming EEH errors. The patch introduces time stamp to trace the first error on specific PE in last hour and function to update that accordingly. Besides, the time stamp is recovered during PE hotplug path as we did for frozen count. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/eeh.h | 3 +++ arch/powerpc/kernel/eeh_driver.c | 5 +++++ arch/powerpc/kernel/eeh_pe.c | 27 +++++++++++++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index beec7883d93e..e1109fd87ff4 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -24,6 +24,7 @@ #include #include #include +#include struct pci_dev; struct pci_bus; @@ -62,6 +63,7 @@ struct eeh_pe { struct pci_bus *bus; /* Top PCI bus for bus PE */ int check_count; /* Times of ignored error */ int freeze_count; /* Times of froze up */ + struct timeval tstamp; /* Time on first-time freeze */ int false_positives; /* Times of reported #ff's */ struct eeh_pe *parent; /* Parent PE */ struct list_head child_list; /* Link PE to the child list */ @@ -190,6 +192,7 @@ struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); struct eeh_pe *eeh_pe_get(struct eeh_dev *edev); int eeh_add_to_parent_pe(struct eeh_dev *edev); int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe); +void eeh_pe_update_time_stamp(struct eeh_pe *pe); void *eeh_pe_dev_traverse(struct eeh_pe *root, eeh_traverse_func fn, void *flag); void eeh_pe_restore_bars(struct eeh_pe *pe); diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index fb927af9a9ef..678bc6cddf82 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -349,10 +349,12 @@ static void *eeh_report_failure(void *data, void *userdata) */ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) { + struct timeval tstamp; int cnt, rc; /* pcibios will clear the counter; save the value */ cnt = pe->freeze_count; + tstamp = pe->tstamp; /* * We don't remove the corresponding PE instances because @@ -385,6 +387,8 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) ssleep(5); pcibios_add_pci_devices(bus); } + + pe->tstamp = tstamp; pe->freeze_count = cnt; return 0; @@ -425,6 +429,7 @@ void eeh_handle_event(struct eeh_pe *pe) return; } + eeh_pe_update_time_stamp(pe); pe->freeze_count++; if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) goto excess_failures; diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index c96366758acf..ae75722c0583 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -481,6 +481,33 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe) return 0; } +/** + * eeh_pe_update_time_stamp - Update PE's frozen time stamp + * @pe: EEH PE + * + * We have time stamp for each PE to trace its time of getting + * frozen in last hour. The function should be called to update + * the time stamp on first error of the specific PE. On the other + * handle, we needn't account for errors happened in last hour. + */ +void eeh_pe_update_time_stamp(struct eeh_pe *pe) +{ + struct timeval tstamp; + + if (!pe) return; + + if (pe->freeze_count <= 0) { + pe->freeze_count = 0; + do_gettimeofday(&pe->tstamp); + } else { + do_gettimeofday(&tstamp); + if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) { + pe->tstamp = tstamp; + pe->freeze_count = 0; + } + } +} + /** * __eeh_pe_state_mark - Mark the state for the PE * @data: EEH PE From 99866595340fa24079f61962b2541db3225e7aad Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:02 +0800 Subject: [PATCH 056/156] powerpc/eeh: Allow to purge EEH events On PowerNV platform, we might run into the situation where subsequent events are duplicated events of former one, which is being processed. For the case, we need the function implemented by the patch to purge EEH events accordingly. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/eeh_event.h | 1 + arch/powerpc/kernel/eeh_event.c | 37 ++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index de92c86221e7..89d5670b2eeb 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -33,6 +33,7 @@ struct eeh_event { int eeh_event_init(void); int eeh_send_failure_event(struct eeh_pe *pe); +void eeh_remove_event(struct eeh_pe *pe); void eeh_handle_event(struct eeh_pe *pe); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index 62e532ddfb68..39bcd81e7f5d 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -142,3 +142,40 @@ int eeh_send_failure_event(struct eeh_pe *pe) return 0; } + +/** + * eeh_remove_event - Remove EEH event from the queue + * @pe: Event binding to the PE + * + * On PowerNV platform, we might have subsequent coming events + * is part of the former one. For that case, those subsequent + * coming events are totally duplicated and unnecessary, thus + * they should be removed. + */ +void eeh_remove_event(struct eeh_pe *pe) +{ + unsigned long flags; + struct eeh_event *event, *tmp; + + spin_lock_irqsave(&eeh_eventlist_lock, flags); + list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) { + /* + * If we don't have valid PE passed in, that means + * we already have event corresponding to dead IOC + * and all events should be purged. + */ + if (!pe) { + list_del(&event->list); + kfree(event); + } else if (pe->type & EEH_PE_PHB) { + if (event->pe && event->pe->phb == pe->phb) { + list_del(&event->list); + kfree(event); + } + } else if (event->pe == pe) { + list_del(&event->list); + kfree(event); + } + } + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); +} From 4907581dc21f43f94d3a15dd98f62a8f936b3050 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:03 +0800 Subject: [PATCH 057/156] powerpc/eeh: Export confirm_error_lock An EEH event is created and queued to the event queue for each ingress EEH error. When there're mutiple EEH errors, we need serialize the process to keep consistent PE state (flags). The spinlock "confirm_error_lock" was introduced for the purpose. We'll inject EEH event upon error reporting interrupts on PowerNV platform. So we export the spinlock for that to use for consistent PE state. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/eeh.h | 11 +++++++++++ arch/powerpc/kernel/eeh.c | 10 ++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index e1109fd87ff4..0c0ac93f422f 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -150,6 +150,7 @@ struct eeh_ops { extern struct eeh_ops *eeh_ops; extern int eeh_subsystem_enabled; extern struct mutex eeh_mutex; +extern raw_spinlock_t confirm_error_lock; extern int eeh_probe_mode; #define EEH_PROBE_MODE_DEV (1<<0) /* From PCI device */ @@ -180,6 +181,16 @@ static inline void eeh_unlock(void) mutex_unlock(&eeh_mutex); } +static inline void eeh_serialize_lock(unsigned long *flags) +{ + raw_spin_lock_irqsave(&confirm_error_lock, *flags); +} + +static inline void eeh_serialize_unlock(unsigned long flags) +{ + raw_spin_unlock_irqrestore(&confirm_error_lock, flags); +} + /* * Max number of EEH freezes allowed before we consider the device * to be permanently disabled. diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 777ecc06af19..81cd0311dee8 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -107,7 +107,7 @@ int eeh_probe_mode; DEFINE_MUTEX(eeh_mutex); /* Lock to avoid races due to multiple reports of an error */ -static DEFINE_RAW_SPINLOCK(confirm_error_lock); +DEFINE_RAW_SPINLOCK(confirm_error_lock); /* Buffer for reporting pci register dumps. Its here in BSS, and * not dynamically alloced, so that it ends up in RMO where RTAS @@ -325,7 +325,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * in one slot might report errors simultaneously, and we * only want one error recovery routine running. */ - raw_spin_lock_irqsave(&confirm_error_lock, flags); + eeh_serialize_lock(&flags); rc = 1; if (pe->state & EEH_PE_ISOLATED) { pe->check_count++; @@ -374,7 +374,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * bridges. */ eeh_pe_state_mark(pe, EEH_PE_ISOLATED); - raw_spin_unlock_irqrestore(&confirm_error_lock, flags); + eeh_serialize_unlock(flags); eeh_send_failure_event(pe); @@ -386,7 +386,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) return 1; dn_unlock: - raw_spin_unlock_irqrestore(&confirm_error_lock, flags); + eeh_serialize_unlock(flags); return rc; } @@ -702,8 +702,6 @@ int __init eeh_init(void) return ret; } - raw_spin_lock_init(&confirm_error_lock); - /* Initialize EEH event */ ret = eeh_event_init(); if (ret) From 8a6b1bc70dbb538cb8a39e8c5be9c3dfd7b1f40e Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:04 +0800 Subject: [PATCH 058/156] powerpc/eeh: EEH core to handle special event On PowerNV platform, the EEH event caused by interrupt won't have binding PE. The patch enables EEH core to handle the special event. To avoid the current logic we have, The eeh_handle_event() is renamed to eeh_handle_normal_event(), and the eeh_handle_special_event() is introduced. The function eeh_handle_event() dispatches to above two functions according to the input parameter. Besides, new backend "next_error" added to eeh_ops and it's expected to have following return values: 4 - Dead IOC 3 - Dead PHB 2 - Fenced PHB 1 - Frozen PE 0 - No error found Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/eeh.h | 2 + arch/powerpc/kernel/eeh_driver.c | 128 ++++++++++++++++++++++++++----- 2 files changed, 112 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 0c0ac93f422f..a0b11fb3237e 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -53,6 +53,7 @@ struct device_node; #define EEH_PE_ISOLATED (1 << 0) /* Isolated PE */ #define EEH_PE_RECOVERING (1 << 1) /* Recovering PE */ +#define EEH_PE_PHB_DEAD (1 << 2) /* Dead PHB */ struct eeh_pe { int type; /* PE type: PHB/Bus/Device */ @@ -145,6 +146,7 @@ struct eeh_ops { int (*configure_bridge)(struct eeh_pe *pe); int (*read_config)(struct device_node *dn, int where, int size, u32 *val); int (*write_config)(struct device_node *dn, int where, int size, u32 val); + int (*next_error)(struct eeh_pe **pe); }; extern struct eeh_ops *eeh_ops; diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 678bc6cddf82..0974e1326842 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -399,24 +399,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) */ #define MAX_WAIT_FOR_RECOVERY 150 -/** - * eeh_handle_event - Reset a PCI device after hard lockup. - * @pe: EEH PE - * - * While PHB detects address or data parity errors on particular PCI - * slot, the associated PE will be frozen. Besides, DMA's occurring - * to wild addresses (which usually happen due to bugs in device - * drivers or in PCI adapter firmware) can cause EEH error. #SERR, - * #PERR or other misc PCI-related errors also can trigger EEH errors. - * - * Recovery process consists of unplugging the device driver (which - * generated hotplug events to userspace), then issuing a PCI #RST to - * the device, then reconfiguring the PCI config space for all bridges - * & devices under this slot, and then finally restarting the device - * drivers (which cause a second set of hotplug events to go out to - * userspace). - */ -void eeh_handle_event(struct eeh_pe *pe) +static void eeh_handle_normal_event(struct eeh_pe *pe) { struct pci_bus *frozen_bus; int rc = 0; @@ -554,3 +537,112 @@ perm_error: if (frozen_bus) pcibios_remove_pci_devices(frozen_bus); } + +static void eeh_handle_special_event(void) +{ + struct eeh_pe *pe, *phb_pe; + struct pci_bus *bus; + struct pci_controller *hose, *tmp; + unsigned long flags; + int rc = 0; + + /* + * The return value from next_error() has been classified as follows. + * It might be good to enumerate them. However, next_error() is only + * supported by PowerNV platform for now. So it would be fine to use + * integer directly: + * + * 4 - Dead IOC 3 - Dead PHB + * 2 - Fenced PHB 1 - Frozen PE + * 0 - No error found + * + */ + rc = eeh_ops->next_error(&pe); + if (rc <= 0) + return; + + switch (rc) { + case 4: + /* Mark all PHBs in dead state */ + eeh_serialize_lock(&flags); + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) { + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe) continue; + + eeh_pe_state_mark(phb_pe, + EEH_PE_ISOLATED | EEH_PE_PHB_DEAD); + } + eeh_serialize_unlock(flags); + + /* Purge all events */ + eeh_remove_event(NULL); + break; + case 3: + case 2: + case 1: + /* Mark the PE in fenced state */ + eeh_serialize_lock(&flags); + if (rc == 3) + eeh_pe_state_mark(pe, + EEH_PE_ISOLATED | EEH_PE_PHB_DEAD); + else + eeh_pe_state_mark(pe, + EEH_PE_ISOLATED | EEH_PE_RECOVERING); + eeh_serialize_unlock(flags); + + /* Purge all events of the PHB */ + eeh_remove_event(pe); + break; + default: + pr_err("%s: Invalid value %d from next_error()\n", + __func__, rc); + return; + } + + /* + * For fenced PHB and frozen PE, it's handled as normal + * event. We have to remove the affected PHBs for dead + * PHB and IOC + */ + if (rc == 2 || rc == 1) + eeh_handle_normal_event(pe); + else { + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) { + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe || !(phb_pe->state & EEH_PE_PHB_DEAD)) + continue; + + bus = eeh_pe_bus_get(phb_pe); + /* Notify all devices that they're about to go down. */ + eeh_pe_dev_traverse(pe, eeh_report_failure, NULL); + pcibios_remove_pci_devices(bus); + } + } +} + +/** + * eeh_handle_event - Reset a PCI device after hard lockup. + * @pe: EEH PE + * + * While PHB detects address or data parity errors on particular PCI + * slot, the associated PE will be frozen. Besides, DMA's occurring + * to wild addresses (which usually happen due to bugs in device + * drivers or in PCI adapter firmware) can cause EEH error. #SERR, + * #PERR or other misc PCI-related errors also can trigger EEH errors. + * + * Recovery process consists of unplugging the device driver (which + * generated hotplug events to userspace), then issuing a PCI #RST to + * the device, then reconfiguring the PCI config space for all bridges + * & devices under this slot, and then finally restarting the device + * drivers (which cause a second set of hotplug events to go out to + * userspace). + */ +void eeh_handle_event(struct eeh_pe *pe) +{ + if (pe) + eeh_handle_normal_event(pe); + else + eeh_handle_special_event(); +} From 23773230c823cf79415a436aa26009025008fef5 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:05 +0800 Subject: [PATCH 059/156] powerpc/eeh: Sync OPAL API with firmware The patch synchronizes OPAL APIs between kernel and firmware. Also, we starts to replace opal_pci_get_phb_diag_data() with the similar opal_pci_get_phb_diag_data2() and the former OPAL API would return OPAL_UNSUPPORTED from now on. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/opal.h | 135 +++++++++++++++--- .../powerpc/platforms/powernv/opal-wrappers.S | 3 + arch/powerpc/platforms/powernv/pci.c | 3 +- 3 files changed, 119 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index cbb9305ab15a..28807978bb46 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -117,7 +117,13 @@ extern int opal_enter_rtas(struct rtas_args *args, #define OPAL_SET_SLOT_LED_STATUS 55 #define OPAL_GET_EPOW_STATUS 56 #define OPAL_SET_SYSTEM_ATTENTION_LED 57 +#define OPAL_RESERVED1 58 +#define OPAL_RESERVED2 59 +#define OPAL_PCI_NEXT_ERROR 60 +#define OPAL_PCI_EEH_FREEZE_STATUS2 61 +#define OPAL_PCI_POLL 62 #define OPAL_PCI_MSI_EOI 63 +#define OPAL_PCI_GET_PHB_DIAG_DATA2 64 #ifndef __ASSEMBLY__ @@ -125,6 +131,7 @@ extern int opal_enter_rtas(struct rtas_args *args, enum OpalVendorApiTokens { OPAL_START_VENDOR_API_RANGE = 1000, OPAL_END_VENDOR_API_RANGE = 1999 }; + enum OpalFreezeState { OPAL_EEH_STOPPED_NOT_FROZEN = 0, OPAL_EEH_STOPPED_MMIO_FREEZE = 1, @@ -134,55 +141,69 @@ enum OpalFreezeState { OPAL_EEH_STOPPED_TEMP_UNAVAIL = 5, OPAL_EEH_STOPPED_PERM_UNAVAIL = 6 }; + enum OpalEehFreezeActionToken { OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO = 1, OPAL_EEH_ACTION_CLEAR_FREEZE_DMA = 2, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL = 3 }; + enum OpalPciStatusToken { - OPAL_EEH_PHB_NO_ERROR = 0, - OPAL_EEH_PHB_FATAL = 1, - OPAL_EEH_PHB_RECOVERABLE = 2, - OPAL_EEH_PHB_BUS_ERROR = 3, - OPAL_EEH_PCI_NO_DEVSEL = 4, - OPAL_EEH_PCI_TA = 5, - OPAL_EEH_PCIEX_UR = 6, - OPAL_EEH_PCIEX_CA = 7, - OPAL_EEH_PCI_MMIO_ERROR = 8, - OPAL_EEH_PCI_DMA_ERROR = 9 + OPAL_EEH_NO_ERROR = 0, + OPAL_EEH_IOC_ERROR = 1, + OPAL_EEH_PHB_ERROR = 2, + OPAL_EEH_PE_ERROR = 3, + OPAL_EEH_PE_MMIO_ERROR = 4, + OPAL_EEH_PE_DMA_ERROR = 5 }; + +enum OpalPciErrorSeverity { + OPAL_EEH_SEV_NO_ERROR = 0, + OPAL_EEH_SEV_IOC_DEAD = 1, + OPAL_EEH_SEV_PHB_DEAD = 2, + OPAL_EEH_SEV_PHB_FENCED = 3, + OPAL_EEH_SEV_PE_ER = 4, + OPAL_EEH_SEV_INF = 5 +}; + enum OpalShpcAction { OPAL_SHPC_GET_LINK_STATE = 0, OPAL_SHPC_GET_SLOT_STATE = 1 }; + enum OpalShpcLinkState { OPAL_SHPC_LINK_DOWN = 0, OPAL_SHPC_LINK_UP = 1 }; + enum OpalMmioWindowType { OPAL_M32_WINDOW_TYPE = 1, OPAL_M64_WINDOW_TYPE = 2, OPAL_IO_WINDOW_TYPE = 3 }; + enum OpalShpcSlotState { OPAL_SHPC_DEV_NOT_PRESENT = 0, OPAL_SHPC_DEV_PRESENT = 1 }; + enum OpalExceptionHandler { OPAL_MACHINE_CHECK_HANDLER = 1, OPAL_HYPERVISOR_MAINTENANCE_HANDLER = 2, OPAL_SOFTPATCH_HANDLER = 3 }; + enum OpalPendingState { - OPAL_EVENT_OPAL_INTERNAL = 0x1, - OPAL_EVENT_NVRAM = 0x2, - OPAL_EVENT_RTC = 0x4, - OPAL_EVENT_CONSOLE_OUTPUT = 0x8, - OPAL_EVENT_CONSOLE_INPUT = 0x10, - OPAL_EVENT_ERROR_LOG_AVAIL = 0x20, - OPAL_EVENT_ERROR_LOG = 0x40, - OPAL_EVENT_EPOW = 0x80, - OPAL_EVENT_LED_STATUS = 0x100 + OPAL_EVENT_OPAL_INTERNAL = 0x1, + OPAL_EVENT_NVRAM = 0x2, + OPAL_EVENT_RTC = 0x4, + OPAL_EVENT_CONSOLE_OUTPUT = 0x8, + OPAL_EVENT_CONSOLE_INPUT = 0x10, + OPAL_EVENT_ERROR_LOG_AVAIL = 0x20, + OPAL_EVENT_ERROR_LOG = 0x40, + OPAL_EVENT_EPOW = 0x80, + OPAL_EVENT_LED_STATUS = 0x100, + OPAL_EVENT_PCI_ERROR = 0x200 }; /* Machine check related definitions */ @@ -364,15 +385,80 @@ struct opal_machine_check_event { } u; }; +enum { + OPAL_P7IOC_DIAG_TYPE_NONE = 0, + OPAL_P7IOC_DIAG_TYPE_RGC = 1, + OPAL_P7IOC_DIAG_TYPE_BI = 2, + OPAL_P7IOC_DIAG_TYPE_CI = 3, + OPAL_P7IOC_DIAG_TYPE_MISC = 4, + OPAL_P7IOC_DIAG_TYPE_I2C = 5, + OPAL_P7IOC_DIAG_TYPE_LAST = 6 +}; + +struct OpalIoP7IOCErrorData { + uint16_t type; + + /* GEM */ + uint64_t gemXfir; + uint64_t gemRfir; + uint64_t gemRirqfir; + uint64_t gemMask; + uint64_t gemRwof; + + /* LEM */ + uint64_t lemFir; + uint64_t lemErrMask; + uint64_t lemAction0; + uint64_t lemAction1; + uint64_t lemWof; + + union { + struct OpalIoP7IOCRgcErrorData { + uint64_t rgcStatus; /* 3E1C10 */ + uint64_t rgcLdcp; /* 3E1C18 */ + }rgc; + struct OpalIoP7IOCBiErrorData { + uint64_t biLdcp0; /* 3C0100, 3C0118 */ + uint64_t biLdcp1; /* 3C0108, 3C0120 */ + uint64_t biLdcp2; /* 3C0110, 3C0128 */ + uint64_t biFenceStatus; /* 3C0130, 3C0130 */ + + uint8_t biDownbound; /* BI Downbound or Upbound */ + }bi; + struct OpalIoP7IOCCiErrorData { + uint64_t ciPortStatus; /* 3Dn008 */ + uint64_t ciPortLdcp; /* 3Dn010 */ + + uint8_t ciPort; /* Index of CI port: 0/1 */ + }ci; + }; +}; + /** * This structure defines the overlay which will be used to store PHB error * data upon request. */ +enum { + OPAL_PHB_ERROR_DATA_VERSION_1 = 1, +}; + +enum { + OPAL_PHB_ERROR_DATA_TYPE_P7IOC = 1, +}; + enum { OPAL_P7IOC_NUM_PEST_REGS = 128, }; +struct OpalIoPhbErrorCommon { + uint32_t version; + uint32_t ioType; + uint32_t len; +}; + struct OpalIoP7IOCPhbErrorData { + struct OpalIoPhbErrorCommon common; + uint32_t brdgCtl; // P7IOC utl regs @@ -530,14 +616,21 @@ int64_t opal_pci_map_pe_dma_window_real(uint64_t phb_id, uint16_t pe_number, uint64_t pci_mem_size); int64_t opal_pci_reset(uint64_t phb_id, uint8_t reset_scope, uint8_t assert_state); -int64_t opal_pci_get_hub_diag_data(uint64_t hub_id, void *diag_buffer, uint64_t diag_buffer_len); -int64_t opal_pci_get_phb_diag_data(uint64_t phb_id, void *diag_buffer, uint64_t diag_buffer_len); +int64_t opal_pci_get_hub_diag_data(uint64_t hub_id, void *diag_buffer, + uint64_t diag_buffer_len); +int64_t opal_pci_get_phb_diag_data(uint64_t phb_id, void *diag_buffer, + uint64_t diag_buffer_len); +int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id, void *diag_buffer, + uint64_t diag_buffer_len); int64_t opal_pci_fence_phb(uint64_t phb_id); int64_t opal_pci_reinit(uint64_t phb_id, uint8_t reinit_scope); int64_t opal_pci_mask_pe_error(uint64_t phb_id, uint16_t pe_number, uint8_t error_type, uint8_t mask_action); int64_t opal_set_slot_led_status(uint64_t phb_id, uint64_t slot_id, uint8_t led_type, uint8_t led_action); int64_t opal_get_epow_status(uint64_t *status); int64_t opal_set_system_attention_led(uint8_t led_action); +int64_t opal_pci_next_error(uint64_t phb_id, uint64_t *first_frozen_pe, + uint16_t *pci_error_type, uint16_t *severity); +int64_t opal_pci_poll(uint64_t phb_id); /* Internal functions */ extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data); diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 6fabe92eafb6..e88863ffb135 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -107,4 +107,7 @@ OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR); OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS); OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS); OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED); +OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR); +OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL); OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI); +OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index e16b729f46f9..5edceb7f746d 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -203,7 +203,8 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) spin_lock_irqsave(&phb->lock, flags); - rc = opal_pci_get_phb_diag_data(phb->opal_id, phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE); + rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, + PNV_PCI_DIAG_BUF_SIZE); has_diag = (rc == OPAL_SUCCESS); rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, From 8747f36324bbe7f762bd9744d2dd20ebda021547 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:06 +0800 Subject: [PATCH 060/156] powerpc/eeh: EEH backend for P7IOC For EEH on PowerNV platform, the overall architecture is different from that on pSeries platform. In order to support multiple I/O chips in future, we split EEH to 3 layers for PowerNV platform: EEH core, platform layer, I/O layer. It would give EEH implementation on PowerNV platform much more flexibility in future. The patch adds the EEH backend for P7IOC. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/Makefile | 1 + arch/powerpc/platforms/powernv/eeh-ioda.c | 45 +++++++++++++++++++++++ arch/powerpc/platforms/powernv/pci.h | 23 ++++++++++++ 3 files changed, 69 insertions(+) create mode 100644 arch/powerpc/platforms/powernv/eeh-ioda.c diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index bcc3cb48a44e..09bd0cbb7c4f 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -3,3 +3,4 @@ obj-y += opal-rtc.o opal-nvram.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o +obj-$(CONFIG_EEH) += eeh-ioda.o diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c new file mode 100644 index 000000000000..f12e8887b144 --- /dev/null +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -0,0 +1,45 @@ +/* + * The file intends to implement the functions needed by EEH, which is + * built on IODA compliant chip. Actually, lots of functions related + * to EEH would be built based on the OPAL APIs. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "powernv.h" +#include "pci.h" + +struct pnv_eeh_ops ioda_eeh_ops = { + .post_init = NULL, + .set_option = NULL, + .get_state = NULL, + .reset = NULL, + .get_log = NULL, + .configure_bridge = NULL, + .next_error = NULL +}; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 25d76c4df50b..336c9dc1b314 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -66,15 +66,35 @@ struct pnv_ioda_pe { struct list_head list; }; +/* IOC dependent EEH operations */ +#ifdef CONFIG_EEH +struct pnv_eeh_ops { + int (*post_init)(struct pci_controller *hose); + int (*set_option)(struct eeh_pe *pe, int option); + int (*get_state)(struct eeh_pe *pe); + int (*reset)(struct eeh_pe *pe, int option); + int (*get_log)(struct eeh_pe *pe, int severity, + char *drv_log, unsigned long len); + int (*configure_bridge)(struct eeh_pe *pe); + int (*next_error)(struct eeh_pe **pe); +}; +#endif /* CONFIG_EEH */ + struct pnv_phb { struct pci_controller *hose; enum pnv_phb_type type; enum pnv_phb_model model; + u64 hub_id; u64 opal_id; void __iomem *regs; int initialized; spinlock_t lock; +#ifdef CONFIG_EEH + struct pnv_eeh_ops *eeh_ops; + int eeh_enabled; +#endif + #ifdef CONFIG_PCI_MSI unsigned int msi_base; unsigned int msi32_support; @@ -150,6 +170,9 @@ struct pnv_phb { }; extern struct pci_ops pnv_pci_ops; +#ifdef CONFIG_EEH +extern struct pnv_eeh_ops ioda_eeh_ops; +#endif extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, From 73370c662b4e453185201b44b775a49e95870009 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:07 +0800 Subject: [PATCH 061/156] powerpc/eeh: I/O chip post initialization The post initialization (struct eeh_ops::post_init) is called after the EEH probe is done. On the other hand, the EEH core post initialization is designed to call platform and then I/O chip backend on PowerNV platform. The patch adds the backend for I/O chip to notify the platform that the specific PHB is ready to supply EEH service. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/eeh-ioda.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index f12e8887b144..7b272416b9cc 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -34,8 +34,27 @@ #include "powernv.h" #include "pci.h" +/** + * ioda_eeh_post_init - Chip dependent post initialization + * @hose: PCI controller + * + * The function will be called after eeh PEs and devices + * have been built. That means the EEH is ready to supply + * service with I/O cache. + */ +static int ioda_eeh_post_init(struct pci_controller *hose) +{ + struct pnv_phb *phb = hose->private_data; + + /* FIXME: Enable it for PHB3 later */ + if (phb->type == PNV_PHB_IODA1) + phb->eeh_enabled = 1; + + return 0; +} + struct pnv_eeh_ops ioda_eeh_ops = { - .post_init = NULL, + .post_init = ioda_eeh_post_init, .set_option = NULL, .get_state = NULL, .reset = NULL, From eb0059836baa14c58ebad030684846213aaece89 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:08 +0800 Subject: [PATCH 062/156] powerpc/eeh: I/O chip EEH enable option The patch adds the backend to enable or disable EEH functionality for the specified PE. The backend is also used to enable MMIO or DMA path for the problematic PE. It's notable that all PEs on PowerNV platform support EEH functionality by default, and we disallow to disable EEH for the specific PE. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/eeh-ioda.c | 65 ++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index 7b272416b9cc..744eb9e39be2 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -53,9 +53,72 @@ static int ioda_eeh_post_init(struct pci_controller *hose) return 0; } +/** + * ioda_eeh_set_option - Set EEH operation or I/O setting + * @pe: EEH PE + * @option: options + * + * Enable or disable EEH option for the indicated PE. The + * function also can be used to enable I/O or DMA for the + * PE. + */ +static int ioda_eeh_set_option(struct eeh_pe *pe, int option) +{ + s64 ret; + u32 pe_no; + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + + /* Check on PE number */ + if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) { + pr_err("%s: PE address %x out of range [0, %x] " + "on PHB#%x\n", + __func__, pe->addr, phb->ioda.total_pe, + hose->global_number); + return -EINVAL; + } + + pe_no = pe->addr; + switch (option) { + case EEH_OPT_DISABLE: + ret = -EEXIST; + break; + case EEH_OPT_ENABLE: + ret = 0; + break; + case EEH_OPT_THAW_MMIO: + ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, + OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO); + if (ret) { + pr_warning("%s: Failed to enable MMIO for " + "PHB#%x-PE#%x, err=%lld\n", + __func__, hose->global_number, pe_no, ret); + return -EIO; + } + + break; + case EEH_OPT_THAW_DMA: + ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, + OPAL_EEH_ACTION_CLEAR_FREEZE_DMA); + if (ret) { + pr_warning("%s: Failed to enable DMA for " + "PHB#%x-PE#%x, err=%lld\n", + __func__, hose->global_number, pe_no, ret); + return -EIO; + } + + break; + default: + pr_warning("%s: Invalid option %d\n", __func__, option); + return -EINVAL; + } + + return ret; +} + struct pnv_eeh_ops ioda_eeh_ops = { .post_init = ioda_eeh_post_init, - .set_option = NULL, + .set_option = ioda_eeh_set_option, .get_state = NULL, .reset = NULL, .get_log = NULL, From 8c41a7f3f7593fe57578f3f649232a5842d4c65d Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:09 +0800 Subject: [PATCH 063/156] powerpc/eeh: I/O chip EEH state retrieval The patch adds I/O chip backend to retrieve the state for the indicated PE. While the PE state is temperarily unavailable, the upper layer (powernv platform) should return default delay (1 second). Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/eeh-ioda.c | 99 ++++++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index 744eb9e39be2..a76870b6a184 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -116,10 +116,107 @@ static int ioda_eeh_set_option(struct eeh_pe *pe, int option) return ret; } +/** + * ioda_eeh_get_state - Retrieve the state of PE + * @pe: EEH PE + * + * The PE's state should be retrieved from the PEEV, PEST + * IODA tables. Since the OPAL has exported the function + * to do it, it'd better to use that. + */ +static int ioda_eeh_get_state(struct eeh_pe *pe) +{ + s64 ret = 0; + u8 fstate; + u16 pcierr; + u32 pe_no; + int result; + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + + /* + * Sanity check on PE address. The PHB PE address should + * be zero. + */ + if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) { + pr_err("%s: PE address %x out of range [0, %x] " + "on PHB#%x\n", + __func__, pe->addr, phb->ioda.total_pe, + hose->global_number); + return EEH_STATE_NOT_SUPPORT; + } + + /* Retrieve PE status through OPAL */ + pe_no = pe->addr; + ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, + &fstate, &pcierr, NULL); + if (ret) { + pr_err("%s: Failed to get EEH status on " + "PHB#%x-PE#%x\n, err=%lld\n", + __func__, hose->global_number, pe_no, ret); + return EEH_STATE_NOT_SUPPORT; + } + + /* Check PHB status */ + if (pe->type & EEH_PE_PHB) { + result = 0; + result &= ~EEH_STATE_RESET_ACTIVE; + + if (pcierr != OPAL_EEH_PHB_ERROR) { + result |= EEH_STATE_MMIO_ACTIVE; + result |= EEH_STATE_DMA_ACTIVE; + result |= EEH_STATE_MMIO_ENABLED; + result |= EEH_STATE_DMA_ENABLED; + } + + return result; + } + + /* Parse result out */ + result = 0; + switch (fstate) { + case OPAL_EEH_STOPPED_NOT_FROZEN: + result &= ~EEH_STATE_RESET_ACTIVE; + result |= EEH_STATE_MMIO_ACTIVE; + result |= EEH_STATE_DMA_ACTIVE; + result |= EEH_STATE_MMIO_ENABLED; + result |= EEH_STATE_DMA_ENABLED; + break; + case OPAL_EEH_STOPPED_MMIO_FREEZE: + result &= ~EEH_STATE_RESET_ACTIVE; + result |= EEH_STATE_DMA_ACTIVE; + result |= EEH_STATE_DMA_ENABLED; + break; + case OPAL_EEH_STOPPED_DMA_FREEZE: + result &= ~EEH_STATE_RESET_ACTIVE; + result |= EEH_STATE_MMIO_ACTIVE; + result |= EEH_STATE_MMIO_ENABLED; + break; + case OPAL_EEH_STOPPED_MMIO_DMA_FREEZE: + result &= ~EEH_STATE_RESET_ACTIVE; + break; + case OPAL_EEH_STOPPED_RESET: + result |= EEH_STATE_RESET_ACTIVE; + break; + case OPAL_EEH_STOPPED_TEMP_UNAVAIL: + result |= EEH_STATE_UNAVAILABLE; + break; + case OPAL_EEH_STOPPED_PERM_UNAVAIL: + result |= EEH_STATE_NOT_SUPPORT; + break; + default: + pr_warning("%s: Unexpected EEH status 0x%x " + "on PHB#%x-PE#%x\n", + __func__, fstate, hose->global_number, pe_no); + } + + return result; +} + struct pnv_eeh_ops ioda_eeh_ops = { .post_init = ioda_eeh_post_init, .set_option = ioda_eeh_set_option, - .get_state = NULL, + .get_state = ioda_eeh_get_state, .reset = NULL, .get_log = NULL, .configure_bridge = NULL, From 9d5cab00108f42a5ab51cfb1fcb03130679a5762 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:10 +0800 Subject: [PATCH 064/156] powerpc/eeh: I/O chip PE reset The patch adds the I/O chip backend to do PE reset. For now, we focus on PCI bus dependent PE. If PHB PE has been put into error state, the PHB will take complete reset. Besides, the root bridge will take fundamental or hot reset accordingly if the indicated PE locates at the toppest of PCI hierarchy tree. Otherwise, the upstream p2p bridge will take hot reset. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/eeh-ioda.c | 234 +++++++++++++++++++++- 1 file changed, 233 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index a76870b6a184..ea5fa05f8bbf 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -213,11 +213,243 @@ static int ioda_eeh_get_state(struct eeh_pe *pe) return result; } +static int ioda_eeh_pe_clear(struct eeh_pe *pe) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + u32 pe_no; + u8 fstate; + u16 pcierr; + s64 ret; + + pe_no = pe->addr; + hose = pe->phb; + phb = pe->phb->private_data; + + /* Clear the EEH error on the PE */ + ret = opal_pci_eeh_freeze_clear(phb->opal_id, + pe_no, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + if (ret) { + pr_err("%s: Failed to clear EEH error for " + "PHB#%x-PE#%x, err=%lld\n", + __func__, hose->global_number, pe_no, ret); + return -EIO; + } + + /* + * Read the PE state back and verify that the frozen + * state has been removed. + */ + ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, + &fstate, &pcierr, NULL); + if (ret) { + pr_err("%s: Failed to get EEH status on " + "PHB#%x-PE#%x\n, err=%lld\n", + __func__, hose->global_number, pe_no, ret); + return -EIO; + } + + if (fstate != OPAL_EEH_STOPPED_NOT_FROZEN) { + pr_err("%s: Frozen state not cleared on " + "PHB#%x-PE#%x, sts=%x\n", + __func__, hose->global_number, pe_no, fstate); + return -EIO; + } + + return 0; +} + +static s64 ioda_eeh_phb_poll(struct pnv_phb *phb) +{ + s64 rc = OPAL_HARDWARE; + + while (1) { + rc = opal_pci_poll(phb->opal_id); + if (rc <= 0) + break; + + msleep(rc); + } + + return rc; +} + +static int ioda_eeh_phb_reset(struct pci_controller *hose, int option) +{ + struct pnv_phb *phb = hose->private_data; + s64 rc = OPAL_HARDWARE; + + pr_debug("%s: Reset PHB#%x, option=%d\n", + __func__, hose->global_number, option); + + /* Issue PHB complete reset request */ + if (option == EEH_RESET_FUNDAMENTAL || + option == EEH_RESET_HOT) + rc = opal_pci_reset(phb->opal_id, + OPAL_PHB_COMPLETE, + OPAL_ASSERT_RESET); + else if (option == EEH_RESET_DEACTIVATE) + rc = opal_pci_reset(phb->opal_id, + OPAL_PHB_COMPLETE, + OPAL_DEASSERT_RESET); + if (rc < 0) + goto out; + + /* + * Poll state of the PHB until the request is done + * successfully. + */ + rc = ioda_eeh_phb_poll(phb); +out: + if (rc != OPAL_SUCCESS) + return -EIO; + + return 0; +} + +static int ioda_eeh_root_reset(struct pci_controller *hose, int option) +{ + struct pnv_phb *phb = hose->private_data; + s64 rc = OPAL_SUCCESS; + + pr_debug("%s: Reset PHB#%x, option=%d\n", + __func__, hose->global_number, option); + + /* + * During the reset deassert time, we needn't care + * the reset scope because the firmware does nothing + * for fundamental or hot reset during deassert phase. + */ + if (option == EEH_RESET_FUNDAMENTAL) + rc = opal_pci_reset(phb->opal_id, + OPAL_PCI_FUNDAMENTAL_RESET, + OPAL_ASSERT_RESET); + else if (option == EEH_RESET_HOT) + rc = opal_pci_reset(phb->opal_id, + OPAL_PCI_HOT_RESET, + OPAL_ASSERT_RESET); + else if (option == EEH_RESET_DEACTIVATE) + rc = opal_pci_reset(phb->opal_id, + OPAL_PCI_HOT_RESET, + OPAL_DEASSERT_RESET); + if (rc < 0) + goto out; + + /* Poll state of the PHB until the request is done */ + rc = ioda_eeh_phb_poll(phb); +out: + if (rc != OPAL_SUCCESS) + return -EIO; + + return 0; +} + +static int ioda_eeh_bridge_reset(struct pci_controller *hose, + struct pci_dev *dev, int option) +{ + u16 ctrl; + + pr_debug("%s: Reset device %04x:%02x:%02x.%01x with option %d\n", + __func__, hose->global_number, dev->bus->number, + PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), option); + + switch (option) { + case EEH_RESET_FUNDAMENTAL: + case EEH_RESET_HOT: + pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl); + ctrl |= PCI_BRIDGE_CTL_BUS_RESET; + pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl); + break; + case EEH_RESET_DEACTIVATE: + pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl); + ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET; + pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl); + break; + } + + return 0; +} + +/** + * ioda_eeh_reset - Reset the indicated PE + * @pe: EEH PE + * @option: reset option + * + * Do reset on the indicated PE. For PCI bus sensitive PE, + * we need to reset the parent p2p bridge. The PHB has to + * be reinitialized if the p2p bridge is root bridge. For + * PCI device sensitive PE, we will try to reset the device + * through FLR. For now, we don't have OPAL APIs to do HARD + * reset yet, so all reset would be SOFT (HOT) reset. + */ +static int ioda_eeh_reset(struct eeh_pe *pe, int option) +{ + struct pci_controller *hose = pe->phb; + struct eeh_dev *edev; + struct pci_dev *dev; + int ret; + + /* + * Anyway, we have to clear the problematic state for the + * corresponding PE. However, we needn't do it if the PE + * is PHB associated. That means the PHB is having fatal + * errors and it needs reset. Further more, the AIB interface + * isn't reliable any more. + */ + if (!(pe->type & EEH_PE_PHB) && + (option == EEH_RESET_HOT || + option == EEH_RESET_FUNDAMENTAL)) { + ret = ioda_eeh_pe_clear(pe); + if (ret) + return -EIO; + } + + /* + * The rules applied to reset, either fundamental or hot reset: + * + * We always reset the direct upstream bridge of the PE. If the + * direct upstream bridge isn't root bridge, we always take hot + * reset no matter what option (fundamental or hot) is. Otherwise, + * we should do the reset according to the required option. + */ + if (pe->type & EEH_PE_PHB) { + ret = ioda_eeh_phb_reset(hose, option); + } else { + if (pe->type & EEH_PE_DEVICE) { + /* + * If it's device PE, we didn't refer to the parent + * PCI bus yet. So we have to figure it out indirectly. + */ + edev = list_first_entry(&pe->edevs, + struct eeh_dev, list); + dev = eeh_dev_to_pci_dev(edev); + dev = dev->bus->self; + } else { + /* + * If it's bus PE, the parent PCI bus is already there + * and just pick it up. + */ + dev = pe->bus->self; + } + + /* + * Do reset based on the fact that the direct upstream bridge + * is root bridge (port) or not. + */ + if (dev->bus->number == 0) + ret = ioda_eeh_root_reset(hose, option); + else + ret = ioda_eeh_bridge_reset(hose, dev, option); + } + + return ret; +} + struct pnv_eeh_ops ioda_eeh_ops = { .post_init = ioda_eeh_post_init, .set_option = ioda_eeh_set_option, .get_state = ioda_eeh_get_state, - .reset = NULL, + .reset = ioda_eeh_reset, .get_log = NULL, .configure_bridge = NULL, .next_error = NULL From bf90dfea2397fe136ce35bc896c3bc84133272c6 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:11 +0800 Subject: [PATCH 065/156] powerpc/eeh: I/O chip PE log and bridge setup The patch adds backends to retrieve error log and configure p2p bridges for the indicated PE. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/eeh-ioda.c | 57 ++++++++++++++++++++++- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index ea5fa05f8bbf..8d9c2d232d2a 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -445,12 +445,65 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option) return ret; } +/** + * ioda_eeh_get_log - Retrieve error log + * @pe: EEH PE + * @severity: Severity level of the log + * @drv_log: buffer to store the log + * @len: space of the log buffer + * + * The function is used to retrieve error log from P7IOC. + */ +static int ioda_eeh_get_log(struct eeh_pe *pe, int severity, + char *drv_log, unsigned long len) +{ + s64 ret; + unsigned long flags; + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + + spin_lock_irqsave(&phb->lock, flags); + + ret = opal_pci_get_phb_diag_data2(phb->opal_id, + phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE); + if (ret) { + spin_unlock_irqrestore(&phb->lock, flags); + pr_warning("%s: Failed to get log for PHB#%x-PE#%x\n", + __func__, hose->global_number, pe->addr); + return -EIO; + } + + /* + * FIXME: We probably need log the error in somewhere. + * Lets make it up in future. + */ + /* pr_info("%s", phb->diag.blob); */ + + spin_unlock_irqrestore(&phb->lock, flags); + + return 0; +} + +/** + * ioda_eeh_configure_bridge - Configure the PCI bridges for the indicated PE + * @pe: EEH PE + * + * For particular PE, it might have included PCI bridges. In order + * to make the PE work properly, those PCI bridges should be configured + * correctly. However, we need do nothing on P7IOC since the reset + * function will do everything that should be covered by the function. + */ +static int ioda_eeh_configure_bridge(struct eeh_pe *pe) +{ + return 0; +} + struct pnv_eeh_ops ioda_eeh_ops = { .post_init = ioda_eeh_post_init, .set_option = ioda_eeh_set_option, .get_state = ioda_eeh_get_state, .reset = ioda_eeh_reset, - .get_log = NULL, - .configure_bridge = NULL, + .get_log = ioda_eeh_get_log, + .configure_bridge = ioda_eeh_configure_bridge, .next_error = NULL }; From 70f942db4669c4417b7bb4f3353b3eddf1179aae Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:12 +0800 Subject: [PATCH 066/156] powerpc/eeh: I/O chip next error The patch implements the backend for EEH core to retrieve next EEH error to handle. For the informational errors, we won't bother the EEH core. Otherwise, the EEH should take appropriate actions depending on the return value: 0 - No further errors detected 1 - Frozen PE 2 - Fenced PHB 3 - Dead PHB 4 - Dead IOC Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/eeh-ioda.c | 334 +++++++++++++++++++++- arch/powerpc/platforms/powernv/pci.h | 1 + 2 files changed, 333 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index 8d9c2d232d2a..a3eebd193dfe 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -34,6 +34,15 @@ #include "powernv.h" #include "pci.h" +/* Debugging option */ +#ifdef IODA_EEH_DBG_ON +#define IODA_EEH_DBG(args...) pr_info(args) +#else +#define IODA_EEH_DBG(args...) +#endif + +static char *hub_diag = NULL; + /** * ioda_eeh_post_init - Chip dependent post initialization * @hose: PCI controller @@ -47,8 +56,19 @@ static int ioda_eeh_post_init(struct pci_controller *hose) struct pnv_phb *phb = hose->private_data; /* FIXME: Enable it for PHB3 later */ - if (phb->type == PNV_PHB_IODA1) + if (phb->type == PNV_PHB_IODA1) { + if (!hub_diag) { + hub_diag = (char *)__get_free_page(GFP_KERNEL | + __GFP_ZERO); + if (!hub_diag) { + pr_err("%s: Out of memory !\n", + __func__); + return -ENOMEM; + } + } + phb->eeh_enabled = 1; + } return 0; } @@ -498,6 +518,316 @@ static int ioda_eeh_configure_bridge(struct eeh_pe *pe) return 0; } +static void ioda_eeh_hub_diag_common(struct OpalIoP7IOCErrorData *data) +{ + /* GEM */ + pr_info(" GEM XFIR: %016llx\n", data->gemXfir); + pr_info(" GEM RFIR: %016llx\n", data->gemRfir); + pr_info(" GEM RIRQFIR: %016llx\n", data->gemRirqfir); + pr_info(" GEM Mask: %016llx\n", data->gemMask); + pr_info(" GEM RWOF: %016llx\n", data->gemRwof); + + /* LEM */ + pr_info(" LEM FIR: %016llx\n", data->lemFir); + pr_info(" LEM Error Mask: %016llx\n", data->lemErrMask); + pr_info(" LEM Action 0: %016llx\n", data->lemAction0); + pr_info(" LEM Action 1: %016llx\n", data->lemAction1); + pr_info(" LEM WOF: %016llx\n", data->lemWof); +} + +static void ioda_eeh_hub_diag(struct pci_controller *hose) +{ + struct pnv_phb *phb = hose->private_data; + struct OpalIoP7IOCErrorData *data; + long rc; + + data = (struct OpalIoP7IOCErrorData *)ioda_eeh_hub_diag; + rc = opal_pci_get_hub_diag_data(phb->hub_id, data, PAGE_SIZE); + if (rc != OPAL_SUCCESS) { + pr_warning("%s: Failed to get HUB#%llx diag-data (%ld)\n", + __func__, phb->hub_id, rc); + return; + } + + switch (data->type) { + case OPAL_P7IOC_DIAG_TYPE_RGC: + pr_info("P7IOC diag-data for RGC\n\n"); + ioda_eeh_hub_diag_common(data); + pr_info(" RGC Status: %016llx\n", data->rgc.rgcStatus); + pr_info(" RGC LDCP: %016llx\n", data->rgc.rgcLdcp); + break; + case OPAL_P7IOC_DIAG_TYPE_BI: + pr_info("P7IOC diag-data for BI %s\n\n", + data->bi.biDownbound ? "Downbound" : "Upbound"); + ioda_eeh_hub_diag_common(data); + pr_info(" BI LDCP 0: %016llx\n", data->bi.biLdcp0); + pr_info(" BI LDCP 1: %016llx\n", data->bi.biLdcp1); + pr_info(" BI LDCP 2: %016llx\n", data->bi.biLdcp2); + pr_info(" BI Fence Status: %016llx\n", data->bi.biFenceStatus); + break; + case OPAL_P7IOC_DIAG_TYPE_CI: + pr_info("P7IOC diag-data for CI Port %d\\nn", + data->ci.ciPort); + ioda_eeh_hub_diag_common(data); + pr_info(" CI Port Status: %016llx\n", data->ci.ciPortStatus); + pr_info(" CI Port LDCP: %016llx\n", data->ci.ciPortLdcp); + break; + case OPAL_P7IOC_DIAG_TYPE_MISC: + pr_info("P7IOC diag-data for MISC\n\n"); + ioda_eeh_hub_diag_common(data); + break; + case OPAL_P7IOC_DIAG_TYPE_I2C: + pr_info("P7IOC diag-data for I2C\n\n"); + ioda_eeh_hub_diag_common(data); + break; + default: + pr_warning("%s: Invalid type of HUB#%llx diag-data (%d)\n", + __func__, phb->hub_id, data->type); + } +} + +static void ioda_eeh_p7ioc_phb_diag(struct pci_controller *hose, + struct OpalIoPhbErrorCommon *common) +{ + struct OpalIoP7IOCPhbErrorData *data; + int i; + + data = (struct OpalIoP7IOCPhbErrorData *)common; + + pr_info("P7IOC PHB#%x Diag-data (Version: %d)\n\n", + hose->global_number, common->version); + + pr_info(" brdgCtl: %08x\n", data->brdgCtl); + + pr_info(" portStatusReg: %08x\n", data->portStatusReg); + pr_info(" rootCmplxStatus: %08x\n", data->rootCmplxStatus); + pr_info(" busAgentStatus: %08x\n", data->busAgentStatus); + + pr_info(" deviceStatus: %08x\n", data->deviceStatus); + pr_info(" slotStatus: %08x\n", data->slotStatus); + pr_info(" linkStatus: %08x\n", data->linkStatus); + pr_info(" devCmdStatus: %08x\n", data->devCmdStatus); + pr_info(" devSecStatus: %08x\n", data->devSecStatus); + + pr_info(" rootErrorStatus: %08x\n", data->rootErrorStatus); + pr_info(" uncorrErrorStatus: %08x\n", data->uncorrErrorStatus); + pr_info(" corrErrorStatus: %08x\n", data->corrErrorStatus); + pr_info(" tlpHdr1: %08x\n", data->tlpHdr1); + pr_info(" tlpHdr2: %08x\n", data->tlpHdr2); + pr_info(" tlpHdr3: %08x\n", data->tlpHdr3); + pr_info(" tlpHdr4: %08x\n", data->tlpHdr4); + pr_info(" sourceId: %08x\n", data->sourceId); + + pr_info(" errorClass: %016llx\n", data->errorClass); + pr_info(" correlator: %016llx\n", data->correlator); + pr_info(" p7iocPlssr: %016llx\n", data->p7iocPlssr); + pr_info(" p7iocCsr: %016llx\n", data->p7iocCsr); + pr_info(" lemFir: %016llx\n", data->lemFir); + pr_info(" lemErrorMask: %016llx\n", data->lemErrorMask); + pr_info(" lemWOF: %016llx\n", data->lemWOF); + pr_info(" phbErrorStatus: %016llx\n", data->phbErrorStatus); + pr_info(" phbFirstErrorStatus: %016llx\n", data->phbFirstErrorStatus); + pr_info(" phbErrorLog0: %016llx\n", data->phbErrorLog0); + pr_info(" phbErrorLog1: %016llx\n", data->phbErrorLog1); + pr_info(" mmioErrorStatus: %016llx\n", data->mmioErrorStatus); + pr_info(" mmioFirstErrorStatus: %016llx\n", data->mmioFirstErrorStatus); + pr_info(" mmioErrorLog0: %016llx\n", data->mmioErrorLog0); + pr_info(" mmioErrorLog1: %016llx\n", data->mmioErrorLog1); + pr_info(" dma0ErrorStatus: %016llx\n", data->dma0ErrorStatus); + pr_info(" dma0FirstErrorStatus: %016llx\n", data->dma0FirstErrorStatus); + pr_info(" dma0ErrorLog0: %016llx\n", data->dma0ErrorLog0); + pr_info(" dma0ErrorLog1: %016llx\n", data->dma0ErrorLog1); + pr_info(" dma1ErrorStatus: %016llx\n", data->dma1ErrorStatus); + pr_info(" dma1FirstErrorStatus: %016llx\n", data->dma1FirstErrorStatus); + pr_info(" dma1ErrorLog0: %016llx\n", data->dma1ErrorLog0); + pr_info(" dma1ErrorLog1: %016llx\n", data->dma1ErrorLog1); + + for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) { + if ((data->pestA[i] >> 63) == 0 && + (data->pestB[i] >> 63) == 0) + continue; + + pr_info(" PE[%3d] PESTA: %016llx\n", i, data->pestA[i]); + pr_info(" PESTB: %016llx\n", data->pestB[i]); + } +} + +static void ioda_eeh_phb_diag(struct pci_controller *hose) +{ + struct pnv_phb *phb = hose->private_data; + struct OpalIoPhbErrorCommon *common; + long rc; + + common = (struct OpalIoPhbErrorCommon *)phb->diag.blob; + rc = opal_pci_get_phb_diag_data2(phb->opal_id, common, PAGE_SIZE); + if (rc != OPAL_SUCCESS) { + pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n", + __func__, hose->global_number, rc); + return; + } + + switch (common->ioType) { + case OPAL_PHB_ERROR_DATA_TYPE_P7IOC: + ioda_eeh_p7ioc_phb_diag(hose, common); + break; + default: + pr_warning("%s: Unrecognized I/O chip %d\n", + __func__, common->ioType); + } +} + +static int ioda_eeh_get_phb_pe(struct pci_controller *hose, + struct eeh_pe **pe) +{ + struct eeh_pe *phb_pe; + + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe) { + pr_warning("%s Can't find PE for PHB#%d\n", + __func__, hose->global_number); + return -EEXIST; + } + + *pe = phb_pe; + return 0; +} + +static int ioda_eeh_get_pe(struct pci_controller *hose, + u16 pe_no, struct eeh_pe **pe) +{ + struct eeh_pe *phb_pe, *dev_pe; + struct eeh_dev dev; + + /* Find the PHB PE */ + if (ioda_eeh_get_phb_pe(hose, &phb_pe)) + return -EEXIST; + + /* Find the PE according to PE# */ + memset(&dev, 0, sizeof(struct eeh_dev)); + dev.phb = hose; + dev.pe_config_addr = pe_no; + dev_pe = eeh_pe_get(&dev); + if (!dev_pe) { + pr_warning("%s: Can't find PE for PHB#%x - PE#%x\n", + __func__, hose->global_number, pe_no); + return -EEXIST; + } + + *pe = dev_pe; + return 0; +} + +/** + * ioda_eeh_next_error - Retrieve next error for EEH core to handle + * @pe: The affected PE + * + * The function is expected to be called by EEH core while it gets + * special EEH event (without binding PE). The function calls to + * OPAL APIs for next error to handle. The informational error is + * handled internally by platform. However, the dead IOC, dead PHB, + * fenced PHB and frozen PE should be handled by EEH core eventually. + */ +static int ioda_eeh_next_error(struct eeh_pe **pe) +{ + struct pci_controller *hose, *tmp; + struct pnv_phb *phb; + u64 frozen_pe_no; + u16 err_type, severity; + long rc; + int ret = 1; + + /* While running here, it's safe to purge the event queue */ + eeh_remove_event(NULL); + + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + /* + * If the subordinate PCI buses of the PHB has been + * removed, we needn't take care of it any more. + */ + phb = hose->private_data; + if (phb->removed) + continue; + + rc = opal_pci_next_error(phb->opal_id, + &frozen_pe_no, &err_type, &severity); + + /* If OPAL API returns error, we needn't proceed */ + if (rc != OPAL_SUCCESS) { + IODA_EEH_DBG("%s: Invalid return value on " + "PHB#%x (0x%lx) from opal_pci_next_error", + __func__, hose->global_number, rc); + continue; + } + + /* If the PHB doesn't have error, stop processing */ + if (err_type == OPAL_EEH_NO_ERROR || + severity == OPAL_EEH_SEV_NO_ERROR) { + IODA_EEH_DBG("%s: No error found on PHB#%x\n", + __func__, hose->global_number); + continue; + } + + /* + * Processing the error. We're expecting the error with + * highest priority reported upon multiple errors on the + * specific PHB. + */ + IODA_EEH_DBG("%s: Error (%d, %d, %d) on PHB#%x\n", + err_type, severity, pe_no, hose->global_number); + switch (err_type) { + case OPAL_EEH_IOC_ERROR: + if (severity == OPAL_EEH_SEV_IOC_DEAD) { + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) { + phb = hose->private_data; + phb->removed = 1; + } + + WARN(1, "EEH: dead IOC detected\n"); + ret = 4; + goto out; + } else if (severity == OPAL_EEH_SEV_INF) + ioda_eeh_hub_diag(hose); + + break; + case OPAL_EEH_PHB_ERROR: + if (severity == OPAL_EEH_SEV_PHB_DEAD) { + if (ioda_eeh_get_phb_pe(hose, pe)) + break; + + WARN(1, "EEH: dead PHB#%x detected\n", + hose->global_number); + phb->removed = 1; + ret = 3; + goto out; + } else if (severity == OPAL_EEH_SEV_PHB_FENCED) { + if (ioda_eeh_get_phb_pe(hose, pe)) + break; + + WARN(1, "EEH: fenced PHB#%x detected\n", + hose->global_number); + ret = 2; + goto out; + } else if (severity == OPAL_EEH_SEV_INF) + ioda_eeh_phb_diag(hose); + + break; + case OPAL_EEH_PE_ERROR: + if (ioda_eeh_get_pe(hose, frozen_pe_no, pe)) + break; + + WARN(1, "EEH: Frozen PE#%x on PHB#%x detected\n", + (*pe)->addr, (*pe)->phb->global_number); + ret = 1; + goto out; + } + } + + ret = 0; +out: + return ret; +} + struct pnv_eeh_ops ioda_eeh_ops = { .post_init = ioda_eeh_post_init, .set_option = ioda_eeh_set_option, @@ -505,5 +835,5 @@ struct pnv_eeh_ops ioda_eeh_ops = { .reset = ioda_eeh_reset, .get_log = ioda_eeh_get_log, .configure_bridge = ioda_eeh_configure_bridge, - .next_error = NULL + .next_error = ioda_eeh_next_error }; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 336c9dc1b314..3656a2409e9a 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -93,6 +93,7 @@ struct pnv_phb { #ifdef CONFIG_EEH struct pnv_eeh_ops *eeh_ops; int eeh_enabled; + int removed; #endif #ifdef CONFIG_PCI_MSI From 29310e5e860697955b9c10ddb448d61267fab0dc Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:13 +0800 Subject: [PATCH 067/156] powerpc/eeh: PowerNV EEH backends The patch adds EEH backends for PowerNV platform. It's notable that part of those EEH backends call to the I/O chip dependent backends. [Removed pointless change to eeh_pseries.c -- BenH] Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/Makefile | 2 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 419 +++++++++++++++++++ 2 files changed, 420 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/platforms/powernv/eeh-powernv.c diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 09bd0cbb7c4f..7fe595152478 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -3,4 +3,4 @@ obj-y += opal-rtc.o opal-nvram.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o -obj-$(CONFIG_EEH) += eeh-ioda.o +obj-$(CONFIG_EEH) += eeh-ioda.o eeh-powernv.o diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c new file mode 100644 index 000000000000..9559115424d6 --- /dev/null +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -0,0 +1,419 @@ +/* + * The file intends to implement the platform dependent EEH operations on + * powernv platform. Actually, the powernv was created in order to fully + * hypervisor support. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "powernv.h" +#include "pci.h" + +/** + * powernv_eeh_init - EEH platform dependent initialization + * + * EEH platform dependent initialization on powernv + */ +static int powernv_eeh_init(void) +{ + /* We require OPALv3 */ + if (!firmware_has_feature(FW_FEATURE_OPALv3)) { + pr_warning("%s: OPALv3 is required !\n", __func__); + return -EINVAL; + } + + /* Set EEH probe mode */ + eeh_probe_mode_set(EEH_PROBE_MODE_DEV); + + return 0; +} + +/** + * powernv_eeh_post_init - EEH platform dependent post initialization + * + * EEH platform dependent post initialization on powernv. When + * the function is called, the EEH PEs and devices should have + * been built. If the I/O cache staff has been built, EEH is + * ready to supply service. + */ +static int powernv_eeh_post_init(void) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + int ret = 0; + + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + + if (phb->eeh_ops && phb->eeh_ops->post_init) { + ret = phb->eeh_ops->post_init(hose); + if (ret) + break; + } + } + + return ret; +} + +/** + * powernv_eeh_dev_probe - Do probe on PCI device + * @dev: PCI device + * @flag: unused + * + * When EEH module is installed during system boot, all PCI devices + * are checked one by one to see if it supports EEH. The function + * is introduced for the purpose. By default, EEH has been enabled + * on all PCI devices. That's to say, we only need do necessary + * initialization on the corresponding eeh device and create PE + * accordingly. + * + * It's notable that's unsafe to retrieve the EEH device through + * the corresponding PCI device. During the PCI device hotplug, which + * was possiblly triggered by EEH core, the binding between EEH device + * and the PCI device isn't built yet. + */ +static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct device_node *dn = pci_device_to_OF_node(dev); + struct eeh_dev *edev = of_node_to_eeh_dev(dn); + + /* + * When probing the root bridge, which doesn't have any + * subordinate PCI devices. We don't have OF node for + * the root bridge. So it's not reasonable to continue + * the probing. + */ + if (!dn || !edev) + return 0; + + /* Skip for PCI-ISA bridge */ + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_ISA) + return 0; + + /* Initialize eeh device */ + edev->class_code = dev->class; + edev->mode = 0; + edev->config_addr = ((dev->bus->number << 8) | dev->devfn); + edev->pe_config_addr = phb->bdfn_to_pe(phb, dev->bus, dev->devfn & 0xff); + + /* Create PE */ + eeh_add_to_parent_pe(edev); + + /* + * Enable EEH explicitly so that we will do EEH check + * while accessing I/O stuff + * + * FIXME: Enable that for PHB3 later + */ + if (phb->type == PNV_PHB_IODA1) + eeh_subsystem_enabled = 1; + + /* Save memory bars */ + eeh_save_bars(edev); + + return 0; +} + +/** + * powernv_eeh_set_option - Initialize EEH or MMIO/DMA reenable + * @pe: EEH PE + * @option: operation to be issued + * + * The function is used to control the EEH functionality globally. + * Currently, following options are support according to PAPR: + * Enable EEH, Disable EEH, Enable MMIO and Enable DMA + */ +static int powernv_eeh_set_option(struct eeh_pe *pe, int option) +{ + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + int ret = -EEXIST; + + /* + * What we need do is pass it down for hardware + * implementation to handle it. + */ + if (phb->eeh_ops && phb->eeh_ops->set_option) + ret = phb->eeh_ops->set_option(pe, option); + + return ret; +} + +/** + * powernv_eeh_get_pe_addr - Retrieve PE address + * @pe: EEH PE + * + * Retrieve the PE address according to the given tranditional + * PCI BDF (Bus/Device/Function) address. + */ +static int powernv_eeh_get_pe_addr(struct eeh_pe *pe) +{ + return pe->addr; +} + +/** + * powernv_eeh_get_state - Retrieve PE state + * @pe: EEH PE + * @delay: delay while PE state is temporarily unavailable + * + * Retrieve the state of the specified PE. For IODA-compitable + * platform, it should be retrieved from IODA table. Therefore, + * we prefer passing down to hardware implementation to handle + * it. + */ +static int powernv_eeh_get_state(struct eeh_pe *pe, int *delay) +{ + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + int ret = EEH_STATE_NOT_SUPPORT; + + if (phb->eeh_ops && phb->eeh_ops->get_state) { + ret = phb->eeh_ops->get_state(pe); + + /* + * If the PE state is temporarily unavailable, + * to inform the EEH core delay for default + * period (1 second) + */ + if (delay) { + *delay = 0; + if (ret & EEH_STATE_UNAVAILABLE) + *delay = 1000; + } + } + + return ret; +} + +/** + * powernv_eeh_reset - Reset the specified PE + * @pe: EEH PE + * @option: reset option + * + * Reset the specified PE + */ +static int powernv_eeh_reset(struct eeh_pe *pe, int option) +{ + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + int ret = -EEXIST; + + if (phb->eeh_ops && phb->eeh_ops->reset) + ret = phb->eeh_ops->reset(pe, option); + + return ret; +} + +/** + * powernv_eeh_wait_state - Wait for PE state + * @pe: EEH PE + * @max_wait: maximal period in microsecond + * + * Wait for the state of associated PE. It might take some time + * to retrieve the PE's state. + */ +static int powernv_eeh_wait_state(struct eeh_pe *pe, int max_wait) +{ + int ret; + int mwait; + + while (1) { + ret = powernv_eeh_get_state(pe, &mwait); + + /* + * If the PE's state is temporarily unavailable, + * we have to wait for the specified time. Otherwise, + * the PE's state will be returned immediately. + */ + if (ret != EEH_STATE_UNAVAILABLE) + return ret; + + max_wait -= mwait; + if (max_wait <= 0) { + pr_warning("%s: Timeout getting PE#%x's state (%d)\n", + __func__, pe->addr, max_wait); + return EEH_STATE_NOT_SUPPORT; + } + + msleep(mwait); + } + + return EEH_STATE_NOT_SUPPORT; +} + +/** + * powernv_eeh_get_log - Retrieve error log + * @pe: EEH PE + * @severity: temporary or permanent error log + * @drv_log: driver log to be combined with retrieved error log + * @len: length of driver log + * + * Retrieve the temporary or permanent error from the PE. + */ +static int powernv_eeh_get_log(struct eeh_pe *pe, int severity, + char *drv_log, unsigned long len) +{ + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + int ret = -EEXIST; + + if (phb->eeh_ops && phb->eeh_ops->get_log) + ret = phb->eeh_ops->get_log(pe, severity, drv_log, len); + + return ret; +} + +/** + * powernv_eeh_configure_bridge - Configure PCI bridges in the indicated PE + * @pe: EEH PE + * + * The function will be called to reconfigure the bridges included + * in the specified PE so that the mulfunctional PE would be recovered + * again. + */ +static int powernv_eeh_configure_bridge(struct eeh_pe *pe) +{ + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + int ret = 0; + + if (phb->eeh_ops && phb->eeh_ops->configure_bridge) + ret = phb->eeh_ops->configure_bridge(pe); + + return ret; +} + +/** + * powernv_eeh_read_config - Read PCI config space + * @dn: device node + * @where: PCI address + * @size: size to read + * @val: return value + * + * Read config space from the speicifed device + */ +static int powernv_eeh_read_config(struct device_node *dn, int where, + int size, u32 *val) +{ + struct eeh_dev *edev = of_node_to_eeh_dev(dn); + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + struct pci_controller *hose = edev->phb; + + return hose->ops->read(dev->bus, dev->devfn, where, size, val); +} + +/** + * powernv_eeh_write_config - Write PCI config space + * @dn: device node + * @where: PCI address + * @size: size to write + * @val: value to be written + * + * Write config space to the specified device + */ +static int powernv_eeh_write_config(struct device_node *dn, int where, + int size, u32 val) +{ + struct eeh_dev *edev = of_node_to_eeh_dev(dn); + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + struct pci_controller *hose = edev->phb; + + hose = pci_bus_to_host(dev->bus); + + return hose->ops->write(dev->bus, dev->devfn, where, size, val); +} + +/** + * powernv_eeh_next_error - Retrieve next EEH error to handle + * @pe: Affected PE + * + * Using OPAL API, to retrieve next EEH error for EEH core to handle + */ +static int powernv_eeh_next_error(struct eeh_pe **pe) +{ + struct pci_controller *hose; + struct pnv_phb *phb = NULL; + + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + break; + } + + if (phb && phb->eeh_ops->next_error) + return phb->eeh_ops->next_error(pe); + + return -EEXIST; +} + +static struct eeh_ops powernv_eeh_ops = { + .name = "powernv", + .init = powernv_eeh_init, + .post_init = powernv_eeh_post_init, + .of_probe = NULL, + .dev_probe = powernv_eeh_dev_probe, + .set_option = powernv_eeh_set_option, + .get_pe_addr = powernv_eeh_get_pe_addr, + .get_state = powernv_eeh_get_state, + .reset = powernv_eeh_reset, + .wait_state = powernv_eeh_wait_state, + .get_log = powernv_eeh_get_log, + .configure_bridge = powernv_eeh_configure_bridge, + .read_config = powernv_eeh_read_config, + .write_config = powernv_eeh_write_config, + .next_error = powernv_eeh_next_error +}; + +/** + * eeh_powernv_init - Register platform dependent EEH operations + * + * EEH initialization on powernv platform. This function should be + * called before any EEH related functions. + */ +static int __init eeh_powernv_init(void) +{ + int ret = -EINVAL; + + if (!machine_is(powernv)) + return ret; + + ret = eeh_ops_register(&powernv_eeh_ops); + if (!ret) + pr_info("EEH: PowerNV platform initialized\n"); + else + pr_info("EEH: Failed to initialize PowerNV platform (%d)\n", ret); + + return ret; +} + +early_initcall(eeh_powernv_init); From e9cc17d4ded2d10b332c7f3788d7817f7d9d01ef Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:14 +0800 Subject: [PATCH 068/156] powerpc/eeh: Initialization for PowerNV The patch initializes EEH for PowerNV platform. Because the OPAL APIs requires HUB ID, we need trace that through struct pnv_phb. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/pci-ioda.c | 16 +++++++++++++--- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 6 ++++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 2931d97baa56..319ed61d674a 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -974,6 +974,11 @@ static void pnv_pci_ioda_fixup(void) pnv_pci_ioda_setup_PEs(); pnv_pci_ioda_setup_seg(); pnv_pci_ioda_setup_DMA(); + +#ifdef CONFIG_EEH + eeh_addr_cache_build(); + eeh_init(); +#endif } /* @@ -1050,7 +1055,8 @@ static void pnv_pci_ioda_shutdown(struct pnv_phb *phb) OPAL_ASSERT_RESET); } -void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type) +void __init pnv_pci_init_ioda_phb(struct device_node *np, + u64 hub_id, int ioda_type) { struct pci_controller *hose; static int primary = 1; @@ -1088,6 +1094,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type) hose->first_busno = 0; hose->last_busno = 0xff; hose->private_data = phb; + phb->hub_id = hub_id; phb->opal_id = phb_id; phb->type = ioda_type; @@ -1173,6 +1180,9 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type) phb->ioda.io_size, phb->ioda.io_segsize); phb->hose->ops = &pnv_pci_ops; +#ifdef CONFIG_EEH + phb->eeh_ops = &ioda_eeh_ops; +#endif /* Setup RID -> PE mapping function */ phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe; @@ -1213,7 +1223,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type) void pnv_pci_init_ioda2_phb(struct device_node *np) { - pnv_pci_init_ioda_phb(np, PNV_PHB_IODA2); + pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2); } void __init pnv_pci_init_ioda_hub(struct device_node *np) @@ -1236,6 +1246,6 @@ void __init pnv_pci_init_ioda_hub(struct device_node *np) for_each_child_of_node(np, phbn) { /* Look for IODA1 PHBs */ if (of_device_is_compatible(phbn, "ibm,ioda-phb")) - pnv_pci_init_ioda_phb(phbn, PNV_PHB_IODA1); + pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1); } } diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index 5d378f2d9e26..b68db6325c1b 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -95,7 +95,7 @@ static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb, set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table); } -static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, +static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, void *tce_mem, u64 tce_size) { struct pnv_phb *phb; @@ -136,6 +136,7 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, phb->hose->first_busno = 0; phb->hose->last_busno = 0xff; phb->hose->private_data = phb; + phb->hub_id = hub_id; phb->opal_id = phb_id; phb->type = PNV_PHB_P5IOC2; phb->model = PNV_PHB_MODEL_P5IOC2; @@ -229,7 +230,8 @@ void __init pnv_pci_init_p5ioc2_hub(struct device_node *np) for_each_child_of_node(np, phbn) { if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") || of_device_is_compatible(phbn, "ibm,p5ioc2-pciex")) { - pnv_pci_init_p5ioc2_phb(phbn, tce_mem, tce_per_phb); + pnv_pci_init_p5ioc2_phb(phbn, hub_id, + tce_mem, tce_per_phb); tce_mem += tce_per_phb; } } From be7e744607175fb1620f0390d20c880e16de163b Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:15 +0800 Subject: [PATCH 069/156] powerpc/eeh: Enable EEH check for config access The patch enables EEH check and let EEH core to process the EEH errors for PowerNV platform while accessing config space. Originally, the implementation already had mechanism to check EEH errors and tried to recover from them. However, we never let EEH core to handle the EEH errors. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/pci.c | 40 +++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 5edceb7f746d..577cbeadb0ea 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include "powernv.h" #include "pci.h" @@ -260,6 +262,10 @@ static int pnv_pci_read_config(struct pci_bus *bus, { struct pci_controller *hose = pci_bus_to_host(bus); struct pnv_phb *phb = hose->private_data; +#ifdef CONFIG_EEH + struct device_node *busdn, *dn; + struct eeh_pe *phb_pe = NULL; +#endif u32 bdfn = (((uint64_t)bus->number) << 8) | devfn; s64 rc; @@ -292,8 +298,34 @@ static int pnv_pci_read_config(struct pci_bus *bus, cfg_dbg("pnv_pci_read_config bus: %x devfn: %x +%x/%x -> %08x\n", bus->number, devfn, where, size, *val); - /* Check if the PHB got frozen due to an error (no response) */ + /* + * Check if the specified PE has been put into frozen + * state. On the other hand, we needn't do that while + * the PHB has been put into frozen state because of + * PHB-fatal errors. + */ +#ifdef CONFIG_EEH + phb_pe = eeh_phb_pe_get(hose); + if (phb_pe && (phb_pe->state & EEH_PE_ISOLATED)) + return PCIBIOS_SUCCESSFUL; + + if (phb->eeh_enabled) { + if (*val == EEH_IO_ERROR_VALUE(size)) { + busdn = pci_bus_to_OF_node(bus); + for (dn = busdn->child; dn; dn = dn->sibling) { + struct pci_dn *pdn = PCI_DN(dn); + + if (pdn && pdn->devfn == devfn && + eeh_dev_check_failure(of_node_to_eeh_dev(dn))) + return PCIBIOS_DEVICE_NOT_FOUND; + } + } + } else { + pnv_pci_config_check_eeh(phb, bus, bdfn); + } +#else pnv_pci_config_check_eeh(phb, bus, bdfn); +#endif return PCIBIOS_SUCCESSFUL; } @@ -324,8 +356,14 @@ static int pnv_pci_write_config(struct pci_bus *bus, default: return PCIBIOS_FUNC_NOT_SUPPORTED; } + /* Check if the PHB got frozen due to an error (no response) */ +#ifdef CONFIG_EEH + if (!phb->eeh_enabled) + pnv_pci_config_check_eeh(phb, bus, bdfn); +#else pnv_pci_config_check_eeh(phb, bus, bdfn); +#endif return PCIBIOS_SUCCESSFUL; } From b95cd2cd44b39cf11087b15f74e29ef9f2c6bf0f Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 13:21:16 +0800 Subject: [PATCH 070/156] powerpc/eeh: Allow to check fenced PHB proactively It's meaningless to handle frozen PE if we already had fenced PHB. The patch intends to check the PHB state before checking PE. If the PHB has been put into fenced state, we need take care of that firstly. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/eeh.c | 60 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 81cd0311dee8..7c567be3dd03 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -269,6 +269,58 @@ static inline unsigned long eeh_token_to_phys(unsigned long token) return pa | (token & (PAGE_SIZE-1)); } +/* + * On PowerNV platform, we might already have fenced PHB there. + * For that case, it's meaningless to recover frozen PE. Intead, + * We have to handle fenced PHB firstly. + */ +static int eeh_phb_check_failure(struct eeh_pe *pe) +{ + struct eeh_pe *phb_pe; + unsigned long flags; + int ret; + + if (!eeh_probe_mode_dev()) + return -EPERM; + + /* Find the PHB PE */ + phb_pe = eeh_phb_pe_get(pe->phb); + if (!phb_pe) { + pr_warning("%s Can't find PE for PHB#%d\n", + __func__, pe->phb->global_number); + return -EEXIST; + } + + /* If the PHB has been in problematic state */ + eeh_serialize_lock(&flags); + if (phb_pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD)) { + ret = 0; + goto out; + } + + /* Check PHB state */ + ret = eeh_ops->get_state(phb_pe, NULL); + if ((ret < 0) || + (ret == EEH_STATE_NOT_SUPPORT) || + (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) == + (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) { + ret = 0; + goto out; + } + + /* Isolate the PHB and send event */ + eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); + eeh_serialize_unlock(flags); + eeh_send_failure_event(phb_pe); + + WARN(1, "EEH: PHB failure detected\n"); + + return 1; +out: + eeh_serialize_unlock(flags); + return ret; +} + /** * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze * @edev: eeh device @@ -319,6 +371,14 @@ int eeh_dev_check_failure(struct eeh_dev *edev) return 0; } + /* + * On PowerNV platform, we might already have fenced PHB + * there and we need take care of that firstly. + */ + ret = eeh_phb_check_failure(pe); + if (ret > 0) + return ret; + /* If we already have a pending isolation event for this * slot, we know it's bad already, we don't need to check. * Do this checking under a lock; as multiple PCI devices From 1bc98de26d3f270e004421685a8e698e91cf95ca Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 18:13:22 +0800 Subject: [PATCH 071/156] powernv/opal: Notifier for OPAL events This patch implements a notifier to receive a notification on OPAL event mask changes. The notifier is only called as a result of an OPAL interrupt, which will happen upon reception of FSP messages or PCI errors. Any event mask change detected as a result of opal_poll_events() will not result in a notifier call. [benh: changelog] Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/opal.h | 5 ++ arch/powerpc/platforms/powernv/opal.c | 69 ++++++++++++++++++++++++++- 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 28807978bb46..029fe85722aa 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -644,6 +644,11 @@ extern void hvc_opal_init_early(void); extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data); +extern int opal_notifier_register(struct notifier_block *nb); +extern void opal_notifier_enable(void); +extern void opal_notifier_disable(void); +extern void opal_notifier_update_evt(uint64_t evt_mask, uint64_t evt_val); + extern int opal_get_chars(uint32_t vtermno, char *buf, int count); extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 628c564ceadb..106301fd2fa5 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,10 @@ static DEFINE_SPINLOCK(opal_write_lock); extern u64 opal_mc_secondary_handler[]; static unsigned int *opal_irqs; static unsigned int opal_irq_count; +static ATOMIC_NOTIFIER_HEAD(opal_notifier_head); +static DEFINE_SPINLOCK(opal_notifier_lock); +static uint64_t last_notified_mask = 0x0ul; +static atomic_t opal_notifier_hold = ATOMIC_INIT(0); int __init early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data) @@ -95,6 +100,68 @@ static int __init opal_register_exception_handlers(void) early_initcall(opal_register_exception_handlers); +int opal_notifier_register(struct notifier_block *nb) +{ + if (!nb) { + pr_warning("%s: Invalid argument (%p)\n", + __func__, nb); + return -EINVAL; + } + + atomic_notifier_chain_register(&opal_notifier_head, nb); + return 0; +} + +static void opal_do_notifier(uint64_t events) +{ + unsigned long flags; + uint64_t changed_mask; + + if (atomic_read(&opal_notifier_hold)) + return; + + spin_lock_irqsave(&opal_notifier_lock, flags); + changed_mask = last_notified_mask ^ events; + last_notified_mask = events; + spin_unlock_irqrestore(&opal_notifier_lock, flags); + + /* + * We feed with the event bits and changed bits for + * enough information to the callback. + */ + atomic_notifier_call_chain(&opal_notifier_head, + events, (void *)changed_mask); +} + +void opal_notifier_update_evt(uint64_t evt_mask, + uint64_t evt_val) +{ + unsigned long flags; + + spin_lock_irqsave(&opal_notifier_lock, flags); + last_notified_mask &= ~evt_mask; + last_notified_mask |= evt_val; + spin_unlock_irqrestore(&opal_notifier_lock, flags); +} + +void opal_notifier_enable(void) +{ + int64_t rc; + uint64_t evt = 0; + + atomic_set(&opal_notifier_hold, 0); + + /* Process pending events */ + rc = opal_poll_events(&evt); + if (rc == OPAL_SUCCESS && evt) + opal_do_notifier(evt); +} + +void opal_notifier_disable(void) +{ + atomic_set(&opal_notifier_hold, 1); +} + int opal_get_chars(uint32_t vtermno, char *buf, int count) { s64 len, rc; @@ -297,7 +364,7 @@ static irqreturn_t opal_interrupt(int irq, void *data) opal_handle_interrupt(virq_to_hw(irq), &events); - /* XXX TODO: Do something with the events */ + opal_do_notifier(events); return IRQ_HANDLED; } From e8e71fa426d72914c500e98afa2076628076f511 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 18:13:23 +0800 Subject: [PATCH 072/156] powernv/opal: Disable OPAL notifier upon poweroff While we're restarting or powering off the system, we needn't the OPAL notifier any more. So just to disable that. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/setup.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index d4459bfc92f7..84438af96c05 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -93,6 +93,8 @@ static void __noreturn pnv_restart(char *cmd) { long rc = OPAL_BUSY; + opal_notifier_disable(); + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { rc = opal_cec_reboot(); if (rc == OPAL_BUSY_EVENT) @@ -108,6 +110,8 @@ static void __noreturn pnv_power_off(void) { long rc = OPAL_BUSY; + opal_notifier_disable(); + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { rc = opal_cec_power_down(0); if (rc == OPAL_BUSY_EVENT) From 7cb9d93dc6d4f717218b6fa791be9bcf4e417379 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 18:13:24 +0800 Subject: [PATCH 073/156] powerpc/eeh: Register OPAL notifier for PCI error The patch registers OPAL event notifier and process the PCI errors from firmware. If we have pending PCI errors, special EEH event (without binding PE) will be sent to EEH core for processing. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/eeh-ioda.c | 41 ++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index a3eebd193dfe..2b7689ed5d18 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -42,6 +43,26 @@ #endif static char *hub_diag = NULL; +static int ioda_eeh_nb_init = 0; + +static int ioda_eeh_event(struct notifier_block *nb, + unsigned long events, void *change) +{ + uint64_t changed_evts = (uint64_t)change; + + /* We simply send special EEH event */ + if ((changed_evts & OPAL_EVENT_PCI_ERROR) && + (events & OPAL_EVENT_PCI_ERROR)) + eeh_send_failure_event(NULL); + + return 0; +} + +static struct notifier_block ioda_eeh_nb = { + .notifier_call = ioda_eeh_event, + .next = NULL, + .priority = 0 +}; /** * ioda_eeh_post_init - Chip dependent post initialization @@ -54,6 +75,19 @@ static char *hub_diag = NULL; static int ioda_eeh_post_init(struct pci_controller *hose) { struct pnv_phb *phb = hose->private_data; + int ret; + + /* Register OPAL event notifier */ + if (!ioda_eeh_nb_init) { + ret = opal_notifier_register(&ioda_eeh_nb); + if (ret) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, ret); + return ret; + } + + ioda_eeh_nb_init = 1; + } /* FIXME: Enable it for PHB3 later */ if (phb->type == PNV_PHB_IODA1) { @@ -736,8 +770,13 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) long rc; int ret = 1; - /* While running here, it's safe to purge the event queue */ + /* + * While running here, it's safe to purge the event queue. + * And we should keep the cached OPAL notifier event sychronized + * between the kernel and firmware. + */ eeh_remove_event(NULL); + opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { /* From 37c367f2792f899528b5bf201a4bd6131f8b75b6 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 18:13:25 +0800 Subject: [PATCH 074/156] powerpc/powernv: Debugfs directory for PHB The patch creates one debugfs directory ("powerpc/PCIxxxx") for each PHB so that we can hook EEH error injection debugfs entry there in proceeding patch. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/pci-ioda.c | 23 +++++++++++++++++++++++ arch/powerpc/platforms/powernv/pci.h | 4 ++++ 2 files changed, 27 insertions(+) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 319ed61d674a..dc4ec7956967 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include #include "powernv.h" #include "pci.h" @@ -969,12 +971,33 @@ static void pnv_pci_ioda_setup_DMA(void) } } +static void pnv_pci_ioda_create_dbgfs(void) +{ +#ifdef CONFIG_DEBUG_FS + struct pci_controller *hose, *tmp; + struct pnv_phb *phb; + char name[16]; + + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + phb = hose->private_data; + + sprintf(name, "PCI%04x", hose->global_number); + phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root); + if (!phb->dbgfs) + pr_warning("%s: Error on creating debugfs on PHB#%x\n", + __func__, hose->global_number); + } +#endif /* CONFIG_DEBUG_FS */ +} + static void pnv_pci_ioda_fixup(void) { pnv_pci_ioda_setup_PEs(); pnv_pci_ioda_setup_seg(); pnv_pci_ioda_setup_DMA(); + pnv_pci_ioda_create_dbgfs(); + #ifdef CONFIG_EEH eeh_addr_cache_build(); eeh_init(); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 3656a2409e9a..43906e3ab26f 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -96,6 +96,10 @@ struct pnv_phb { int removed; #endif +#ifdef CONFIG_DEBUG_FS + struct dentry *dbgfs; +#endif + #ifdef CONFIG_PCI_MSI unsigned int msi_base; unsigned int msi32_support; From 8998897b8f966c9036307b5130494d4bf1e6cb18 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Jun 2013 18:13:26 +0800 Subject: [PATCH 075/156] powerpc/eeh: Debugfs for error injection The patch creates debugfs entries (powerpc/PCIxxxx/err_injct) for injecting EEH errors for testing purpose. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/eeh-ioda.c | 31 +++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index 2b7689ed5d18..84f3036511cb 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -64,6 +65,29 @@ static struct notifier_block ioda_eeh_nb = { .priority = 0 }; +#ifdef CONFIG_DEBUG_FS +static int ioda_eeh_dbgfs_set(void *data, u64 val) +{ + struct pci_controller *hose = data; + struct pnv_phb *phb = hose->private_data; + + out_be64(phb->regs + 0xD10, val); + return 0; +} + +static int ioda_eeh_dbgfs_get(void *data, u64 *val) +{ + struct pci_controller *hose = data; + struct pnv_phb *phb = hose->private_data; + + *val = in_be64(phb->regs + 0xD10); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_dbgfs_ops, ioda_eeh_dbgfs_get, + ioda_eeh_dbgfs_set, "0x%llx\n"); +#endif /* CONFIG_DEBUG_FS */ + /** * ioda_eeh_post_init - Chip dependent post initialization * @hose: PCI controller @@ -101,6 +125,13 @@ static int ioda_eeh_post_init(struct pci_controller *hose) } } +#ifdef CONFIG_DEBUG_FS + if (phb->dbgfs) + debugfs_create_file("err_injct", 0600, + phb->dbgfs, hose, + &ioda_eeh_dbgfs_ops); +#endif + phb->eeh_enabled = 1; } From db3d8534903c8a9617142975bec6db95acaba753 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:13 +0530 Subject: [PATCH 076/156] powerpc/mm: handle hugepage size correctly when invalidating hpte entries If a hash bucket gets full, we "evict" a more/less random entry from it. When we do that we don't invalidate the TLB (hpte_remove) because we assume the old translation is still technically "valid". This implies that when we are invalidating or updating pte, even if HPTE entry is not valid we should do a tlb invalidate. With hugepages, we need to pass the correct actual page size value for tlb invalidation. This change update the patch 0608d692463598c1d6e826d9dd7283381b4f246c "powerpc/mm: Always invalidate tlb on hpte invalidate and update" to handle transparent hugepages correctly. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/machdep.h | 8 +- arch/powerpc/kvm/book3s_64_mmu_host.c | 2 +- arch/powerpc/mm/hash_low_64.S | 21 ++-- arch/powerpc/mm/hash_native_64.c | 122 +++++++++--------------- arch/powerpc/mm/hash_utils_64.c | 9 +- arch/powerpc/mm/hugetlbpage-hash64.c | 2 +- arch/powerpc/platforms/cell/beat_htab.c | 16 ++-- arch/powerpc/platforms/ps3/htab.c | 5 +- arch/powerpc/platforms/pseries/lpar.c | 17 +++- 9 files changed, 95 insertions(+), 107 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 92386fc4e82a..801e3c61395f 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -36,13 +36,13 @@ struct machdep_calls { #ifdef CONFIG_PPC64 void (*hpte_invalidate)(unsigned long slot, unsigned long vpn, - int psize, int ssize, - int local); + int bpsize, int apsize, + int ssize, int local); long (*hpte_updatepp)(unsigned long slot, unsigned long newpp, unsigned long vpn, - int psize, int ssize, - int local); + int bpsize, int apsize, + int ssize, int local); void (*hpte_updateboltedpp)(unsigned long newpp, unsigned long ea, int psize, int ssize); diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index 3a9a1aceb14f..176d3fd53b73 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -34,7 +34,7 @@ void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { ppc_md.hpte_invalidate(pte->slot, pte->host_vpn, - MMU_PAGE_4K, MMU_SEGSIZE_256M, + MMU_PAGE_4K, MMU_PAGE_4K, MMU_SEGSIZE_256M, false); } diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S index 0e980acae67c..d3cbda62857b 100644 --- a/arch/powerpc/mm/hash_low_64.S +++ b/arch/powerpc/mm/hash_low_64.S @@ -289,9 +289,10 @@ htab_modify_pte: /* Call ppc_md.hpte_updatepp */ mr r5,r29 /* vpn */ - li r6,MMU_PAGE_4K /* page size */ - ld r7,STK_PARAM(R9)(r1) /* segment size */ - ld r8,STK_PARAM(R8)(r1) /* get "local" param */ + li r6,MMU_PAGE_4K /* base page size */ + li r7,MMU_PAGE_4K /* actual page size */ + ld r8,STK_PARAM(R9)(r1) /* segment size */ + ld r9,STK_PARAM(R8)(r1) /* get "local" param */ _GLOBAL(htab_call_hpte_updatepp) bl . /* Patched by htab_finish_init() */ @@ -649,9 +650,10 @@ htab_modify_pte: /* Call ppc_md.hpte_updatepp */ mr r5,r29 /* vpn */ - li r6,MMU_PAGE_4K /* page size */ - ld r7,STK_PARAM(R9)(r1) /* segment size */ - ld r8,STK_PARAM(R8)(r1) /* get "local" param */ + li r6,MMU_PAGE_4K /* base page size */ + li r7,MMU_PAGE_4K /* actual page size */ + ld r8,STK_PARAM(R9)(r1) /* segment size */ + ld r9,STK_PARAM(R8)(r1) /* get "local" param */ _GLOBAL(htab_call_hpte_updatepp) bl . /* patched by htab_finish_init() */ @@ -937,9 +939,10 @@ ht64_modify_pte: /* Call ppc_md.hpte_updatepp */ mr r5,r29 /* vpn */ - li r6,MMU_PAGE_64K - ld r7,STK_PARAM(R9)(r1) /* segment size */ - ld r8,STK_PARAM(R8)(r1) /* get "local" param */ + li r6,MMU_PAGE_64K /* base page size */ + li r7,MMU_PAGE_64K /* actual page size */ + ld r8,STK_PARAM(R9)(r1) /* segment size */ + ld r9,STK_PARAM(R8)(r1) /* get "local" param */ _GLOBAL(ht64_call_hpte_updatepp) bl . /* patched by htab_finish_init() */ diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 4c122c3f1623..6d152bc993e5 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -273,61 +273,15 @@ static long native_hpte_remove(unsigned long hpte_group) return i; } -static inline int __hpte_actual_psize(unsigned int lp, int psize) -{ - int i, shift; - unsigned int mask; - - /* start from 1 ignoring MMU_PAGE_4K */ - for (i = 1; i < MMU_PAGE_COUNT; i++) { - - /* invalid penc */ - if (mmu_psize_defs[psize].penc[i] == -1) - continue; - /* - * encoding bits per actual page size - * PTE LP actual page size - * rrrr rrrz >=8KB - * rrrr rrzz >=16KB - * rrrr rzzz >=32KB - * rrrr zzzz >=64KB - * ....... - */ - shift = mmu_psize_defs[i].shift - LP_SHIFT; - if (shift > LP_BITS) - shift = LP_BITS; - mask = (1 << shift) - 1; - if ((lp & mask) == mmu_psize_defs[psize].penc[i]) - return i; - } - return -1; -} - -static inline int hpte_actual_psize(struct hash_pte *hptep, int psize) -{ - /* Look at the 8 bit LP value */ - unsigned int lp = (hptep->r >> LP_SHIFT) & ((1 << LP_BITS) - 1); - - if (!(hptep->v & HPTE_V_VALID)) - return -1; - - /* First check if it is large page */ - if (!(hptep->v & HPTE_V_LARGE)) - return MMU_PAGE_4K; - - return __hpte_actual_psize(lp, psize); -} - static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long vpn, int psize, int ssize, - int local) + unsigned long vpn, int bpsize, + int apsize, int ssize, int local) { struct hash_pte *hptep = htab_address + slot; unsigned long hpte_v, want_v; int ret = 0; - int actual_psize; - want_v = hpte_encode_avpn(vpn, psize, ssize); + want_v = hpte_encode_avpn(vpn, bpsize, ssize); DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)", vpn, want_v & HPTE_V_AVPN, slot, newpp); @@ -335,7 +289,6 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, native_lock_hpte(hptep); hpte_v = hptep->v; - actual_psize = hpte_actual_psize(hptep, psize); /* * We need to invalidate the TLB always because hpte_remove doesn't do * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less @@ -343,12 +296,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, * (hpte_remove) because we assume the old translation is still * technically "valid". */ - if (actual_psize < 0) { - actual_psize = psize; - ret = -1; - goto err_out; - } - if (!HPTE_V_COMPARE(hpte_v, want_v)) { + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) { DBG_LOW(" -> miss\n"); ret = -1; } else { @@ -357,11 +305,10 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C)); } -err_out: native_unlock_hpte(hptep); /* Ensure it is out of the tlb too. */ - tlbie(vpn, psize, actual_psize, ssize, local); + tlbie(vpn, bpsize, apsize, ssize, local); return ret; } @@ -402,7 +349,6 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize) static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, int psize, int ssize) { - int actual_psize; unsigned long vpn; unsigned long vsid; long slot; @@ -415,36 +361,33 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, if (slot == -1) panic("could not find page to bolt\n"); hptep = htab_address + slot; - actual_psize = hpte_actual_psize(hptep, psize); - if (actual_psize < 0) - actual_psize = psize; /* Update the HPTE */ hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | (newpp & (HPTE_R_PP | HPTE_R_N)); - - /* Ensure it is out of the tlb too. */ - tlbie(vpn, psize, actual_psize, ssize, 0); + /* + * Ensure it is out of the tlb too. Bolted entries base and + * actual page size will be same. + */ + tlbie(vpn, psize, psize, ssize, 0); } static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, - int psize, int ssize, int local) + int bpsize, int apsize, int ssize, int local) { struct hash_pte *hptep = htab_address + slot; unsigned long hpte_v; unsigned long want_v; unsigned long flags; - int actual_psize; local_irq_save(flags); DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot); - want_v = hpte_encode_avpn(vpn, psize, ssize); + want_v = hpte_encode_avpn(vpn, bpsize, ssize); native_lock_hpte(hptep); hpte_v = hptep->v; - actual_psize = hpte_actual_psize(hptep, psize); /* * We need to invalidate the TLB always because hpte_remove doesn't do * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less @@ -452,23 +395,48 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, * (hpte_remove) because we assume the old translation is still * technically "valid". */ - if (actual_psize < 0) { - actual_psize = psize; - native_unlock_hpte(hptep); - goto err_out; - } - if (!HPTE_V_COMPARE(hpte_v, want_v)) + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); else /* Invalidate the hpte. NOTE: this also unlocks it */ hptep->v = 0; -err_out: /* Invalidate the TLB */ - tlbie(vpn, psize, actual_psize, ssize, local); + tlbie(vpn, bpsize, apsize, ssize, local); + local_irq_restore(flags); } +static inline int __hpte_actual_psize(unsigned int lp, int psize) +{ + int i, shift; + unsigned int mask; + + /* start from 1 ignoring MMU_PAGE_4K */ + for (i = 1; i < MMU_PAGE_COUNT; i++) { + + /* invalid penc */ + if (mmu_psize_defs[psize].penc[i] == -1) + continue; + /* + * encoding bits per actual page size + * PTE LP actual page size + * rrrr rrrz >=8KB + * rrrr rrzz >=16KB + * rrrr rzzz >=32KB + * rrrr zzzz >=64KB + * ....... + */ + shift = mmu_psize_defs[i].shift - LP_SHIFT; + if (shift > LP_BITS) + shift = LP_BITS; + mask = (1 << shift) - 1; + if ((lp & mask) == mmu_psize_defs[psize].penc[i]) + return i; + } + return -1; +} + static void hpte_decode(struct hash_pte *hpte, unsigned long slot, int *psize, int *apsize, int *ssize, unsigned long *vpn) { diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index e303a6d74e3a..2f470809876f 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1232,7 +1232,11 @@ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx); - ppc_md.hpte_invalidate(slot, vpn, psize, ssize, local); + /* + * We use same base page size and actual psize, because we don't + * use these functions for hugepage + */ + ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local); } pte_iterate_hashed_end(); #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -1365,7 +1369,8 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; - ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_kernel_ssize, 0); + ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize, + mmu_kernel_ssize, 0); } void kernel_map_pages(struct page *page, int numpages, int enable) diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index 0f1d94a1fb82..0b7fb6761015 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -81,7 +81,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, slot += (old_pte & _PAGE_F_GIX) >> 12; if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize, - ssize, local) == -1) + mmu_psize, ssize, local) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } diff --git a/arch/powerpc/platforms/cell/beat_htab.c b/arch/powerpc/platforms/cell/beat_htab.c index 246e1d8b3af3..c34ee4e60873 100644 --- a/arch/powerpc/platforms/cell/beat_htab.c +++ b/arch/powerpc/platforms/cell/beat_htab.c @@ -185,7 +185,8 @@ static void beat_lpar_hptab_clear(void) static long beat_lpar_hpte_updatepp(unsigned long slot, unsigned long newpp, unsigned long vpn, - int psize, int ssize, int local) + int psize, int apsize, + int ssize, int local) { unsigned long lpar_rc; u64 dummy0, dummy1; @@ -274,7 +275,8 @@ static void beat_lpar_hpte_updateboltedpp(unsigned long newpp, } static void beat_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, - int psize, int ssize, int local) + int psize, int apsize, + int ssize, int local) { unsigned long want_v; unsigned long lpar_rc; @@ -364,9 +366,10 @@ static long beat_lpar_hpte_insert_v3(unsigned long hpte_group, * already zero. For now I am paranoid. */ static long beat_lpar_hpte_updatepp_v3(unsigned long slot, - unsigned long newpp, - unsigned long vpn, - int psize, int ssize, int local) + unsigned long newpp, + unsigned long vpn, + int psize, int apsize, + int ssize, int local) { unsigned long lpar_rc; unsigned long want_v; @@ -394,7 +397,8 @@ static long beat_lpar_hpte_updatepp_v3(unsigned long slot, } static void beat_lpar_hpte_invalidate_v3(unsigned long slot, unsigned long vpn, - int psize, int ssize, int local) + int psize, int apsize, + int ssize, int local) { unsigned long want_v; unsigned long lpar_rc; diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c index 177a2f70700c..3e270e3412ae 100644 --- a/arch/powerpc/platforms/ps3/htab.c +++ b/arch/powerpc/platforms/ps3/htab.c @@ -109,7 +109,8 @@ static long ps3_hpte_remove(unsigned long hpte_group) } static long ps3_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long vpn, int psize, int ssize, int local) + unsigned long vpn, int psize, int apsize, + int ssize, int local) { int result; u64 hpte_v, want_v, hpte_rs; @@ -162,7 +163,7 @@ static void ps3_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, } static void ps3_hpte_invalidate(unsigned long slot, unsigned long vpn, - int psize, int ssize, int local) + int psize, int apsize, int ssize, int local) { unsigned long flags; int result; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 6d62072a7d5a..ca45c8f3c0ba 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -240,7 +240,8 @@ static void pSeries_lpar_hptab_clear(void) static long pSeries_lpar_hpte_updatepp(unsigned long slot, unsigned long newpp, unsigned long vpn, - int psize, int ssize, int local) + int psize, int apsize, + int ssize, int local) { unsigned long lpar_rc; unsigned long flags = (newpp & 7) | H_AVPN; @@ -328,7 +329,8 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, } static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, - int psize, int ssize, int local) + int psize, int apsize, + int ssize, int local) { unsigned long want_v; unsigned long lpar_rc; @@ -356,8 +358,10 @@ static void pSeries_lpar_hpte_removebolted(unsigned long ea, slot = pSeries_lpar_hpte_find(vpn, psize, ssize); BUG_ON(slot == -1); - - pSeries_lpar_hpte_invalidate(slot, vpn, psize, ssize, 0); + /* + * lpar doesn't use the passed actual page size + */ + pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0); } /* Flag bits for H_BULK_REMOVE */ @@ -400,8 +404,11 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { + /* + * lpar doesn't use the passed actual page size + */ pSeries_lpar_hpte_invalidate(slot, vpn, psize, - ssize, local); + 0, ssize, local); } else { param[pix] = HBR_REQUEST | HBR_AVPN | slot; param[pix+1] = hpte_encode_avpn(vpn, psize, From f940f5289873af2ad2c4e73f88c24ad2b8fe3f87 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:14 +0530 Subject: [PATCH 077/156] powerpc/THP: Double the PMD table size for THP THP code does PTE page allocation along with large page request and deposit them for later use. This is to ensure that we won't have any failures when we split hugepages to regular pages. On powerpc we want to use the deposited PTE page for storing hash pte slot and secondary bit information for the HPTEs. We use the second half of the pmd table to save the deposted PTE page. Reviewed-by: David Gibson Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pgalloc-64.h | 6 +++--- arch/powerpc/include/asm/pgtable-ppc64-64k.h | 3 ++- arch/powerpc/include/asm/pgtable-ppc64.h | 6 +++++- arch/powerpc/mm/init_64.c | 9 ++++++--- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h index b66ae722a8e9..f65e27b09bd3 100644 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ b/arch/powerpc/include/asm/pgalloc-64.h @@ -221,17 +221,17 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(PGT_CACHE(PMD_INDEX_SIZE), + return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), GFP_KERNEL|__GFP_REPEAT); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { - kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd); + kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); } #define __pmd_free_tlb(tlb, pmd, addr) \ - pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE) + pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX) #ifndef CONFIG_PPC_64K_PAGES #define __pud_free_tlb(tlb, pud, addr) \ pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/pgtable-ppc64-64k.h b/arch/powerpc/include/asm/pgtable-ppc64-64k.h index 45142d640720..a56b82fb0609 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64-64k.h +++ b/arch/powerpc/include/asm/pgtable-ppc64-64k.h @@ -33,7 +33,8 @@ #define PGDIR_MASK (~(PGDIR_SIZE-1)) /* Bits to mask out from a PMD to get to the PTE page */ -#define PMD_MASKED_BITS 0x1ff +/* PMDs point to PTE table fragments which are 4K aligned. */ +#define PMD_MASKED_BITS 0xfff /* Bits to mask out from a PGD/PUD to get to the PMD page */ #define PUD_MASKED_BITS 0x1ff diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index e3d55f6f24fe..ab843328b47f 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -20,7 +20,11 @@ PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT) #define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE) - +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define PMD_CACHE_INDEX (PMD_INDEX_SIZE + 1) +#else +#define PMD_CACHE_INDEX PMD_INDEX_SIZE +#endif /* * Define the address range of the kernel non-linear virtual area */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a90b9c458990..d0cd9e4c6837 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -88,7 +88,11 @@ static void pgd_ctor(void *addr) static void pmd_ctor(void *addr) { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + memset(addr, 0, PMD_TABLE_SIZE * 2); +#else memset(addr, 0, PMD_TABLE_SIZE); +#endif } struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE]; @@ -137,10 +141,9 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) void pgtable_cache_init(void) { pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor); - pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor); - if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE)) + pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor); + if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX)) panic("Couldn't allocate pgtable caches"); - /* In all current configs, when the PUD index exists it's the * same size as either the pgd or pmd index. Verify that the * initialization above has also created a PUD cache. This From 074c2eae3e9b66c03a17a12df8f2cd19382b68ab Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:15 +0530 Subject: [PATCH 078/156] powerpc/THP: Implement transparent hugepages for ppc64 We now have pmd entries covering 16MB range and the PMD table double its original size. We use the second half of the PMD table to deposit the pgtable (PTE page). The depoisted PTE page is further used to track the HPTE information. The information include [ secondary group | 3 bit hidx | valid ]. We use one byte per each HPTE entry. With 16MB hugepage and 64K HPTE we need 256 entries and with 4K HPTE we need 4096 entries. Both will fit in a 4K PTE page. On hugepage invalidate we need to walk the PTE page and invalidate all valid HPTEs. This patch implements necessary arch specific functions for THP support and also hugepage invalidate logic. These PMD related functions are intentionally kept similar to their PTE counter-part. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pgtable-ppc64.h | 215 ++++++++++++- arch/powerpc/include/asm/pgtable.h | 4 + arch/powerpc/include/asm/tlbflush.h | 3 +- arch/powerpc/mm/pgtable_64.c | 377 +++++++++++++++++++++++ arch/powerpc/mm/tlb_hash64.c | 27 ++ arch/powerpc/platforms/Kconfig.cputype | 1 + 6 files changed, 625 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index ab843328b47f..8f9da5e32fea 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -10,6 +10,7 @@ #else #include #endif +#include #define FIRST_USER_ADDRESS 0 @@ -154,7 +155,7 @@ #define pmd_present(pmd) (pmd_val(pmd) != 0) #define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0) #define pmd_page_vaddr(pmd) (pmd_val(pmd) & ~PMD_MASKED_BITS) -#define pmd_page(pmd) virt_to_page(pmd_page_vaddr(pmd)) +extern struct page *pmd_page(pmd_t pmd); #define pud_set(pudp, pudval) (pud_val(*(pudp)) = (pudval)) #define pud_none(pud) (!pud_val(pud)) @@ -382,4 +383,216 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, #endif /* __ASSEMBLY__ */ +/* + * THP pages can't be special. So use the _PAGE_SPECIAL + */ +#define _PAGE_SPLITTING _PAGE_SPECIAL + +/* + * We need to differentiate between explicit huge page and THP huge + * page, since THP huge page also need to track real subpage details + */ +#define _PAGE_THP_HUGE _PAGE_4K_PFN + +/* + * set of bits not changed in pmd_modify. + */ +#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | \ + _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \ + _PAGE_THP_HUGE) + +#ifndef __ASSEMBLY__ +/* + * The linux hugepage PMD now include the pmd entries followed by the address + * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits. + * [ 1 bit secondary | 3 bit hidx | 1 bit valid | 000]. We use one byte per + * each HPTE entry. With 16MB hugepage and 64K HPTE we need 256 entries and + * with 4K HPTE we need 4096 entries. Both will fit in a 4K pgtable_t. + * + * The last three bits are intentionally left to zero. This memory location + * are also used as normal page PTE pointers. So if we have any pointers + * left around while we collapse a hugepage, we need to make sure + * _PAGE_PRESENT and _PAGE_FILE bits of that are zero when we look at them + */ +static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index) +{ + return (hpte_slot_array[index] >> 3) & 0x1; +} + +static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array, + int index) +{ + return hpte_slot_array[index] >> 4; +} + +static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array, + unsigned int index, unsigned int hidx) +{ + hpte_slot_array[index] = hidx << 4 | 0x1 << 3; +} + +static inline char *get_hpte_slot_array(pmd_t *pmdp) +{ + /* + * The hpte hindex is stored in the pgtable whose address is in the + * second half of the PMD + * + * Order this load with the test for pmd_trans_huge in the caller + */ + smp_rmb(); + return *(char **)(pmdp + PTRS_PER_PMD); + + +} + +extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); +extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); +extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); +extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd); +extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd); + +static inline int pmd_trans_huge(pmd_t pmd) +{ + /* + * leaf pte for huge page, bottom two bits != 00 + */ + return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE); +} + +static inline int pmd_large(pmd_t pmd) +{ + /* + * leaf pte for huge page, bottom two bits != 00 + */ + if (pmd_trans_huge(pmd)) + return pmd_val(pmd) & _PAGE_PRESENT; + return 0; +} + +static inline int pmd_trans_splitting(pmd_t pmd) +{ + if (pmd_trans_huge(pmd)) + return pmd_val(pmd) & _PAGE_SPLITTING; + return 0; +} + +/* We will enable it in the last patch */ +#define has_transparent_hugepage() 0 +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static inline pte_t pmd_pte(pmd_t pmd) +{ + return __pte(pmd_val(pmd)); +} + +static inline pmd_t pte_pmd(pte_t pte) +{ + return __pmd(pte_val(pte)); +} + +static inline pte_t *pmdp_ptep(pmd_t *pmd) +{ + return (pte_t *)pmd; +} + +#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) +#define pmd_young(pmd) pte_young(pmd_pte(pmd)) +#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) +#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) +#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) +#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) +#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) + +#define __HAVE_ARCH_PMD_WRITE +#define pmd_write(pmd) pte_write(pmd_pte(pmd)) + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + /* Do nothing, mk_pmd() does this part. */ + return pmd; +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + pmd_val(pmd) &= ~_PAGE_PRESENT; + return pmd; +} + +static inline pmd_t pmd_mksplitting(pmd_t pmd) +{ + pmd_val(pmd) |= _PAGE_SPLITTING; + return pmd; +} + +#define __HAVE_ARCH_PMD_SAME +static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0); +} + +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); + +extern unsigned long pmd_hugepage_update(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmdp, unsigned long clr); + +static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + unsigned long old; + + if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) + return 0; + old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED); + return ((old & _PAGE_ACCESSED) != 0); +} + +#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +extern int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + +#define __HAVE_ARCH_PMDP_GET_AND_CLEAR +extern pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMDP_CLEAR_FLUSH +extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp); + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + + if ((pmd_val(*pmdp) & _PAGE_RW) == 0) + return; + + pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW); +} + +#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + +#define __HAVE_ARCH_PGTABLE_DEPOSIT +extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); +#define __HAVE_ARCH_PGTABLE_WITHDRAW +extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); + +#define __HAVE_ARCH_PMDP_INVALIDATE +extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp); +#endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 7aeb9555f6ea..d53db937ec75 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -220,6 +220,10 @@ extern int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, unsigned long addr, extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr); +#ifndef CONFIG_TRANSPARENT_HUGEPAGE +#define pmd_large(pmd) 0 +#define has_transparent_hugepage() 0 +#endif #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h index 61a59271665b..2def01ed0cb2 100644 --- a/arch/powerpc/include/asm/tlbflush.h +++ b/arch/powerpc/include/asm/tlbflush.h @@ -165,7 +165,8 @@ static inline void flush_tlb_kernel_range(unsigned long start, /* Private function for use by PCI IO mapping code */ extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, unsigned long end); - +extern void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr); #else #error Unsupported MMU type #endif diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index a854096e1023..e4d3e9fb59be 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -338,6 +338,19 @@ EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(__iounmap); EXPORT_SYMBOL(__iounmap_at); +/* + * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags + * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address. + */ +struct page *pmd_page(pmd_t pmd) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(pmd)) + return pfn_to_page(pmd_pfn(pmd)); +#endif + return virt_to_page(pmd_page_vaddr(pmd)); +} + #ifdef CONFIG_PPC_64K_PAGES static pte_t *get_from_cache(struct mm_struct *mm) { @@ -455,3 +468,367 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) } #endif #endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +/* + * This is called when relaxing access to a hugepage. It's also called in the page + * fault path when we don't hit any of the major fault cases, ie, a minor + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have + * handled those two for us, we additionally deal with missing execute + * permission here on some processors + */ +int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp, pmd_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&vma->vm_mm->page_table_lock); +#endif + changed = !pmd_same(*(pmdp), entry); + if (changed) { + __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); + /* + * Since we are not supporting SW TLB systems, we don't + * have any thing similar to flush_tlb_page_nohash() + */ + } + return changed; +} + +unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr) +{ + + unsigned long old, tmp; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&mm->page_table_lock); +#endif + +#ifdef PTE_ATOMIC_UPDATES + __asm__ __volatile__( + "1: ldarx %0,0,%3\n\ + andi. %1,%0,%6\n\ + bne- 1b \n\ + andc %1,%0,%4 \n\ + stdcx. %1,0,%3 \n\ + bne- 1b" + : "=&r" (old), "=&r" (tmp), "=m" (*pmdp) + : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY) + : "cc" ); +#else + old = pmd_val(*pmdp); + *pmdp = __pmd(old & ~clr); +#endif + if (old & _PAGE_HASHPTE) + hpte_do_hugepage_flush(mm, addr, pmdp); + return old; +} + +pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + if (pmd_trans_huge(*pmdp)) { + pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); + } else { + /* + * khugepaged calls this for normal pmd + */ + pmd = *pmdp; + pmd_clear(pmdp); + /* + * Wait for all pending hash_page to finish. This is needed + * in case of subpage collapse. When we collapse normal pages + * to hugepage, we first clear the pmd, then invalidate all + * the PTE entries. The assumption here is that any low level + * page fault will see a none pmd and take the slow path that + * will wait on mmap_sem. But we could very well be in a + * hash_page with local ptep pointer value. Such a hash page + * can result in adding new HPTE entries for normal subpages. + * That means we could be modifying the page content as we + * copy them to a huge page. So wait for parallel hash_page + * to finish before invalidating HPTE entries. We can do this + * by sending an IPI to all the cpus and executing a dummy + * function there. + */ + kick_all_cpus_sync(); + /* + * Now invalidate the hpte entries in the range + * covered by pmd. This make sure we take a + * fault and will find the pmd as none, which will + * result in a major fault which takes mmap_sem and + * hence wait for collapse to complete. Without this + * the __collapse_huge_page_copy can result in copying + * the old content. + */ + flush_tlb_pmd_range(vma->vm_mm, &pmd, address); + } + return pmd; +} + +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); +} + +/* + * We currently remove entries from the hashtable regardless of whether + * the entry was young or dirty. The generic routines only flush if the + * entry was young or dirty which is not good enough. + * + * We should be more intelligent about this but for the moment we override + * these functions and force a tlb flush unconditionally + */ +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); +} + +/* + * We mark the pmd splitting and invalidate all the hpte + * entries for this hugepage. + */ +void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + unsigned long old, tmp; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&vma->vm_mm->page_table_lock); +#endif + +#ifdef PTE_ATOMIC_UPDATES + + __asm__ __volatile__( + "1: ldarx %0,0,%3\n\ + andi. %1,%0,%6\n\ + bne- 1b \n\ + ori %1,%0,%4 \n\ + stdcx. %1,0,%3 \n\ + bne- 1b" + : "=&r" (old), "=&r" (tmp), "=m" (*pmdp) + : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY) + : "cc" ); +#else + old = pmd_val(*pmdp); + *pmdp = __pmd(old | _PAGE_SPLITTING); +#endif + /* + * If we didn't had the splitting flag set, go and flush the + * HPTE entries. + */ + if (!(old & _PAGE_SPLITTING)) { + /* We need to flush the hpte */ + if (old & _PAGE_HASHPTE) + hpte_do_hugepage_flush(vma->vm_mm, address, pmdp); + } +} + +/* + * We want to put the pgtable in pmd and use pgtable for tracking + * the base page size hptes + */ +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + pgtable_t *pgtable_slot; + assert_spin_locked(&mm->page_table_lock); + /* + * we store the pgtable in the second half of PMD + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + *pgtable_slot = pgtable; + /* + * expose the deposited pgtable to other cpus. + * before we set the hugepage PTE at pmd level + * hash fault code looks at the deposted pgtable + * to store hash index values. + */ + smp_wmb(); +} + +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pgtable_t pgtable; + pgtable_t *pgtable_slot; + + assert_spin_locked(&mm->page_table_lock); + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Once we withdraw, mark the entry NULL. + */ + *pgtable_slot = NULL; + /* + * We store HPTE information in the deposited PTE fragment. + * zero out the content on withdraw. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + return pgtable; +} + +/* + * set a new huge pmd. We should not be called for updating + * an existing pmd entry. That should go via pmd_hugepage_update. + */ +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_none(*pmdp)); + assert_spin_locked(&mm->page_table_lock); + WARN_ON(!pmd_trans_huge(pmd)); +#endif + return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); +} + +void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT); +} + +/* + * A linux hugepage PMD was changed and the corresponding hash table entries + * neesd to be flushed. + */ +void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + int ssize, i; + unsigned long s_addr; + unsigned int psize, valid; + unsigned char *hpte_slot_array; + unsigned long hidx, vpn, vsid, hash, shift, slot; + + /* + * Flush all the hptes mapping this hugepage + */ + s_addr = addr & HPAGE_PMD_MASK; + hpte_slot_array = get_hpte_slot_array(pmdp); + /* + * IF we try to do a HUGE PTE update after a withdraw is done. + * we will find the below NULL. This happens when we do + * split_huge_page_pmd + */ + if (!hpte_slot_array) + return; + + /* get the base page size */ + psize = get_slice_psize(mm, s_addr); + shift = mmu_psize_defs[psize].shift; + + for (i = 0; i < (HPAGE_PMD_SIZE >> shift); i++) { + /* + * 8 bits per each hpte entries + * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit] + */ + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul << shift)); + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_vsid(mm->context.id, addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + ppc_md.hpte_invalidate(slot, vpn, psize, + MMU_PAGE_16M, ssize, 0); + } +} + +static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) +{ + pmd_val(pmd) |= pgprot_val(pgprot); + return pmd; +} + +pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) +{ + pmd_t pmd; + /* + * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always + * set. We use this to check THP page at pmd level. + * leaf pte for huge page, bottom two bits != 00 + */ + pmd_val(pmd) = pfn << PTE_RPN_SHIFT; + pmd_val(pmd) |= _PAGE_THP_HUGE; + pmd = pmd_set_protbits(pmd, pgprot); + return pmd; +} + +pmd_t mk_pmd(struct page *page, pgprot_t pgprot) +{ + return pfn_pmd(page_to_pfn(page), pgprot); +} + +pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + + pmd_val(pmd) &= _HPAGE_CHG_MASK; + pmd = pmd_set_protbits(pmd, newprot); + return pmd; +} + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a HUGE PMD entry in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux HUGE PMD entry. + */ +void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + return; +} + +pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old_pmd; + pgtable_t pgtable; + unsigned long old; + pgtable_t *pgtable_slot; + + old = pmd_hugepage_update(mm, addr, pmdp, ~0UL); + old_pmd = __pmd(old); + /* + * We have pmd == none and we are holding page_table_lock. + * So we can safely go and clear the pgtable hash + * index info. + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Let's zero out old valid and hash index details + * hash fault look at them. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + return old_pmd; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 023ec8a13f38..48bf63ea6525 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -219,3 +219,30 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, arch_leave_lazy_mmu_mode(); local_irq_restore(flags); } + +void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) +{ + pte_t *pte; + pte_t *start_pte; + unsigned long flags; + + addr = _ALIGN_DOWN(addr, PMD_SIZE); + /* Note: Normally, we should only ever use a batch within a + * PTE locked section. This violates the rule, but will work + * since we don't actually modify the PTEs, we just flush the + * hash while leaving the PTEs intact (including their reference + * to being hashed). This is not the most performance oriented + * way to do things but is fine for our needs here. + */ + local_irq_save(flags); + arch_enter_lazy_mmu_mode(); + start_pte = pte_offset_map(pmd, addr); + for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { + unsigned long pteval = pte_val(*pte); + if (pteval & _PAGE_HASHPTE) + hpte_need_flush(mm, addr, pte, pteval, 0); + addr += PAGE_SIZE; + } + arch_leave_lazy_mmu_mode(); + local_irq_restore(flags); +} diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 54f3936001aa..ae0aaea9e098 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -71,6 +71,7 @@ config PPC_BOOK3S_64 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select SYS_SUPPORTS_HUGETLBFS + select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES config PPC_BOOK3E_64 bool "Embedded processors" From 29409997f8d06d693d82127d200eeaf48989fdd2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:16 +0530 Subject: [PATCH 079/156] powerpc: move find_linux_pte_or_hugepte and gup_hugepte to common code We will use this in the later patch for handling THP pages Reviewed-by: David Gibson Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/hugetlb.h | 8 +- arch/powerpc/include/asm/pgtable-ppc64.h | 13 -- arch/powerpc/include/asm/pgtable.h | 2 + arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/hugetlbpage.c | 251 ++++++++++++----------- 5 files changed, 138 insertions(+), 138 deletions(-) diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index f2498c8e595d..d750336b171d 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -191,8 +191,14 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { } -#endif /* CONFIG_HUGETLB_PAGE */ +#define hugepd_shift(x) 0 +static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, + unsigned pdshift) +{ + return 0; +} +#endif /* CONFIG_HUGETLB_PAGE */ /* * FSL Book3E platforms require special gpage handling - the gpages diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 8f9da5e32fea..6c9323f3ab54 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -368,19 +368,6 @@ static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea) return pt; } -#ifdef CONFIG_HUGETLB_PAGE -pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, - unsigned *shift); -#else -static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, - unsigned *shift) -{ - if (shift) - *shift = 0; - return find_linux_pte(pgdir, ea); -} -#endif /* !CONFIG_HUGETLB_PAGE */ - #endif /* __ASSEMBLY__ */ /* diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index d53db937ec75..959d575c37dd 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -224,6 +224,8 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, #define pmd_large(pmd) 0 #define has_transparent_hugepage() 0 #endif +pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift); #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 26f29a772414..ff0379cdeeca 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -27,8 +27,8 @@ obj-$(CONFIG_44x) += 44x_mmu.o obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o -ifeq ($(CONFIG_HUGETLB_PAGE),y) obj-y += hugetlbpage.o +ifeq ($(CONFIG_HUGETLB_PAGE),y) obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 237c8e5f2640..2865077e0159 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -21,6 +21,9 @@ #include #include #include +#include + +#ifdef CONFIG_HUGETLB_PAGE #define PAGE_SHIFT_64K 16 #define PAGE_SHIFT_16M 24 @@ -100,66 +103,6 @@ int pgd_huge(pgd_t pgd) } #endif -/* - * We have 4 cases for pgds and pmds: - * (1) invalid (all zeroes) - * (2) pointer to next table, as normal; bottom 6 bits == 0 - * (3) leaf pte for huge page, bottom two bits != 00 - * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table - */ -pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) -{ - pgd_t *pg; - pud_t *pu; - pmd_t *pm; - pte_t *ret_pte; - hugepd_t *hpdp = NULL; - unsigned pdshift = PGDIR_SHIFT; - - if (shift) - *shift = 0; - - pg = pgdir + pgd_index(ea); - - if (pgd_huge(*pg)) { - ret_pte = (pte_t *) pg; - goto out; - } else if (is_hugepd(pg)) - hpdp = (hugepd_t *)pg; - else if (!pgd_none(*pg)) { - pdshift = PUD_SHIFT; - pu = pud_offset(pg, ea); - - if (pud_huge(*pu)) { - ret_pte = (pte_t *) pu; - goto out; - } else if (is_hugepd(pu)) - hpdp = (hugepd_t *)pu; - else if (!pud_none(*pu)) { - pdshift = PMD_SHIFT; - pm = pmd_offset(pu, ea); - - if (pmd_huge(*pm)) { - ret_pte = (pte_t *) pm; - goto out; - } else if (is_hugepd(pm)) - hpdp = (hugepd_t *)pm; - else if (!pmd_none(*pm)) - return pte_offset_kernel(pm, ea); - } - } - if (!hpdp) - return NULL; - - ret_pte = hugepte_offset(hpdp, ea, pdshift); - pdshift = hugepd_shift(*hpdp); -out: - if (shift) - *shift = pdshift; - return ret_pte; -} -EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); - pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); @@ -753,69 +696,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } -int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long mask; - unsigned long pte_end; - struct page *head, *page, *tail; - pte_t pte; - int refs; - - pte_end = (addr + sz) & ~(sz-1); - if (pte_end < end) - end = pte_end; - - pte = *ptep; - mask = _PAGE_PRESENT | _PAGE_USER; - if (write) - mask |= _PAGE_RW; - - if ((pte_val(pte) & mask) != mask) - return 0; - - /* hugepages are never "special" */ - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - refs = 0; - head = pte_page(pte); - - page = head + ((addr & (sz-1)) >> PAGE_SHIFT); - tail = page; - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - /* Could be optimized better */ - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - /* - * Any tail page need their mapcount reference taken before we - * return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - - return 1; -} - static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) { @@ -1032,3 +912,128 @@ void flush_dcache_icache_hugepage(struct page *page) } } } + +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * We have 4 cases for pgds and pmds: + * (1) invalid (all zeroes) + * (2) pointer to next table, as normal; bottom 6 bits == 0 + * (3) leaf pte for huge page, bottom two bits != 00 + * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table + */ +pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) +{ + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + pte_t *ret_pte; + hugepd_t *hpdp = NULL; + unsigned pdshift = PGDIR_SHIFT; + + if (shift) + *shift = 0; + + pg = pgdir + pgd_index(ea); + + if (pgd_huge(*pg)) { + ret_pte = (pte_t *) pg; + goto out; + } else if (is_hugepd(pg)) + hpdp = (hugepd_t *)pg; + else if (!pgd_none(*pg)) { + pdshift = PUD_SHIFT; + pu = pud_offset(pg, ea); + + if (pud_huge(*pu)) { + ret_pte = (pte_t *) pu; + goto out; + } else if (is_hugepd(pu)) + hpdp = (hugepd_t *)pu; + else if (!pud_none(*pu)) { + pdshift = PMD_SHIFT; + pm = pmd_offset(pu, ea); + + if (pmd_huge(*pm)) { + ret_pte = (pte_t *) pm; + goto out; + } else if (is_hugepd(pm)) + hpdp = (hugepd_t *)pm; + else if (!pmd_none(*pm)) + return pte_offset_kernel(pm, ea); + } + } + if (!hpdp) + return NULL; + + ret_pte = hugepte_offset(hpdp, ea, pdshift); + pdshift = hugepd_shift(*hpdp); +out: + if (shift) + *shift = pdshift; + return ret_pte; +} +EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); + +int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + unsigned long pte_end; + struct page *head, *page, *tail; + pte_t pte; + int refs; + + pte_end = (addr + sz) & ~(sz-1); + if (pte_end < end) + end = pte_end; + + pte = *ptep; + mask = _PAGE_PRESENT | _PAGE_USER; + if (write) + mask |= _PAGE_RW; + + if ((pte_val(pte) & mask) != mask) + return 0; + + /* hugepages are never "special" */ + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + + page = head + ((addr & (sz-1)) >> PAGE_SHIFT); + tail = page; + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + /* Could be optimized better */ + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + /* + * Any tail page need their mapcount reference taken before we + * return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } + + return 1; +} From ac52ae4721233150a3c30e9732a1c1f4f68e7db7 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:17 +0530 Subject: [PATCH 080/156] powerpc: Update find_linux_pte_or_hugepte to handle transparent hugepages Reviewed-by: David Gibson Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hugetlbpage.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 2865077e0159..49282045ee96 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -936,30 +936,50 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift pg = pgdir + pgd_index(ea); - if (pgd_huge(*pg)) { + /* + * we should first check for none. That takes care of a + * a parallel hugetlb or THP pagefault moving none entries + * to respective types. + */ + if (pgd_none(*pg)) + return NULL; + else if (pgd_huge(*pg)) { ret_pte = (pte_t *) pg; goto out; } else if (is_hugepd(pg)) hpdp = (hugepd_t *)pg; - else if (!pgd_none(*pg)) { + else { pdshift = PUD_SHIFT; pu = pud_offset(pg, ea); - if (pud_huge(*pu)) { + if (pud_none(*pu)) + return NULL; + else if (pud_huge(*pu)) { ret_pte = (pte_t *) pu; goto out; } else if (is_hugepd(pu)) hpdp = (hugepd_t *)pu; - else if (!pud_none(*pu)) { + else { pdshift = PMD_SHIFT; pm = pmd_offset(pu, ea); + /* + * A hugepage collapse is captured by pmd_none, because + * it mark the pmd none and do a hpte invalidate. + * + * A hugepage split is captured by pmd_trans_splitting + * because we mark the pmd trans splitting and do a + * hpte invalidate + * + */ + if (pmd_none(*pm) || pmd_trans_splitting(*pm)) + return NULL; - if (pmd_huge(*pm)) { + if (pmd_huge(*pm) || pmd_large(*pm)) { ret_pte = (pte_t *) pm; goto out; } else if (is_hugepd(pm)) hpdp = (hugepd_t *)pm; - else if (!pmd_none(*pm)) + else return pte_offset_kernel(pm, ea); } } From 12bc9f6fc1d6582b4529ac522d2231bd2584a5f1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:18 +0530 Subject: [PATCH 081/156] powerpc: Replace find_linux_pte with find_linux_pte_or_hugepte Replace find_linux_pte with find_linux_pte_or_hugepte and explicitly document why we don't need to handle transparent hugepages at callsites. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pgtable-ppc64.h | 24 ------------------------ arch/powerpc/kernel/eeh.c | 7 ++++++- arch/powerpc/kernel/io-workarounds.c | 11 +++++++++-- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- arch/powerpc/mm/hash_utils_64.c | 8 +++++++- arch/powerpc/mm/hugetlbpage.c | 8 ++++++-- arch/powerpc/mm/tlb_hash64.c | 9 +++++++-- 7 files changed, 36 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 6c9323f3ab54..e71bd25d62d7 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -344,30 +344,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); void pgtable_cache_init(void); - -/* - * find_linux_pte returns the address of a linux pte for a given - * effective address and directory. If not found, it returns zero. - */ -static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea) -{ - pgd_t *pg; - pud_t *pu; - pmd_t *pm; - pte_t *pt = NULL; - - pg = pgdir + pgd_index(ea); - if (!pgd_none(*pg)) { - pu = pud_offset(pg, ea); - if (!pud_none(*pu)) { - pm = pmd_offset(pu, ea); - if (pmd_present(*pm)) - pt = pte_offset_kernel(pm, ea); - } - } - return pt; -} - #endif /* __ASSEMBLY__ */ /* diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 7c567be3dd03..af2b9ae07df5 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -260,10 +260,15 @@ static inline unsigned long eeh_token_to_phys(unsigned long token) { pte_t *ptep; unsigned long pa; + int hugepage_shift; - ptep = find_linux_pte(init_mm.pgd, token); + /* + * We won't find hugepages here, iomem + */ + ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift); if (!ptep) return token; + WARN_ON(hugepage_shift); pa = pte_pfn(*ptep) << PAGE_SHIFT; return pa | (token & (PAGE_SIZE-1)); diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index 50e90b7e7139..fa0b54b2a362 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -55,6 +55,7 @@ static struct iowa_bus *iowa_pci_find(unsigned long vaddr, unsigned long paddr) struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) { + unsigned hugepage_shift; struct iowa_bus *bus; int token; @@ -70,11 +71,17 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END) return NULL; - ptep = find_linux_pte(init_mm.pgd, vaddr); + ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr, + &hugepage_shift); if (ptep == NULL) paddr = 0; - else + else { + /* + * we don't have hugepages backing iomem + */ + WARN_ON(hugepage_shift); paddr = pte_pfn(*ptep) << PAGE_SHIFT; + } bus = iowa_pci_find(vaddr, paddr); if (bus == NULL) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 6dcbb49105a4..dcf892d25a56 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -27,7 +27,7 @@ static void *real_vmalloc_addr(void *x) unsigned long addr = (unsigned long) x; pte_t *p; - p = find_linux_pte(swapper_pg_dir, addr); + p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL); if (!p || !pte_present(*p)) return NULL; /* assume we don't have huge pages in vmalloc space... */ diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 2f470809876f..e8434ca6efd4 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1145,6 +1145,7 @@ EXPORT_SYMBOL_GPL(hash_page); void hash_preload(struct mm_struct *mm, unsigned long ea, unsigned long access, unsigned long trap) { + int hugepage_shift; unsigned long vsid; pgd_t *pgdir; pte_t *ptep; @@ -1166,10 +1167,15 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, pgdir = mm->pgd; if (pgdir == NULL) return; - ptep = find_linux_pte(pgdir, ea); + /* + * THP pages use update_mmu_cache_pmd. We don't do + * hash preload there. Hence can ignore THP here + */ + ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift); if (!ptep) return; + WARN_ON(hugepage_shift); #ifdef CONFIG_PPC_64K_PAGES /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on * a 64K kernel), then we don't preload, hash_page() will take diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 49282045ee96..8add58061003 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -105,6 +105,7 @@ int pgd_huge(pgd_t pgd) pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { + /* Only called for hugetlbfs pages, hence can ignore THP */ return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); } @@ -673,11 +674,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) struct page *page; unsigned shift; unsigned long mask; - + /* + * Transparent hugepages are handled by generic code. We can skip them + * here. + */ ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); /* Verify it is a huge page else bail. */ - if (!ptep || !shift) + if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) return ERR_PTR(-EINVAL); mask = (1UL << shift) - 1; diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 48bf63ea6525..313c85c5aa90 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -189,6 +189,7 @@ void tlb_flush(struct mmu_gather *tlb) void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, unsigned long end) { + int hugepage_shift; unsigned long flags; start = _ALIGN_DOWN(start, PAGE_SIZE); @@ -206,7 +207,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, local_irq_save(flags); arch_enter_lazy_mmu_mode(); for (; start < end; start += PAGE_SIZE) { - pte_t *ptep = find_linux_pte(mm->pgd, start); + pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, + &hugepage_shift); unsigned long pte; if (ptep == NULL) @@ -214,7 +216,10 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, pte = pte_val(*ptep); if (!(pte & _PAGE_HASHPTE)) continue; - hpte_need_flush(mm, start, ptep, pte, 0); + if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte))) + hpte_do_hugepage_flush(mm, start, (pmd_t *)pte); + else + hpte_need_flush(mm, start, ptep, pte, 0); } arch_leave_lazy_mmu_mode(); local_irq_restore(flags); From db7cb5b92409b36e4338355fbc3561b3f6801c7b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:19 +0530 Subject: [PATCH 082/156] powerpc/kvm: Handle transparent hugepage in KVM We can find pte that are splitting while walking page tables. Return None pte in that case. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/kvm_book3s_64.h | 56 ++++++++++++++---------- arch/powerpc/kvm/book3s_64_mmu_hv.c | 8 ++-- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 12 +++-- 3 files changed, 43 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 9c1ff330c805..a1ecb14e4442 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -159,36 +159,46 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) } /* - * Lock and read a linux PTE. If it's present and writable, atomically - * set dirty and referenced bits and return the PTE, otherwise return 0. + * If it's present and writable, atomically set dirty and referenced bits and + * return the PTE, otherwise return 0. If we find a transparent hugepage + * and if it is marked splitting we return 0; */ -static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing) +static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing, + unsigned int hugepage) { - pte_t pte, tmp; + pte_t old_pte, new_pte = __pte(0); - /* wait until _PAGE_BUSY is clear then set it atomically */ - __asm__ __volatile__ ( - "1: ldarx %0,0,%3\n" - " andi. %1,%0,%4\n" - " bne- 1b\n" - " ori %1,%0,%4\n" - " stdcx. %1,0,%3\n" - " bne- 1b" - : "=&r" (pte), "=&r" (tmp), "=m" (*p) - : "r" (p), "i" (_PAGE_BUSY) - : "cc"); + while (1) { + old_pte = pte_val(*ptep); + /* + * wait until _PAGE_BUSY is clear then set it atomically + */ + if (unlikely(old_pte & _PAGE_BUSY)) { + cpu_relax(); + continue; + } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* If hugepage and is trans splitting return None */ + if (unlikely(hugepage && + pmd_trans_splitting(pte_pmd(old_pte)))) + return __pte(0); +#endif + /* If pte is not present return None */ + if (unlikely(!(old_pte & _PAGE_PRESENT))) + return __pte(0); - if (pte_present(pte)) { - pte = pte_mkyoung(pte); - if (writing && pte_write(pte)) - pte = pte_mkdirty(pte); + new_pte = pte_mkyoung(old_pte); + if (writing && pte_write(old_pte)) + new_pte = pte_mkdirty(new_pte); + + if (old_pte == __cmpxchg_u64((unsigned long *)ptep, old_pte, + new_pte)) + break; } - - *p = pte; /* clears _PAGE_BUSY */ - - return pte; + return new_pte; } + /* Return HPTE cache control bits corresponding to Linux pte bits */ static inline unsigned long hpte_cache_bits(unsigned long pte_val) { diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 5880dfb31074..710d31317d81 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -675,6 +675,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, } /* if the guest wants write access, see if that is OK */ if (!writing && hpte_is_writable(r)) { + unsigned int hugepage_shift; pte_t *ptep, pte; /* @@ -683,9 +684,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, */ rcu_read_lock_sched(); ptep = find_linux_pte_or_hugepte(current->mm->pgd, - hva, NULL); - if (ptep && pte_present(*ptep)) { - pte = kvmppc_read_update_linux_pte(ptep, 1); + hva, &hugepage_shift); + if (ptep) { + pte = kvmppc_read_update_linux_pte(ptep, 1, + hugepage_shift); if (pte_write(pte)) write_ok = 1; } diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index dcf892d25a56..fc25689a9f35 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -139,20 +139,18 @@ static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, { pte_t *ptep; unsigned long ps = *pte_sizep; - unsigned int shift; + unsigned int hugepage_shift; - ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); + ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift); if (!ptep) return __pte(0); - if (shift) - *pte_sizep = 1ul << shift; + if (hugepage_shift) + *pte_sizep = 1ul << hugepage_shift; else *pte_sizep = PAGE_SIZE; if (ps > *pte_sizep) return __pte(0); - if (!pte_present(*ptep)) - return __pte(0); - return kvmppc_read_update_linux_pte(ptep, writing); + return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift); } static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v) From c367714ce807faff3f0c48064cda158d5117b419 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:20 +0530 Subject: [PATCH 083/156] powerpc: Update gup_pmd_range to handle transparent hugepages Reviewed-by: David Gibson Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/gup.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index 4b921affa495..223a255545fc 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -66,9 +66,15 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = *pmdp; next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + /* + * If we find a splitting transparent hugepage we + * return zero. That will result in taking the slow + * path which will call wait_split_huge_page() + * if the pmd is still in splitting state + */ + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; - if (pmd_huge(pmd)) { + if (pmd_huge(pmd) || pmd_large(pmd)) { if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next, write, pages, nr)) return 0; From 6d492ecc6489113968ec269be1cf88942d4a5d29 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:21 +0530 Subject: [PATCH 084/156] powerpc/THP: Add code to handle HPTE faults for hugepages The deposted PTE page in the second half of the PMD table is used to track the state on hash PTEs. After updating the HPTE, we mark the coresponding slot in the deposted PTE page valid. Reviewed-by: David Gibson Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mmu-hash64.h | 13 ++ arch/powerpc/mm/Makefile | 1 + arch/powerpc/mm/hash_utils_64.c | 21 +++- arch/powerpc/mm/hugepage-hash64.c | 172 ++++++++++++++++++++++++++ 4 files changed, 203 insertions(+), 4 deletions(-) create mode 100644 arch/powerpc/mm/hugepage-hash64.c diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 2accc9611248..3d6fbb00d20b 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -340,6 +340,19 @@ extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap) int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, int local, int ssize, unsigned int shift, unsigned int mmu_psize); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern int __hash_page_thp(unsigned long ea, unsigned long access, + unsigned long vsid, pmd_t *pmdp, unsigned long trap, + int local, int ssize, unsigned int psize); +#else +static inline int __hash_page_thp(unsigned long ea, unsigned long access, + unsigned long vsid, pmd_t *pmdp, + unsigned long trap, int local, + int ssize, unsigned int psize) +{ + BUG(); +} +#endif extern void hash_failure_debug(unsigned long ea, unsigned long access, unsigned long vsid, unsigned long trap, int ssize, int psize, int lpsize, diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index ff0379cdeeca..51230ee6a407 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -32,6 +32,7 @@ ifeq ($(CONFIG_HUGETLB_PAGE),y) obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index e8434ca6efd4..7a81e866e7b1 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1050,13 +1050,26 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) goto bail; } -#ifdef CONFIG_HUGETLB_PAGE if (hugeshift) { - rc = __hash_page_huge(ea, access, vsid, ptep, trap, local, - ssize, hugeshift, psize); + if (pmd_trans_huge(*(pmd_t *)ptep)) + rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, + trap, local, ssize, psize); +#ifdef CONFIG_HUGETLB_PAGE + else + rc = __hash_page_huge(ea, access, vsid, ptep, trap, + local, ssize, hugeshift, psize); +#else + else { + /* + * if we have hugeshift, and is not transhuge with + * hugetlb disabled, something is really wrong. + */ + rc = 1; + WARN_ON(1); + } +#endif goto bail; } -#endif /* CONFIG_HUGETLB_PAGE */ #ifndef CONFIG_PPC_64K_PAGES DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c new file mode 100644 index 000000000000..3c22fa307b9b --- /dev/null +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -0,0 +1,172 @@ +/* + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +/* + * PPC64 THP Support for hash based MMUs + */ +#include +#include + +int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, + pmd_t *pmdp, unsigned long trap, int local, int ssize, + unsigned int psize) +{ + unsigned int index, valid; + unsigned char *hpte_slot_array; + unsigned long rflags, pa, hidx; + unsigned long old_pmd, new_pmd; + int ret, lpsize = MMU_PAGE_16M; + unsigned long vpn, hash, shift, slot; + + /* + * atomically mark the linux large page PMD busy and dirty + */ + do { + old_pmd = pmd_val(*pmdp); + /* If PMD busy, retry the access */ + if (unlikely(old_pmd & _PAGE_BUSY)) + return 0; + /* If PMD permissions don't match, take page fault */ + if (unlikely(access & ~old_pmd)) + return 1; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access + */ + new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_RW) + new_pmd |= _PAGE_DIRTY; + } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp, + old_pmd, new_pmd)); + /* + * PP bits. _PAGE_USER is already PP bit 0x2, so we only + * need to add in 0x1 if it's a read-only user page + */ + rflags = new_pmd & _PAGE_USER; + if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) && + (new_pmd & _PAGE_DIRTY))) + rflags |= 0x1; + /* + * _PAGE_EXEC -> HW_NO_EXEC since it's inverted + */ + rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N); + +#if 0 + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { + + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + } +#endif + /* + * Find the slot index details for this ea, using base page size. + */ + shift = mmu_psize_defs[psize].shift; + index = (ea & ~HPAGE_PMD_MASK) >> shift; + BUG_ON(index >= 4096); + + vpn = hpt_vpn(ea, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + hpte_slot_array = get_hpte_slot_array(pmdp); + + valid = hpte_valid(hpte_slot_array, index); + if (valid) { + /* update the hpte bits */ + hidx = hpte_hash_index(hpte_slot_array, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + + ret = ppc_md.hpte_updatepp(slot, rflags, vpn, + psize, lpsize, ssize, local); + /* + * We failed to update, try to insert a new entry. + */ + if (ret == -1) { + /* + * large pte is marked busy, so we can be sure + * nobody is looking at hpte_slot_array. hence we can + * safely update this here. + */ + valid = 0; + new_pmd &= ~_PAGE_HPTEFLAGS; + hpte_slot_array[index] = 0; + } else + /* clear the busy bits and set the hash pte bits */ + new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; + } + + if (!valid) { + unsigned long hpte_group; + + /* insert new entry */ + pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; +repeat: + hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; + + /* clear the busy bits and set the hash pte bits */ + new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; + + /* Add in WIMG bits */ + rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | + _PAGE_COHERENT | _PAGE_GUARDED)); + + /* Insert into the hash table, primary slot */ + slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0, + psize, lpsize, ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + hpte_group = ((~hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + slot = ppc_md.hpte_insert(hpte_group, vpn, pa, + rflags, HPTE_V_SECONDARY, + psize, lpsize, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + + ppc_md.hpte_remove(hpte_group); + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pmd and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *pmdp = __pmd(old_pmd); + hash_failure_debug(ea, access, vsid, trap, ssize, + psize, lpsize, old_pmd); + return -1; + } + /* + * large pte is marked busy, so we can be sure + * nobody is looking at hpte_slot_array. hence we can + * safely update this here. + */ + mark_hpte_slot_valid(hpte_slot_array, index, slot); + } + /* + * No need to use ldarx/stdcx here + */ + *pmdp = __pmd(new_pmd & ~_PAGE_BUSY); + return 0; +} From 0ac52dd7666d5c0d0147d73a8e4b1d1ffd81cdf3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:22 +0530 Subject: [PATCH 085/156] powerpc: Make linux pagetable walk safe with THP enabled We need to have irqs disabled to handle all the possible parallel update for linux page table without holding locks. Events that we are intersted in while walking page tables are 1) Page fault 2) umap 3) THP split 4) THP collapse A) local_irq_disabled: ------------------------ 1) page fault: A none to valid transition via page fault is not an issue because we would either see a none or valid. If it is none, we would error out the page table walk. We may need to use on stack values when checking for type of page table elements, because if we do if (!is_hugepd()) { if (!pmd_none() { if (pmd_bad() { We could take that bad condition because the pmd got converted to a hugepd after the !is_hugepd check via a hugetlb fault. The right way would be to check for pmd_none higher up or use on stack value. 2) A valid to none conversion via unmap: We can safely walk the upper level table, because we don't remove the the page table entries until rcu grace period. So even if we followed a wrong pointer we still have the pointer valid till the grace period. A PTE pointer returned need to be atomically checked for _PAGE_PRESENT and _PAGE_BUSY. A valid pointer returned could becoming none later. To prevent pte_clear we take _PAGE_BUSY. 3) THP split: A valid transparent hugepage is converted to nomal page. Before we split we do pmd_splitting_flush, which sets the hugepage PTE to _PAGE_SPLITTING So when walking page table we need to check for pmd_trans_splitting and handle that. The pte returned should also need to be checked for _PAGE_SPLITTING before setting _PAGE_BUSY similar to _PAGE_PRESENT. We save the value of PTE on stack and check for the flag in the local pte value. If we don't have the value set we can safely operate on the local pte value and we atomicaly set _PAGE_BUSY. 4) THP collapse: A normal page gets converted to hugepage. In the collapse path, we mark the pmd none early (pmdp_clear_flush). With irq disabled, if we are aleady walking page table we would see the pmd_none and won't continue. If we see a valid PMD, we should still check for _PAGE_PRESENT before setting _PAGE_BUSY, to make sure we didn't collapse the PTE to a Huge PTE. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hash_utils_64.c | 27 ++++++------ arch/powerpc/mm/hugepage-hash64.c | 3 ++ arch/powerpc/mm/hugetlbpage.c | 72 ++++++++++++++++++++----------- arch/powerpc/mm/mem.c | 4 ++ 4 files changed, 68 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 7a81e866e7b1..845231643987 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1180,13 +1180,25 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, pgdir = mm->pgd; if (pgdir == NULL) return; + + /* Get VSID */ + ssize = user_segment_size(ea); + vsid = get_vsid(mm->context.id, ea, ssize); + if (!vsid) + return; + /* + * Hash doesn't like irqs. Walking linux page table with irq disabled + * saves us from holding multiple locks. + */ + local_irq_save(flags); + /* * THP pages use update_mmu_cache_pmd. We don't do * hash preload there. Hence can ignore THP here */ ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift); if (!ptep) - return; + goto out_exit; WARN_ON(hugepage_shift); #ifdef CONFIG_PPC_64K_PAGES @@ -1197,18 +1209,9 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, * page size demotion here */ if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE)) - return; + goto out_exit; #endif /* CONFIG_PPC_64K_PAGES */ - /* Get VSID */ - ssize = user_segment_size(ea); - vsid = get_vsid(mm->context.id, ea, ssize); - if (!vsid) - return; - - /* Hash doesn't like irqs */ - local_irq_save(flags); - /* Is that local to this CPU ? */ if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) local = 1; @@ -1230,7 +1233,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, mm->context.user_psize, mm->context.user_psize, pte_val(*ptep)); - +out_exit: local_irq_restore(flags); } diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index 3c22fa307b9b..34de9e0cdc34 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -37,6 +37,9 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, /* If PMD busy, retry the access */ if (unlikely(old_pmd & _PAGE_BUSY)) return 0; + /* If PMD is trans splitting retry the access */ + if (unlikely(old_pmd & _PAGE_SPLITTING)) + return 0; /* If PMD permissions don't match, take page fault */ if (unlikely(access & ~old_pmd)) return 1; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 8add58061003..e9e6882231da 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -925,12 +925,16 @@ void flush_dcache_icache_hugepage(struct page *page) * (2) pointer to next table, as normal; bottom 6 bits == 0 * (3) leaf pte for huge page, bottom two bits != 00 * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table + * + * So long as we atomically load page table pointers we are safe against teardown, + * we can follow the address down to the the page and take a ref on it. */ + pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) { - pgd_t *pg; - pud_t *pu; - pmd_t *pm; + pgd_t pgd, *pgdp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; pte_t *ret_pte; hugepd_t *hpdp = NULL; unsigned pdshift = PGDIR_SHIFT; @@ -938,34 +942,42 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift if (shift) *shift = 0; - pg = pgdir + pgd_index(ea); - + pgdp = pgdir + pgd_index(ea); + pgd = ACCESS_ONCE(*pgdp); /* - * we should first check for none. That takes care of a - * a parallel hugetlb or THP pagefault moving none entries - * to respective types. + * Always operate on the local stack value. This make sure the + * value don't get updated by a parallel THP split/collapse, + * page fault or a page unmap. The return pte_t * is still not + * stable. So should be checked there for above conditions. */ - if (pgd_none(*pg)) + if (pgd_none(pgd)) return NULL; - else if (pgd_huge(*pg)) { - ret_pte = (pte_t *) pg; + else if (pgd_huge(pgd)) { + ret_pte = (pte_t *) pgdp; goto out; - } else if (is_hugepd(pg)) - hpdp = (hugepd_t *)pg; + } else if (is_hugepd(&pgd)) + hpdp = (hugepd_t *)&pgd; else { + /* + * Even if we end up with an unmap, the pgtable will not + * be freed, because we do an rcu free and here we are + * irq disabled + */ pdshift = PUD_SHIFT; - pu = pud_offset(pg, ea); + pudp = pud_offset(&pgd, ea); + pud = ACCESS_ONCE(*pudp); - if (pud_none(*pu)) + if (pud_none(pud)) return NULL; - else if (pud_huge(*pu)) { - ret_pte = (pte_t *) pu; + else if (pud_huge(pud)) { + ret_pte = (pte_t *) pudp; goto out; - } else if (is_hugepd(pu)) - hpdp = (hugepd_t *)pu; + } else if (is_hugepd(&pud)) + hpdp = (hugepd_t *)&pud; else { pdshift = PMD_SHIFT; - pm = pmd_offset(pu, ea); + pmdp = pmd_offset(&pud, ea); + pmd = ACCESS_ONCE(*pmdp); /* * A hugepage collapse is captured by pmd_none, because * it mark the pmd none and do a hpte invalidate. @@ -975,16 +987,16 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift * hpte invalidate * */ - if (pmd_none(*pm) || pmd_trans_splitting(*pm)) + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return NULL; - if (pmd_huge(*pm) || pmd_large(*pm)) { - ret_pte = (pte_t *) pm; + if (pmd_huge(pmd) || pmd_large(pmd)) { + ret_pte = (pte_t *) pmdp; goto out; - } else if (is_hugepd(pm)) - hpdp = (hugepd_t *)pm; + } else if (is_hugepd(&pmd)) + hpdp = (hugepd_t *)&pmd; else - return pte_offset_kernel(pm, ea); + return pte_offset_kernel(&pmd, ea); } } if (!hpdp) @@ -1020,6 +1032,14 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, if ((pte_val(pte) & mask) != mask) return 0; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* + * check for splitting here + */ + if (pmd_trans_splitting(pte_pmd(pte))) + return 0; +#endif + /* hugepages are never "special" */ VM_BUG_ON(!pfn_valid(pte_pfn(pte))); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 0988a26e0413..ccd49f9503a9 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -508,6 +508,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { #ifdef CONFIG_PPC_STD_MMU + /* + * We don't need to worry about _PAGE_PRESENT here because we are + * called with either mm->page_table_lock held or ptl lock held + */ unsigned long access = 0, trap; /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ From 7888b4ddb44dccd68bc20d0dc4425707dff88c72 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:23 +0530 Subject: [PATCH 086/156] powerpc: Prevent gcc to re-read the pagetables GCC is very likely to read the pagetables just once and cache them in the local stack or in a register, but it is can also decide to re-read the pagetables. The problem is that the pagetable in those places can change from under gcc. With THP/hugetlbfs the pmd (and pgd for hugetlbfs giga pages) can change under gup_fast. The pages won't be freed untill we finish gup fast because we have irq disabled and we free these pages via rcu callback. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/gup.c | 8 ++++---- arch/powerpc/mm/hugetlbpage.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index 223a255545fc..49822d90ea96 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -34,7 +34,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, ptep = pte_offset_kernel(&pmd, addr); do { - pte_t pte = *ptep; + pte_t pte = ACCESS_ONCE(*ptep); struct page *page; if ((pte_val(pte) & mask) != result) @@ -63,7 +63,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmdp = pmd_offset(&pud, addr); do { - pmd_t pmd = *pmdp; + pmd_t pmd = ACCESS_ONCE(*pmdp); next = pmd_addr_end(addr, end); /* @@ -97,7 +97,7 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, pudp = pud_offset(&pgd, addr); do { - pud_t pud = *pudp; + pud_t pud = ACCESS_ONCE(*pudp); next = pud_addr_end(addr, end); if (pud_none(pud)) @@ -160,7 +160,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, pgdp = pgd_offset(mm, addr); do { - pgd_t pgd = *pgdp; + pgd_t pgd = ACCESS_ONCE(*pgdp); pr_devel(" %016lx: normal pgd %p\n", addr, (void *)pgd_val(pgd)); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index e9e6882231da..f2f01fd3ec68 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -1024,7 +1024,7 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, if (pte_end < end) end = pte_end; - pte = *ptep; + pte = ACCESS_ONCE(*ptep); mask = _PAGE_PRESENT | _PAGE_USER; if (write) mask |= _PAGE_RW; From a00e7bea0dde6a44b9bbe84f30b731d9ec73858b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:24 +0530 Subject: [PATCH 087/156] powerpc: disable assert_pte_locked for collapse_huge_page With THP we set pmd to none, before we do pte_clear. Hence we can't walk page table to get the pte lock ptr and verify whether it is locked. THP do take pte lock before calling pte_clear. So we don't change the locking rules here. It is that we can't use page table walking to check whether pte locks are held with THP. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/pgtable.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 214130a4edc6..edda589795c3 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -235,6 +235,14 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) pud = pud_offset(pgd, addr); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, addr); + /* + * khugepaged to collapse normal pages to hugepage, first set + * pmd to none to force page fault/gup to take mmap_sem. After + * pmd is set to none, we do a pte_clear which does this assertion + * so if we find pmd none, return. + */ + if (pmd_none(*pmd)) + return; BUG_ON(!pmd_present(*pmd)); assert_spin_locked(pte_lockptr(mm, pmd)); } From d8e355a20f9dd45deea4c33db649dda59bdbd293 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:25 +0530 Subject: [PATCH 088/156] powerpc: split hugepage when using subpage protection We find all the overlapping vma and mark them such that we don't allocate hugepage in that range. Also we split existing huge page so that the normal page hash can be invalidated and new page faulted in with new protection bits. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/subpage-prot.c | 48 ++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index 7c415ddde948..aa74acb0fdfc 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -130,6 +130,53 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) up_write(&mm->mmap_sem); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->private; + split_huge_page_pmd(vma, addr, pmd); + return 0; +} + +static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + struct mm_walk subpage_proto_walk = { + .mm = mm, + .pmd_entry = subpage_walk_pmd_entry, + }; + + /* + * We don't try too hard, we just mark all the vma in that range + * VM_NOHUGEPAGE and split them. + */ + vma = find_vma(mm, addr); + /* + * If the range is in unmapped range, just return + */ + if (vma && ((addr + len) <= vma->vm_start)) + return; + + while (vma) { + if (vma->vm_start >= (addr + len)) + break; + vma->vm_flags |= VM_NOHUGEPAGE; + subpage_proto_walk.private = vma; + walk_page_range(vma->vm_start, vma->vm_end, + &subpage_proto_walk); + vma = vma->vm_next; + } +} +#else +static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + return; +} +#endif + /* * Copy in a subpage protection map for an address range. * The map has 2 bits per 4k subpage, so 32 bits per 64k page. @@ -168,6 +215,7 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) return -EFAULT; down_write(&mm->mmap_sem); + subpage_mark_vma_nohuge(mm, addr, len); for (limit = addr + len; addr < limit; addr = next) { next = pmd_addr_end(addr, limit); err = -ENOMEM; From 437d496457a30ce9ccccb94b2373c201b2558392 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:26 +0530 Subject: [PATCH 089/156] powerpc/THP: Enable THP on PPC64 We enable only if the we support 16MB page size. Reviewed-by: David Gibson Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pgtable-ppc64.h | 3 +-- arch/powerpc/mm/pgtable_64.c | 29 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index e71bd25d62d7..46db09414a10 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -444,8 +444,7 @@ static inline int pmd_trans_splitting(pmd_t pmd) return 0; } -/* We will enable it in the last patch */ -#define has_transparent_hugepage() 0 +extern int has_transparent_hugepage(void); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline pte_t pmd_pte(pmd_t pmd) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index e4d3e9fb59be..074a4a225054 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -831,4 +831,33 @@ pmd_t pmdp_get_and_clear(struct mm_struct *mm, memset(pgtable, 0, PTE_FRAG_SIZE); return old_pmd; } + +int has_transparent_hugepage(void) +{ + if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + return 0; + /* + * We support THP only if PMD_SIZE is 16MB. + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) + return 0; + /* + * We need to make sure that we support 16MB hugepage in a segement + * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE + * of 64K. + */ + /* + * If we have 64K HPTE, we will be using that by default + */ + if (mmu_psize_defs[MMU_PAGE_64K].shift && + (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) + return 0; + /* + * Ok we only have 4K HPTE + */ + if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) + return 0; + + return 1; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ From 1a5272866f87d7fbf04dc8060f8da3e8456490ab Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:27 +0530 Subject: [PATCH 090/156] powerpc: Optimize hugepage invalidate Hugepage invalidate involves invalidating multiple hpte entries. Optimize the operation using H_BULK_REMOVE on lpar platforms. On native, reduce the number of tlb flush. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/machdep.h | 3 + arch/powerpc/mm/hash_native_64.c | 73 +++++++++++++++ arch/powerpc/mm/pgtable_64.c | 12 ++- arch/powerpc/platforms/pseries/lpar.c | 122 ++++++++++++++++++++++++-- 4 files changed, 201 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 801e3c61395f..8b480901165a 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -57,6 +57,9 @@ struct machdep_calls { void (*hpte_removebolted)(unsigned long ea, int psize, int ssize); void (*flush_hash_range)(unsigned long number, int local); + void (*hugepage_invalidate)(struct mm_struct *mm, + unsigned char *hpte_slot_array, + unsigned long addr, int psize); /* special for kexec, to be called in real mode, linear mapping is * destroyed as well */ diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 6d152bc993e5..3f0c30ae4791 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -407,6 +407,78 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, local_irq_restore(flags); } +static void native_hugepage_invalidate(struct mm_struct *mm, + unsigned char *hpte_slot_array, + unsigned long addr, int psize) +{ + int ssize = 0, i; + int lock_tlbie; + struct hash_pte *hptep; + int actual_psize = MMU_PAGE_16M; + unsigned int max_hpte_count, valid; + unsigned long flags, s_addr = addr; + unsigned long hpte_v, want_v, shift; + unsigned long hidx, vpn = 0, vsid, hash, slot; + + shift = mmu_psize_defs[psize].shift; + max_hpte_count = 1U << (PMD_SHIFT - shift); + + local_irq_save(flags); + for (i = 0; i < max_hpte_count; i++) { + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul << shift)); + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_vsid(mm->context.id, addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + + hptep = htab_address + slot; + want_v = hpte_encode_avpn(vpn, psize, ssize); + native_lock_hpte(hptep); + hpte_v = hptep->v; + + /* Even if we miss, we need to invalidate the TLB */ + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) + native_unlock_hpte(hptep); + else + /* Invalidate the hpte. NOTE: this also unlocks it */ + hptep->v = 0; + } + /* + * Since this is a hugepage, we just need a single tlbie. + * use the last vpn. + */ + lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + + asm volatile("ptesync":::"memory"); + __tlbie(vpn, psize, actual_psize, ssize); + asm volatile("eieio; tlbsync; ptesync":::"memory"); + + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + + local_irq_restore(flags); +} + static inline int __hpte_actual_psize(unsigned int lp, int psize) { int i, shift; @@ -640,4 +712,5 @@ void __init hpte_init_native(void) ppc_md.hpte_remove = native_hpte_remove; ppc_md.hpte_clear_all = native_hpte_clear; ppc_md.flush_hash_range = native_flush_hash_range; + ppc_md.hugepage_invalidate = native_hugepage_invalidate; } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 074a4a225054..536eec72c0f7 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -708,6 +708,7 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, { int ssize, i; unsigned long s_addr; + int max_hpte_count; unsigned int psize, valid; unsigned char *hpte_slot_array; unsigned long hidx, vpn, vsid, hash, shift, slot; @@ -727,9 +728,16 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, /* get the base page size */ psize = get_slice_psize(mm, s_addr); - shift = mmu_psize_defs[psize].shift; - for (i = 0; i < (HPAGE_PMD_SIZE >> shift); i++) { + if (ppc_md.hugepage_invalidate) + return ppc_md.hugepage_invalidate(mm, hpte_slot_array, + s_addr, psize); + /* + * No bluk hpte removal support, invalidate each entry + */ + shift = mmu_psize_defs[psize].shift; + max_hpte_count = HPAGE_PMD_SIZE >> shift; + for (i = 0; i < max_hpte_count; i++) { /* * 8 bits per each hpte entries * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit] diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index ca45c8f3c0ba..fd0f2f2a9b90 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -45,6 +45,13 @@ #include "plpar_wrappers.h" #include "pseries.h" +/* Flag bits for H_BULK_REMOVE */ +#define HBR_REQUEST 0x4000000000000000UL +#define HBR_RESPONSE 0x8000000000000000UL +#define HBR_END 0xc000000000000000UL +#define HBR_AVPN 0x0200000000000000UL +#define HBR_ANDCOND 0x0100000000000000UL + /* in hvCall.S */ EXPORT_SYMBOL(plpar_hcall); @@ -347,6 +354,113 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, BUG_ON(lpar_rc != H_SUCCESS); } +/* + * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need + * to make sure that we avoid bouncing the hypervisor tlbie lock. + */ +#define PPC64_HUGE_HPTE_BATCH 12 + +static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot, + unsigned long *vpn, int count, + int psize, int ssize) +{ + unsigned long param[8]; + int i = 0, pix = 0, rc; + unsigned long flags = 0; + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); + + for (i = 0; i < count; i++) { + + if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { + pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0, + ssize, 0); + } else { + param[pix] = HBR_REQUEST | HBR_AVPN | slot[i]; + param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize); + pix += 2; + if (pix == 8) { + rc = plpar_hcall9(H_BULK_REMOVE, param, + param[0], param[1], param[2], + param[3], param[4], param[5], + param[6], param[7]); + BUG_ON(rc != H_SUCCESS); + pix = 0; + } + } + } + if (pix) { + param[pix] = HBR_END; + rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1], + param[2], param[3], param[4], param[5], + param[6], param[7]); + BUG_ON(rc != H_SUCCESS); + } + + if (lock_tlbie) + spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags); +} + +static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm, + unsigned char *hpte_slot_array, + unsigned long addr, int psize) +{ + int ssize = 0, i, index = 0; + unsigned long s_addr = addr; + unsigned int max_hpte_count, valid; + unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH]; + unsigned long slot_array[PPC64_HUGE_HPTE_BATCH]; + unsigned long shift, hidx, vpn = 0, vsid, hash, slot; + + shift = mmu_psize_defs[psize].shift; + max_hpte_count = 1U << (PMD_SHIFT - shift); + + for (i = 0; i < max_hpte_count; i++) { + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul << shift)); + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_vsid(mm->context.id, addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + + slot_array[index] = slot; + vpn_array[index] = vpn; + if (index == PPC64_HUGE_HPTE_BATCH - 1) { + /* + * Now do a bluk invalidate + */ + __pSeries_lpar_hugepage_invalidate(slot_array, + vpn_array, + PPC64_HUGE_HPTE_BATCH, + psize, ssize); + index = 0; + } else + index++; + } + if (index) + __pSeries_lpar_hugepage_invalidate(slot_array, vpn_array, + index, psize, ssize); +} + static void pSeries_lpar_hpte_removebolted(unsigned long ea, int psize, int ssize) { @@ -364,13 +478,6 @@ static void pSeries_lpar_hpte_removebolted(unsigned long ea, pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0); } -/* Flag bits for H_BULK_REMOVE */ -#define HBR_REQUEST 0x4000000000000000UL -#define HBR_RESPONSE 0x8000000000000000UL -#define HBR_END 0xc000000000000000UL -#define HBR_AVPN 0x0200000000000000UL -#define HBR_ANDCOND 0x0100000000000000UL - /* * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie * lock. @@ -459,6 +566,7 @@ void __init hpte_init_lpar(void) ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted; ppc_md.flush_hash_range = pSeries_lpar_flush_hash_range; ppc_md.hpte_clear_all = pSeries_lpar_hptab_clear; + ppc_md.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; } #ifdef CONFIG_PPC_SMLPAR From 0875a88e8569c2ca306f321018238bd1a3d7fd86 Mon Sep 17 00:00:00 2001 From: Matteo Facchinetti Date: Mon, 10 Jun 2013 10:13:52 +0200 Subject: [PATCH 091/156] powerpc/mpc512x: add MPC5125 reset module support for system restart Only part of MPC5125 reset module is like as MPC5121. In detail, RCWH register doesn't contain informations about: - PCI arbiter - NAND flash page size - NAND flash port size For this reason, in device tree, this module has a different name then MPC5121 reset module but use the same "struct mpc512x_reset_module" register definition and the same restart procedure. Signed-off-by: Matteo Facchinetti Signed-off-by: Anatolij Gustschin --- arch/powerpc/platforms/512x/mpc512x.h | 1 + arch/powerpc/platforms/512x/mpc512x_shared.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/512x/mpc512x.h b/arch/powerpc/platforms/512x/mpc512x.h index fdb4303246a0..cc97f022d028 100644 --- a/arch/powerpc/platforms/512x/mpc512x.h +++ b/arch/powerpc/platforms/512x/mpc512x.h @@ -17,6 +17,7 @@ extern void __init mpc512x_init(void); extern void __init mpc512x_setup_arch(void); extern int __init mpc5121_clk_init(void); extern const char *mpc512x_select_psc_compat(void); +extern const char *mpc512x_select_reset_compat(void); extern void mpc512x_restart(char *cmd); #endif /* __MPC512X_H__ */ diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c index a8b5110eb298..a82a41b4fd91 100644 --- a/arch/powerpc/platforms/512x/mpc512x_shared.c +++ b/arch/powerpc/platforms/512x/mpc512x_shared.c @@ -35,8 +35,10 @@ static struct mpc512x_reset_module __iomem *reset_module_base; static void __init mpc512x_restart_init(void) { struct device_node *np; + const char *reset_compat; - np = of_find_compatible_node(NULL, NULL, "fsl,mpc5121-reset"); + reset_compat = mpc512x_select_reset_compat(); + np = of_find_compatible_node(NULL, NULL, reset_compat); if (!np) return; @@ -355,6 +357,17 @@ const char *mpc512x_select_psc_compat(void) return NULL; } +const char *mpc512x_select_reset_compat(void) +{ + if (of_machine_is_compatible("fsl,mpc5121")) + return "fsl,mpc5121-reset"; + + if (of_machine_is_compatible("fsl,mpc5125")) + return "fsl,mpc5125-reset"; + + return NULL; +} + static unsigned int __init get_fifo_size(struct device_node *np, char *prop_name) { From 8c43d2b0ca10cea9eb62d8996fb93f330be88ada Mon Sep 17 00:00:00 2001 From: Joe Liccese Date: Mon, 25 Jun 2012 12:22:09 -0500 Subject: [PATCH 092/156] powerpc: Add T4 LAC device tree binding & defs The Interlaken is a narrow, high speed channelized chip-to-chip interface. To facilitate interoperability between a data path device and a look-aside co-processor, the Interlaken Look-Aside protocol is defined for short transaction-related transfers. Although based on the Interlaken protocol, Interlaken Look-Aside is not directly compatible with Interlaken and can be considered a different operation mode. The Interlaken LA controller connects internal platform to Interlaken serial interface. It accepts LA command through software portals, which are system memory mapped 4KB spaces. The LA commands are then translated into the Interlaken control words and data words, which are sent on TX side to TCAM through SerDes lanes. Signed-off-by: Joe Liccese Signed-off-by: Scott Wood --- .../bindings/powerpc/fsl/interlaken-lac.txt | 309 ++++++++++++++++++ .../boot/dts/fsl/interlaken-lac-portals.dtsi | 156 +++++++++ arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi | 45 +++ 3 files changed, 510 insertions(+) create mode 100644 Documentation/devicetree/bindings/powerpc/fsl/interlaken-lac.txt create mode 100644 arch/powerpc/boot/dts/fsl/interlaken-lac-portals.dtsi create mode 100644 arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi diff --git a/Documentation/devicetree/bindings/powerpc/fsl/interlaken-lac.txt b/Documentation/devicetree/bindings/powerpc/fsl/interlaken-lac.txt new file mode 100644 index 000000000000..641bc13983e1 --- /dev/null +++ b/Documentation/devicetree/bindings/powerpc/fsl/interlaken-lac.txt @@ -0,0 +1,309 @@ +=============================================================================== +Freescale Interlaken Look-Aside Controller Device Bindings +Copyright 2012 Freescale Semiconductor Inc. + +CONTENTS + - Interlaken Look-Aside Controller (LAC) Node + - Example LAC Node + - Interlaken Look-Aside Controller (LAC) Software Portal Node + - Interlaken Look-Aside Controller (LAC) Software Portal Child Nodes + - Example LAC SWP Node with Child Nodes + +============================================================================== +Interlaken Look-Aside Controller (LAC) Node + +DESCRIPTION + +The Interlaken is a narrow, high speed channelized chip-to-chip interface. To +facilitate interoperability between a data path device and a look-aside +co-processor, the Interlaken Look-Aside protocol is defined for short +transaction-related transfers. Although based on the Interlaken protocol, +Interlaken Look-Aside is not directly compatible with Interlaken and can be +considered a different operation mode. + +The Interlaken LA controller connects internal platform to Interlaken serial +interface. It accepts LA command through software portals, which are system +memory mapped 4KB spaces. The LA commands are then translated into the +Interlaken control words and data words, which are sent on TX side to TCAM +through SerDes lanes. + +There are two 4KiB spaces defined within the LAC global register memory map. +There is a full register set at 0x0000-0x0FFF (also known as the "hypervisor" +version), and a subset at 0x1000-0x1FFF. The former is a superset of the +latter, and includes certain registers that should not be accessible to +partitioned software. Separate nodes are used for each region, with a phandle +linking the hypervisor node to the normal operating node. + +PROPERTIES + + - compatible + Usage: required + Value type: + Definition: Must include "fsl,interlaken-lac". This represents only + those LAC CCSR registers not protected in partitioned + software. The version of the device is determined by the LAC + IP Block Revision Register (IPBRR0) at offset 0x0BF8. + + Table of correspondences between IPBRR0 values and example + chips: + Value Device + ----------- ------- + 0x02000100 T4240 + + The Hypervisor node has a different compatible. It must include + "fsl,interlaken-lac-hv". This node represents the protected + LAC register space and is required except inside a partition + where access to the hypervisor node is to be denied. + + - fsl,non-hv-node + Usage: required in "fsl,interlaken-lac-hv" + Value type: + Definition: Points to the non-protected LAC CCSR mapped register space + node. + + - reg + Usage: required + Value type: + Definition: A standard property. The first resource represents the + Interlaken LAC configuration registers. + + - interrupts: + Usage: required in non-hv node only + Value type: + Definition: Interrupt mapping for Interlaken LAC error IRQ. + +EXAMPLE + lac: lac@229000 { + compatible = "fsl,interlaken-lac" + reg = <0x229000 0x1000>; + interrupts = <16 2 1 18>; + }; + + lac-hv@228000 { + compatible = "fsl,interlaken-lac-hv" + reg = <0x228000 0x1000>; + fsl,non-hv-node = <&lac>; + }; + +=============================================================================== +Interlaken Look-Aside Controller (LAC) Software Portal Container Node + +DESCRIPTION +The Interlaken Look-Aside Controller (LAC) utilizes Software Portals to accept +Interlaken Look-Aside (ILA) commands. The Interlaken LAC software portal +memory map occupies 128KB of memory space. The software portal memory space is +intended to be cache-enabled. WIMG for each software space is required to be +0010 if stashing is enabled; otherwise, WIMG can be 0000 or 0010. + +PROPERTIES + + - #address-cells + Usage: required + Value type: + Definition: A standard property. Must have a value of 1. + + - #size-cells + Usage: required + Value type: + Definition: A standard property. Must have a value of 1. + + - compatible + Usage: required + Value type: + Definition: Must include "fsl,interlaken-lac-portals" + + - ranges + Usage: required + Value type: + Definition: A standard property. Specifies the address and length + of the LAC portal memory space. + +=============================================================================== +Interlaken Look-Aside Controller (LAC) Software Portals Child Nodes + +DESCRIPTION +There are up to 24 available software portals with each software portal +requiring 4KB of consecutive memory within the software portal memory mapped +space. + +PROPERTIES + + - compatible + Usage: required + Value type: + Definition: Must include "fsl,interlaken-lac-portal-vX.Y" where X is + the Major version (IP_MJ) found in the LAC IP Block Revision + Register (IPBRR0), at offset 0x0BF8, and Y is the Minor version + (IP_MN). + + Table of correspondences between version values and example chips: + Value Device + ------ ------- + 1.0 T4240 + + - reg + Usage: required + Value type: + Definition: A standard property. The first resource represents the + Interlaken LAC software portal registers. + + - fsl,liodn + Value type: + Definition: The logical I/O device number (LIODN) for this device. The + LIODN is a number expressed by this device and used to perform + look-ups in the IOMMU (PAMU) address table when performing + DMAs. This property is automatically added by u-boot. + +=============================================================================== +EXAMPLE + +lac-portals { + #address-cells = <0x1>; + #size-cells = <0x1>; + compatible = "fsl,interlaken-lac-portals"; + ranges = <0x0 0xf 0xf4400000 0x20000>; + + lportal0: lac-portal@0 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x204>; + reg = <0x0 0x1000>; + }; + + lportal1: lac-portal@1000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x205>; + reg = <0x1000 0x1000>; + }; + + lportal2: lac-portal@2000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x206>; + reg = <0x2000 0x1000>; + }; + + lportal3: lac-portal@3000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x207>; + reg = <0x3000 0x1000>; + }; + + lportal4: lac-portal@4000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x208>; + reg = <0x4000 0x1000>; + }; + + lportal5: lac-portal@5000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x209>; + reg = <0x5000 0x1000>; + }; + + lportal6: lac-portal@6000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x20A>; + reg = <0x6000 0x1000>; + }; + + lportal7: lac-portal@7000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x20B>; + reg = <0x7000 0x1000>; + }; + + lportal8: lac-portal@8000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x20C>; + reg = <0x8000 0x1000>; + }; + + lportal9: lac-portal@9000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x20D>; + reg = <0x9000 0x1000>; + }; + + lportal10: lac-portal@A000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x20E>; + reg = <0xA000 0x1000>; + }; + + lportal11: lac-portal@B000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x20F>; + reg = <0xB000 0x1000>; + }; + + lportal12: lac-portal@C000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x210>; + reg = <0xC000 0x1000>; + }; + + lportal13: lac-portal@D000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x211>; + reg = <0xD000 0x1000>; + }; + + lportal14: lac-portal@E000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x212>; + reg = <0xE000 0x1000>; + }; + + lportal15: lac-portal@F000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x213>; + reg = <0xF000 0x1000>; + }; + + lportal16: lac-portal@10000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x214>; + reg = <0x10000 0x1000>; + }; + + lportal17: lac-portal@11000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x215>; + reg = <0x11000 0x1000>; + }; + + lportal8: lac-portal@1200 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x216>; + reg = <0x12000 0x1000>; + }; + + lportal19: lac-portal@13000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x217>; + reg = <0x13000 0x1000>; + }; + + lportal20: lac-portal@14000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x218>; + reg = <0x14000 0x1000>; + }; + + lportal21: lac-portal@15000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x219>; + reg = <0x15000 0x1000>; + }; + + lportal22: lac-portal@16000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x21A>; + reg = <0x16000 0x1000>; + }; + + lportal23: lac-portal@17000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + fsl,liodn = <0x21B>; + reg = <0x17000 0x1000>; + }; +}; diff --git a/arch/powerpc/boot/dts/fsl/interlaken-lac-portals.dtsi b/arch/powerpc/boot/dts/fsl/interlaken-lac-portals.dtsi new file mode 100644 index 000000000000..9cffccf4e07e --- /dev/null +++ b/arch/powerpc/boot/dts/fsl/interlaken-lac-portals.dtsi @@ -0,0 +1,156 @@ +/* T4240 Interlaken LAC Portal device tree stub with 24 portals. + * + * Copyright 2012 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#address-cells = <0x1>; +#size-cells = <0x1>; +compatible = "fsl,interlaken-lac-portals"; + +lportal0: lac-portal@0 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x0 0x1000>; +}; + +lportal1: lac-portal@1000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x1000 0x1000>; +}; + +lportal2: lac-portal@2000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x2000 0x1000>; +}; + +lportal3: lac-portal@3000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x3000 0x1000>; +}; + +lportal4: lac-portal@4000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x4000 0x1000>; +}; + +lportal5: lac-portal@5000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x5000 0x1000>; +}; + +lportal6: lac-portal@6000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x6000 0x1000>; +}; + +lportal7: lac-portal@7000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x7000 0x1000>; +}; + +lportal8: lac-portal@8000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x8000 0x1000>; +}; + +lportal9: lac-portal@9000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x9000 0x1000>; +}; + +lportal10: lac-portal@A000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0xA000 0x1000>; +}; + +lportal11: lac-portal@B000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0xB000 0x1000>; +}; + +lportal12: lac-portal@C000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0xC000 0x1000>; +}; + +lportal13: lac-portal@D000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0xD000 0x1000>; +}; + +lportal14: lac-portal@E000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0xE000 0x1000>; +}; + +lportal15: lac-portal@F000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0xF000 0x1000>; +}; + +lportal16: lac-portal@10000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x10000 0x1000>; +}; + +lportal17: lac-portal@11000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x11000 0x1000>; +}; + +lportal18: lac-portal@1200 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x12000 0x1000>; +}; + +lportal19: lac-portal@13000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x13000 0x1000>; +}; + +lportal20: lac-portal@14000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x14000 0x1000>; +}; + +lportal21: lac-portal@15000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x15000 0x1000>; +}; + +lportal22: lac-portal@16000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x16000 0x1000>; +}; + +lportal23: lac-portal@17000 { + compatible = "fsl,interlaken-lac-portal-v1.0"; + reg = <0x17000 0x1000>; +}; diff --git a/arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi b/arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi new file mode 100644 index 000000000000..e8208720ac0e --- /dev/null +++ b/arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi @@ -0,0 +1,45 @@ +/* + * T4 Interlaken Look-aside Controller (LAC) device tree stub + * + * Copyright 2012 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +lac: lac@229000 { + compatible = "fsl,interlaken-lac"; + reg = <0x229000 0x1000>; + interrupts = <16 2 1 18>; +}; + +lac-hv@228000 { + compatible = "fsl,interlaken-lac-hv"; + reg = <0x228000 0x1000>; + fsl,non-hv-node = <&lac>; +}; From 6d18c9042c037f8600c9648260f76eae5170c0c3 Mon Sep 17 00:00:00 2001 From: Gerhard Sittig Date: Tue, 18 Jun 2013 13:17:21 +0200 Subject: [PATCH 093/156] powerpc/mpc512x: commit re-generated defconfig This patch does not change the content, it merely re-orders configuration items and drops explicit options which already apply as the default. Signed-off-by: Gerhard Sittig Signed-off-by: Anatolij Gustschin --- arch/powerpc/configs/mpc512x_defconfig | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/configs/mpc512x_defconfig b/arch/powerpc/configs/mpc512x_defconfig index 0d0d981442fd..5b8ee8080ecc 100644 --- a/arch/powerpc/configs/mpc512x_defconfig +++ b/arch/powerpc/configs/mpc512x_defconfig @@ -1,7 +1,6 @@ -CONFIG_EXPERIMENTAL=y # CONFIG_SWAP is not set CONFIG_SYSVIPC=y -CONFIG_SPARSE_IRQ=y +CONFIG_NO_HZ=y CONFIG_LOG_BUF_SHIFT=16 CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set @@ -9,6 +8,7 @@ CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set +CONFIG_PARTITION_ADVANCED=y # CONFIG_IOSCHED_CFQ is not set # CONFIG_PPC_CHRP is not set CONFIG_PPC_MPC512x=y @@ -16,9 +16,7 @@ CONFIG_MPC5121_ADS=y CONFIG_MPC512x_GENERIC=y CONFIG_PDM360NG=y # CONFIG_PPC_PMAC is not set -CONFIG_NO_HZ=y CONFIG_HZ_1000=y -# CONFIG_MIGRATION is not set # CONFIG_SECCOMP is not set # CONFIG_PCI is not set CONFIG_NET=y @@ -33,8 +31,6 @@ CONFIG_IP_PNP=y # CONFIG_INET_DIAG is not set # CONFIG_IPV6 is not set CONFIG_CAN=y -CONFIG_CAN_RAW=y -CONFIG_CAN_BCM=y CONFIG_CAN_VCAN=y CONFIG_CAN_MSCAN=y CONFIG_CAN_DEBUG_DEVICES=y @@ -46,7 +42,6 @@ CONFIG_DEVTMPFS_MOUNT=y # CONFIG_FIRMWARE_IN_KERNEL is not set CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y @@ -60,7 +55,6 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=1 CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_BLK_DEV_XIP=y -CONFIG_MISC_DEVICES=y CONFIG_EEPROM_AT24=y CONFIG_EEPROM_AT25=y CONFIG_SCSI=y @@ -68,6 +62,7 @@ CONFIG_SCSI=y CONFIG_BLK_DEV_SD=y CONFIG_CHR_DEV_SG=y CONFIG_NETDEVICES=y +CONFIG_FS_ENET=y CONFIG_MARVELL_PHY=y CONFIG_DAVICOM_PHY=y CONFIG_QSEMI_PHY=y @@ -83,10 +78,6 @@ CONFIG_STE10XP=y CONFIG_LSI_ET1011C_PHY=y CONFIG_FIXED_PHY=y CONFIG_MDIO_BITBANG=y -CONFIG_NET_ETHERNET=y -CONFIG_FS_ENET=y -# CONFIG_NETDEV_1000 is not set -# CONFIG_NETDEV_10000 is not set # CONFIG_WLAN is not set # CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=y @@ -106,10 +97,7 @@ CONFIG_GPIO_SYSFS=y CONFIG_GPIO_MPC8XXX=y # CONFIG_HWMON is not set CONFIG_MEDIA_SUPPORT=y -CONFIG_VIDEO_DEV=y CONFIG_VIDEO_ADV_DEBUG=y -# CONFIG_VIDEO_HELPER_CHIPS_AUTO is not set -CONFIG_VIDEO_SAA711X=y CONFIG_FB=y CONFIG_FB_FSL_DIU=y # CONFIG_VGA_CONSOLE is not set @@ -129,9 +117,7 @@ CONFIG_TMPFS=y CONFIG_JFFS2_FS=y CONFIG_UBIFS_FS=y CONFIG_NFS_FS=y -CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y -CONFIG_PARTITION_ADVANCED=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y # CONFIG_ENABLE_WARN_DEPRECATED is not set From dd0120dea601de654a5a6d959da3a632a02200f0 Mon Sep 17 00:00:00 2001 From: Anatolij Gustschin Date: Tue, 25 Jun 2013 08:51:19 +0200 Subject: [PATCH 094/156] powerpc/mpc512x: enable USB support in defconfig Enable USB EHCI, mass storage and USB gadget support. Signed-off-by: Anatolij Gustschin --- arch/powerpc/configs/mpc512x_defconfig | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/powerpc/configs/mpc512x_defconfig b/arch/powerpc/configs/mpc512x_defconfig index 5b8ee8080ecc..ee853a1b1b2c 100644 --- a/arch/powerpc/configs/mpc512x_defconfig +++ b/arch/powerpc/configs/mpc512x_defconfig @@ -102,6 +102,13 @@ CONFIG_FB=y CONFIG_FB_FSL_DIU=y # CONFIG_VGA_CONSOLE is not set CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_USB=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_EHCI_FSL=y +# CONFIG_USB_EHCI_HCD_PPC_OF is not set +CONFIG_USB_STORAGE=y +CONFIG_USB_GADGET=y +CONFIG_USB_FSL_USB2=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_M41T80=y CONFIG_RTC_DRV_MPC5121=y From e7f345a2a36754e43673e86870a7a89101fb13e3 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Sat, 22 Jun 2013 04:29:48 -0400 Subject: [PATCH 095/156] macintosh/adb: Replace __WAITQUEUE_INITIALIZER with more standard DECLARE_WAITQUEUE. Signed-off-by: Robert P. J. Day Signed-off-by: Benjamin Herrenschmidt --- drivers/macintosh/adb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c index b026896206ca..04a50498f257 100644 --- a/drivers/macintosh/adb.c +++ b/drivers/macintosh/adb.c @@ -697,7 +697,7 @@ static ssize_t adb_read(struct file *file, char __user *buf, int ret = 0; struct adbdev_state *state = file->private_data; struct adb_request *req; - wait_queue_t wait = __WAITQUEUE_INITIALIZER(wait,current); + DECLARE_WAITQUEUE(wait,current); unsigned long flags; if (count < 2) From b0b0aa9c7faf94e92320eabd8a1786c7747e40a8 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Mon, 24 Jun 2013 15:47:22 +1000 Subject: [PATCH 096/156] powerpc/hw_brk: Fix setting of length for exact mode breakpoints The smallest match region for both the DABR and DAWR is 8 bytes, so the kernel needs to filter matches when users want to look at regions smaller than this. Currently we set the length of PPC_BREAKPOINT_MODE_EXACT breakpoints to 8. This is wrong as in exact mode we should only match on 1 address, hence the length should be 1. This ensures that the kernel will filter out any exact mode hardware breakpoint matches on any addresses other than the requested one. Signed-off-by: Michael Neuling Reported-by: Edjunior Barbosa Machado Cc: stable@vger.kernel.org Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/ptrace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 98c2fc198712..64f7bd5b1b0f 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -1449,7 +1449,9 @@ static long ppc_set_hwdebug(struct task_struct *child, */ if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) { len = bp_info->addr2 - bp_info->addr; - } else if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) { + } else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT) + len = 1; + else { ptrace_put_breakpoints(child); return -EINVAL; } From 540e07c67efe42ef6b6be4f1956931e676d58a15 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Mon, 24 Jun 2013 15:47:23 +1000 Subject: [PATCH 097/156] powerpc/hw_brk: Fix clearing of extraneous IRQ In 9422de3 "powerpc: Hardware breakpoints rewrite to handle non DABR breakpoint registers" we changed the way we mark extraneous irqs with this: - info->extraneous_interrupt = !((bp->attr.bp_addr <= dar) && - (dar - bp->attr.bp_addr < bp->attr.bp_len)); + if (!((bp->attr.bp_addr <= dar) && + (dar - bp->attr.bp_addr < bp->attr.bp_len))) + info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; Unfortunately this is bogus as it never clears extraneous IRQ if it's already set. This correctly clears extraneous IRQ before possibly setting it. Signed-off-by: Michael Neuling Reported-by: Edjunior Barbosa Machado Cc: stable@vger.kernel.org Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/hw_breakpoint.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index a949bdfc9623..1150ae7c22c3 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -250,6 +250,7 @@ int __kprobes hw_breakpoint_handler(struct die_args *args) * we still need to single-step the instruction, but we don't * generate an event. */ + info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ; if (!((bp->attr.bp_addr <= dar) && (dar - bp->attr.bp_addr < bp->attr.bp_len))) info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; From 99b308e3bbacc26509858ac78ebc7a0a1dfbb888 Mon Sep 17 00:00:00 2001 From: Aruna Balakrishnaiah Date: Mon, 24 Jun 2013 12:23:00 +0530 Subject: [PATCH 098/156] powerpc/pseries: Enable PSTORE in pseries_defconfig Since now we have pstore support for nvram in pseries, enable it in the default config. With this config option enabled, pstore infra-structure will be used to read/write the messages from/to nvram. Signed-off-by: Aruna Balakrishnaiah Acked-by: Michael Neuling Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/configs/pseries_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index c4dfbaf8b192..bea8587c3af5 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -296,6 +296,7 @@ CONFIG_SQUASHFS=m CONFIG_SQUASHFS_XATTR=y CONFIG_SQUASHFS_LZO=y CONFIG_SQUASHFS_XZ=y +CONFIG_PSTORE=y CONFIG_NFS_FS=y CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=y From ff1e7683418798e44eb8af1ab9f28dd475af6034 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 24 Jun 2013 09:35:55 -0500 Subject: [PATCH 099/156] powerpc/mm: Fix build warnings with CONFIG_TRANSPARENT_HUGEPAGE disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building with CONFIG_TRANSPARENT_HUGEPAGE disabled causes the following build wearnings; powerpc/arch/powerpc/include/asm/mmu-hash64.h: In function ‘__hash_page_thp’: powerpc/arch/powerpc/include/asm/mmu-hash64.h:354: warning: no return statement in function returning non-void This patch adds a return -1 to the static inline for __hash_page_thp() to correct the warnings. Signed-off-by: Nathan Fontenot Reviewed-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mmu-hash64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 3d6fbb00d20b..c4cf01197273 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -351,6 +351,7 @@ static inline int __hash_page_thp(unsigned long ea, unsigned long access, int ssize, unsigned int psize) { BUG(); + return -1; } #endif extern void hash_failure_debug(unsigned long ea, unsigned long access, From ef6a28577398df2853abf123cb4a2e0c57eb879a Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 25 Jun 2013 14:35:27 +0800 Subject: [PATCH 100/156] powerpc/eeh: Remove eeh_mutex Originally, eeh_mutex was introduced to protect the PE hierarchy tree and the attached EEH devices because EEH core was possiblly running with multiple threads to access the PE hierarchy tree. However, we now have only one kthread in EEH core. So we needn't the eeh_mutex and just remove it. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/eeh.h | 14 -------------- arch/powerpc/kernel/eeh.c | 3 --- arch/powerpc/kernel/eeh_pe.c | 30 +----------------------------- 3 files changed, 1 insertion(+), 46 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index a0b11fb3237e..dd65e311c93a 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -151,7 +151,6 @@ struct eeh_ops { extern struct eeh_ops *eeh_ops; extern int eeh_subsystem_enabled; -extern struct mutex eeh_mutex; extern raw_spinlock_t confirm_error_lock; extern int eeh_probe_mode; @@ -173,16 +172,6 @@ static inline int eeh_probe_mode_dev(void) return (eeh_probe_mode == EEH_PROBE_MODE_DEV); } -static inline void eeh_lock(void) -{ - mutex_lock(&eeh_mutex); -} - -static inline void eeh_unlock(void) -{ - mutex_unlock(&eeh_mutex); -} - static inline void eeh_serialize_lock(unsigned long *flags) { raw_spin_lock_irqsave(&confirm_error_lock, *flags); @@ -271,9 +260,6 @@ static inline void eeh_add_sysfs_files(struct pci_bus *bus) { } static inline void eeh_remove_bus_device(struct pci_dev *dev, int purge_pe) { } -static inline void eeh_lock(void) { } -static inline void eeh_unlock(void) { } - #define EEH_POSSIBLE_ERROR(val, type) (0) #define EEH_IO_ERROR_VALUE(size) (-1UL) #endif /* CONFIG_EEH */ diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index af2b9ae07df5..e5796524631c 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -103,9 +103,6 @@ EXPORT_SYMBOL(eeh_subsystem_enabled); */ int eeh_probe_mode; -/* Global EEH mutex */ -DEFINE_MUTEX(eeh_mutex); - /* Lock to avoid races due to multiple reports of an error */ DEFINE_RAW_SPINLOCK(confirm_error_lock); diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index ae75722c0583..55943fcdaeb8 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -78,9 +78,7 @@ int eeh_phb_pe_create(struct pci_controller *phb) } /* Put it into the list */ - eeh_lock(); list_add_tail(&pe->child, &eeh_phb_pe); - eeh_unlock(); pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number); @@ -185,21 +183,15 @@ void *eeh_pe_dev_traverse(struct eeh_pe *root, return NULL; } - eeh_lock(); - /* Traverse root PE */ for (pe = root; pe; pe = eeh_pe_next(pe, root)) { eeh_pe_for_each_dev(pe, edev) { ret = fn(edev, flag); - if (ret) { - eeh_unlock(); + if (ret) return ret; - } } } - eeh_unlock(); - return NULL; } @@ -305,8 +297,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) { struct eeh_pe *pe, *parent; - eeh_lock(); - /* * Search the PE has been existing or not according * to the PE address. If that has been existing, the @@ -316,7 +306,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) pe = eeh_pe_get(edev); if (pe && !(pe->type & EEH_PE_INVALID)) { if (!edev->pe_config_addr) { - eeh_unlock(); pr_err("%s: PE with addr 0x%x already exists\n", __func__, edev->config_addr); return -EEXIST; @@ -328,7 +317,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) /* Put the edev to PE */ list_add_tail(&edev->list, &pe->edevs); - eeh_unlock(); pr_debug("EEH: Add %s to Bus PE#%x\n", edev->dn->full_name, pe->addr); @@ -347,7 +335,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) parent->type &= ~EEH_PE_INVALID; parent = parent->parent; } - eeh_unlock(); pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n", edev->dn->full_name, pe->addr, pe->parent->addr); @@ -357,7 +344,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) /* Create a new EEH PE */ pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE); if (!pe) { - eeh_unlock(); pr_err("%s: out of memory!\n", __func__); return -ENOMEM; } @@ -385,7 +371,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) if (!parent) { parent = eeh_phb_pe_get(edev->phb); if (!parent) { - eeh_unlock(); pr_err("%s: No PHB PE is found (PHB Domain=%d)\n", __func__, edev->phb->global_number); edev->pe = NULL; @@ -402,7 +387,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) list_add_tail(&pe->child, &parent->child_list); list_add_tail(&edev->list, &pe->edevs); edev->pe = pe; - eeh_unlock(); pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n", edev->dn->full_name, pe->addr, pe->parent->addr); @@ -430,8 +414,6 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe) return -EEXIST; } - eeh_lock(); - /* Remove the EEH device */ pe = edev->pe; edev->pe = NULL; @@ -476,8 +458,6 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe) pe = parent; } - eeh_unlock(); - return 0; } @@ -550,9 +530,7 @@ static void *__eeh_pe_state_mark(void *data, void *flag) */ void eeh_pe_state_mark(struct eeh_pe *pe, int state) { - eeh_lock(); eeh_pe_traverse(pe, __eeh_pe_state_mark, &state); - eeh_unlock(); } /** @@ -586,9 +564,7 @@ static void *__eeh_pe_state_clear(void *data, void *flag) */ void eeh_pe_state_clear(struct eeh_pe *pe, int state) { - eeh_lock(); eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); - eeh_unlock(); } /** @@ -673,8 +649,6 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) struct eeh_dev *edev; struct pci_dev *pdev; - eeh_lock(); - if (pe->type & EEH_PE_PHB) { bus = pe->phb->bus; } else if (pe->type & EEH_PE_BUS || @@ -691,7 +665,5 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) } out: - eeh_unlock(); - return bus; } From 5459ae1431f5d22ab10fa8b56fb16c018289fdfc Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 25 Jun 2013 14:35:28 +0800 Subject: [PATCH 101/156] powerpc/eeh: Use interruptible sleep in keehd To replace down() with down_interrutible() to avoid following warning: [c00000007ba7b710] [c000000000014410] .__switch_to+0x1b0/0x380 [c00000007ba7b7c0] [c0000000007b408c] .__schedule+0x3ec/0x970 [c00000007ba7ba50] [c0000000007b1f24] .schedule_timeout+0x1a4/0x2b0 [c00000007ba7bb30] [c0000000007b34a4] .__down+0xa4/0x104 [c00000007ba7bbf0] [c0000000000b9230] .down+0x60/0x70 [c00000007ba7bc80] [c0000000000336d0] .eeh_event_handler+0x70/0x190 [c00000007ba7bd30] [c0000000000b1a58] .kthread+0xe8/0xf0 [c00000007ba7be30] [c00000000000a05c] .ret_from_kernel_thread+0x5c/0x8 This also avoids keeping the load average up while doing nothing. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/eeh_event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index 39bcd81e7f5d..d27c5afc90ae 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -55,7 +55,8 @@ static int eeh_event_handler(void * dummy) struct eeh_pe *pe; while (!kthread_should_stop()) { - down(&eeh_eventlist_sem); + if (down_interruptible(&eeh_eventlist_sem)) + break; /* Fetch EEH event from the queue */ spin_lock_irqsave(&eeh_eventlist_lock, flags); From 98c7355fb373d7c29e5c45d0a423810ad2476b34 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 8 Oct 2012 20:39:52 +0800 Subject: [PATCH 102/156] powerpc/83xx: use module_i2c_driver to simplify the code Use the module_i2c_driver() macro to make the code smaller and a bit simpler. dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Signed-off-by: Scott Wood --- arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c index 624cb51d19c9..7bc315822935 100644 --- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c +++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c @@ -231,17 +231,7 @@ static struct i2c_driver mcu_driver = { .id_table = mcu_ids, }; -static int __init mcu_init(void) -{ - return i2c_add_driver(&mcu_driver); -} -module_init(mcu_init); - -static void __exit mcu_exit(void) -{ - i2c_del_driver(&mcu_driver); -} -module_exit(mcu_exit); +module_i2c_driver(mcu_driver); MODULE_DESCRIPTION("Power Management and GPIO expander driver for " "MPC8349E-mITX-compatible MCU"); From a16d8aa4726a944ffc1616689ae34ff6a902faba Mon Sep 17 00:00:00 2001 From: Sachin Surendran Date: Mon, 26 Nov 2012 11:20:01 +1300 Subject: [PATCH 103/156] i2c-cpm: Fix to takeback i2c bus master-ship after a collision In case of collision on i2c bus the controller which lost bus mastership stays as a slave for all subsequent transfers. This results in the i2c controller never writing to the bus for future transactions, resulting in i2c transfer timeouts. This fix checks for a collision on last I2C transaction and sets the I2COM_MASTER bit for the new transaction. Signed-off-by: Sachin Surendran Signed-off-by: Scott Wood --- drivers/i2c/busses/i2c-cpm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/i2c/busses/i2c-cpm.c b/drivers/i2c/busses/i2c-cpm.c index 3823623baa48..9e6002108720 100644 --- a/drivers/i2c/busses/i2c-cpm.c +++ b/drivers/i2c/busses/i2c-cpm.c @@ -338,6 +338,14 @@ static int cpm_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) tptr = 0; rptr = 0; + /* + * If there was a collision in the last i2c transaction, + * Set I2COM_MASTER as it was cleared during collision. + */ + if (in_be16(&tbdf->cbd_sc) & BD_SC_CL) { + out_8(&cpm->i2c_reg->i2com, I2COM_MASTER); + } + while (tptr < num) { pmsg = &msgs[tptr]; dev_dbg(&adap->dev, "R: %d T: %d\n", rptr, tptr); From 3766a1abc53164f058d74f950599c797cb3fe302 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 25 Jun 2013 14:15:36 -0700 Subject: [PATCH 104/156] mm/thp: define HPAGE_PMD_* constants as BUILD_BUG() if !THP Currently, HPAGE_PMD_* constans rely on PMD_SHIFT regardless of CONFIG_TRANSPARENT_HUGEPAGE. PMD_SHIFT is not defined everywhere (e.g. arm nommu case). It means we can't use anything like this in generic code: if (PageTransHuge(page)) zero_huge_user(page, 0, HPAGE_PMD_SIZE); else clear_highpage(page); For !THP case, PageTransHuge() is 0 and compiler can eliminate zero_huge_user() call. But it still need to be valid C expression, means HPAGE_PMD_SIZE has to expand to something compiler can understand. Previously, HPAGE_PMD_* were defined to BUILD_BUG() for !THP. Let's come back to it. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Benjamin Herrenschmidt --- include/linux/huge_mm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index cc276d2c3a40..e2dbefb38e3b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -58,11 +58,12 @@ extern pmd_t *page_check_address_pmd(struct page *page, #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1< Date: Tue, 5 Mar 2013 17:52:49 +0200 Subject: [PATCH 105/156] powerpc/watchdog: Don't enable interrupt on PPC64 BookE Critical interrupts are not handled on PPC64 BookE machines, so when the first watchdog interrupt fires the machine will freeze without a warning until it's rebooted by the second watchdog trigger. Plus, the interrupt isn't used anyway since the driver expects a usermode app to ping the watchdog periodically. Signed-off-by: Laurentiu Tudor Signed-off-by: Scott Wood --- drivers/watchdog/booke_wdt.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/watchdog/booke_wdt.c b/drivers/watchdog/booke_wdt.c index a8dbceb32914..f1b8d555080e 100644 --- a/drivers/watchdog/booke_wdt.c +++ b/drivers/watchdog/booke_wdt.c @@ -138,6 +138,14 @@ static void __booke_wdt_enable(void *data) val &= ~WDTP_MASK; val |= (TCR_WIE|TCR_WRC(WRC_CHIP)|WDTP(booke_wdt_period)); +#ifdef CONFIG_PPC_BOOK3E_64 + /* + * Crit ints are currently broken on PPC64 Book-E, so + * just disable them for now. + */ + val &= ~TCR_WIE; +#endif + mtspr(SPRN_TCR, val); } From e1b85c17bf3e4f2ecbf9ec824c4048a06078100b Mon Sep 17 00:00:00 2001 From: Sebastien Bessiere Date: Fri, 14 Jun 2013 17:57:03 +0200 Subject: [PATCH 106/156] trivial: powerpc: Fix typo in ioei_interrupt() description Signed-off-by: Sebastien Bessiere Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/io_event_irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/io_event_irq.c b/arch/powerpc/platforms/pseries/io_event_irq.c index ef9d9d84c7d5..5ea88d1541f7 100644 --- a/arch/powerpc/platforms/pseries/io_event_irq.c +++ b/arch/powerpc/platforms/pseries/io_event_irq.c @@ -115,7 +115,7 @@ static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog) * by scope or event type alone. For example, Torrent ISR route change * event is reported with scope 0x00 (Not Applicatable) rather than * 0x3B (Torrent-hub). It is better to let the clients to identify - * who owns the the event. + * who owns the event. */ static irqreturn_t ioei_interrupt(int irq, void *dev_id) From 80aa0fb4940bf8ee52bcb574d74459a7aea45621 Mon Sep 17 00:00:00 2001 From: James Yang Date: Tue, 25 Jun 2013 11:41:05 -0500 Subject: [PATCH 107/156] powerpc: Fix string instr. emulation for 32-bit processes on ppc64 String instruction emulation would erroneously result in a segfault if the upper bits of the EA are set and is so high that it fails access check. Truncate the EA to 32 bits if the process is 32-bit. Signed-off-by: James Yang Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/traps.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 071f6e040eb2..300daf3e7ab0 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -866,6 +866,10 @@ static int emulate_string_inst(struct pt_regs *regs, u32 instword) u8 val; u32 shift = 8 * (3 - (pos & 0x3)); + /* if process is 32-bit, clear upper 32 bits of EA */ + if ((regs->msr & MSR_64BIT) == 0) + EA &= 0xFFFFFFFF; + switch ((instword & PPC_INST_STRING_MASK)) { case PPC_INST_LSWX: case PPC_INST_LSWI: From 090b9284d72561644d17a6e568d29c9e472c9865 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Fri, 28 Jun 2013 18:17:09 +1000 Subject: [PATCH 108/156] powerpc/tm: Clear MSR RI in non-recoverable TM code When we treclaim and trecheckpoint there's an unavoidable period when r1 will not be a valid kernel stack pointer. This patch clears the MSR recoverable interrupt (RI) bit over these regions to indicate we have an invalid kernel stack pointer. For treclaim, the region over which we clear MSR RI is larger than required to avoid the need for an extra costly mtmsrd. Thanks to Paulus for suggesting this change. Signed-off-by: Michael Neuling Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/tm.S | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 2da67e7a16d5..51be8fb24803 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -112,9 +112,18 @@ _GLOBAL(tm_reclaim) std r3, STACK_PARAM(0)(r1) SAVE_NVGPRS(r1) + /* We need to setup MSR for VSX register save instructions. Here we + * also clear the MSR RI since when we do the treclaim, we won't have a + * valid kernel pointer for a while. We clear RI here as it avoids + * adding another mtmsr closer to the treclaim. This makes the region + * maked as non-recoverable wider than it needs to be but it saves on + * inserting another mtmsrd later. + */ mfmsr r14 mr r15, r14 ori r15, r15, MSR_FP + li r16, MSR_RI + andc r15, r15, r16 oris r15, r15, MSR_VEC@h #ifdef CONFIG_VSX BEGIN_FTR_SECTION @@ -349,9 +358,10 @@ restore_gprs: mtcr r5 mtxer r6 - /* MSR and flags: We don't change CRs, and we don't need to alter - * MSR. + /* Clear the MSR RI since we are about to change R1. EE is already off */ + li r4, 0 + mtmsrd r4, 1 REST_4GPRS(0, r7) /* GPR0-3 */ REST_GPR(4, r7) /* GPR4-6 */ @@ -377,6 +387,10 @@ restore_gprs: GET_PACA(r13) GET_SCRATCH0(r1) + /* R1 is restored, so we are recoverable again. EE is still off */ + li r4, MSR_RI + mtmsrd r4, 1 + REST_NVGPRS(r1) addi r1, r1, TM_FRAME_SIZE From c35ae1796bd4865bad322645a7edb92d223dfb51 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 27 Jun 2013 13:46:42 +0800 Subject: [PATCH 109/156] powerpc/eeh: Don't collect PCI-CFG data on PHB When the PHB is fenced or dead, it's pointless to collect the data from PCI config space of subordinate PCI devices since it should return 0xFF's. The patch also fixes overwritten buffer while getting PCI config data. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/eeh.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index e5796524631c..416fb432d7e2 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -232,16 +232,30 @@ void eeh_slot_error_detail(struct eeh_pe *pe, int severity) { size_t loglen = 0; struct eeh_dev *edev; + bool valid_cfg_log = true; - eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); - eeh_ops->configure_bridge(pe); - eeh_pe_restore_bars(pe); + /* + * When the PHB is fenced or dead, it's pointless to collect + * the data from PCI config space because it should return + * 0xFF's. For ER, we still retrieve the data from the PCI + * config space. + */ + if (eeh_probe_mode_dev() && + (pe->type & EEH_PE_PHB) && + (pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD))) + valid_cfg_log = false; - pci_regs_buf[0] = 0; - eeh_pe_for_each_dev(pe, edev) { - loglen += eeh_gather_pci_data(edev, pci_regs_buf, - EEH_PCI_REGS_LOG_LEN); - } + if (valid_cfg_log) { + eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); + eeh_ops->configure_bridge(pe); + eeh_pe_restore_bars(pe); + + pci_regs_buf[0] = 0; + eeh_pe_for_each_dev(pe, edev) { + loglen += eeh_gather_pci_data(edev, pci_regs_buf + loglen, + EEH_PCI_REGS_LOG_LEN - loglen); + } + } eeh_ops->get_log(pe, severity, pci_regs_buf, loglen); } From 652defed48757ae0dcc851beeb8fdd484bad40c6 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 27 Jun 2013 13:46:43 +0800 Subject: [PATCH 110/156] powerpc/eeh: Check PCIe link after reset After reset (e.g. complete reset) in order to bring the fenced PHB back, the PCIe link might not be ready yet. The patch intends to make sure the PCIe link is ready before accessing its subordinate PCI devices. The patch also fixes that wrong values restored to PCI_COMMAND register for PCI bridges. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/eeh_pe.c | 157 ++++++++++++++++++++++++++++++++--- 1 file changed, 144 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 55943fcdaeb8..016588a6f5ed 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -22,6 +22,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include #include @@ -567,30 +568,132 @@ void eeh_pe_state_clear(struct eeh_pe *pe, int state) eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); } -/** - * eeh_restore_one_device_bars - Restore the Base Address Registers for one device - * @data: EEH device - * @flag: Unused +/* + * Some PCI bridges (e.g. PLX bridges) have primary/secondary + * buses assigned explicitly by firmware, and we probably have + * lost that after reset. So we have to delay the check until + * the PCI-CFG registers have been restored for the parent + * bridge. * - * Loads the PCI configuration space base address registers, - * the expansion ROM base address, the latency timer, and etc. - * from the saved values in the device node. + * Don't use normal PCI-CFG accessors, which probably has been + * blocked on normal path during the stage. So we need utilize + * eeh operations, which is always permitted. */ -static void *eeh_restore_one_device_bars(void *data, void *flag) +static void eeh_bridge_check_link(struct pci_dev *pdev, + struct device_node *dn) +{ + int cap; + uint32_t val; + int timeout = 0; + + /* + * We only check root port and downstream ports of + * PCIe switches + */ + if (!pci_is_pcie(pdev) || + (pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT && + pci_pcie_type(pdev) != PCI_EXP_TYPE_DOWNSTREAM)) + return; + + pr_debug("%s: Check PCIe link for %s ...\n", + __func__, pci_name(pdev)); + + /* Check slot status */ + cap = pdev->pcie_cap; + eeh_ops->read_config(dn, cap + PCI_EXP_SLTSTA, 2, &val); + if (!(val & PCI_EXP_SLTSTA_PDS)) { + pr_debug(" No card in the slot (0x%04x) !\n", val); + return; + } + + /* Check power status if we have the capability */ + eeh_ops->read_config(dn, cap + PCI_EXP_SLTCAP, 2, &val); + if (val & PCI_EXP_SLTCAP_PCP) { + eeh_ops->read_config(dn, cap + PCI_EXP_SLTCTL, 2, &val); + if (val & PCI_EXP_SLTCTL_PCC) { + pr_debug(" In power-off state, power it on ...\n"); + val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC); + val |= (0x0100 & PCI_EXP_SLTCTL_PIC); + eeh_ops->write_config(dn, cap + PCI_EXP_SLTCTL, 2, val); + msleep(2 * 1000); + } + } + + /* Enable link */ + eeh_ops->read_config(dn, cap + PCI_EXP_LNKCTL, 2, &val); + val &= ~PCI_EXP_LNKCTL_LD; + eeh_ops->write_config(dn, cap + PCI_EXP_LNKCTL, 2, val); + + /* Check link */ + eeh_ops->read_config(dn, cap + PCI_EXP_LNKCAP, 4, &val); + if (!(val & PCI_EXP_LNKCAP_DLLLARC)) { + pr_debug(" No link reporting capability (0x%08x) \n", val); + msleep(1000); + return; + } + + /* Wait the link is up until timeout (5s) */ + timeout = 0; + while (timeout < 5000) { + msleep(20); + timeout += 20; + + eeh_ops->read_config(dn, cap + PCI_EXP_LNKSTA, 2, &val); + if (val & PCI_EXP_LNKSTA_DLLLA) + break; + } + + if (val & PCI_EXP_LNKSTA_DLLLA) + pr_debug(" Link up (%s)\n", + (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB"); + else + pr_debug(" Link not ready (0x%04x)\n", val); +} + +#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) +#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)]) + +static void eeh_restore_bridge_bars(struct pci_dev *pdev, + struct eeh_dev *edev, + struct device_node *dn) +{ + int i; + + /* + * Device BARs: 0x10 - 0x18 + * Bus numbers and windows: 0x18 - 0x30 + */ + for (i = 4; i < 13; i++) + eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]); + /* Rom: 0x38 */ + eeh_ops->write_config(dn, 14*4, 4, edev->config_space[14]); + + /* Cache line & Latency timer: 0xC 0xD */ + eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1, + SAVED_BYTE(PCI_CACHE_LINE_SIZE)); + eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1, + SAVED_BYTE(PCI_LATENCY_TIMER)); + /* Max latency, min grant, interrupt ping and line: 0x3C */ + eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]); + + /* PCI Command: 0x4 */ + eeh_ops->write_config(dn, PCI_COMMAND, 4, edev->config_space[1]); + + /* Check the PCIe link is ready */ + eeh_bridge_check_link(pdev, dn); +} + +static void eeh_restore_device_bars(struct eeh_dev *edev, + struct device_node *dn) { int i; u32 cmd; - struct eeh_dev *edev = (struct eeh_dev *)data; - struct device_node *dn = eeh_dev_to_of_node(edev); for (i = 4; i < 10; i++) eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]); /* 12 == Expansion ROM Address */ eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]); -#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) -#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)]) - eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1, SAVED_BYTE(PCI_CACHE_LINE_SIZE)); eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1, @@ -613,6 +716,34 @@ static void *eeh_restore_one_device_bars(void *data, void *flag) else cmd &= ~PCI_COMMAND_SERR; eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd); +} + +/** + * eeh_restore_one_device_bars - Restore the Base Address Registers for one device + * @data: EEH device + * @flag: Unused + * + * Loads the PCI configuration space base address registers, + * the expansion ROM base address, the latency timer, and etc. + * from the saved values in the device node. + */ +static void *eeh_restore_one_device_bars(void *data, void *flag) +{ + struct pci_dev *pdev = NULL; + struct eeh_dev *edev = (struct eeh_dev *)data; + struct device_node *dn = eeh_dev_to_of_node(edev); + + /* Trace the PCI bridge */ + if (eeh_probe_mode_dev()) { + pdev = eeh_dev_to_pci_dev(edev); + if (pdev->hdr_type != PCI_HEADER_TYPE_BRIDGE) + pdev = NULL; + } + + if (pdev) + eeh_restore_bridge_bars(pdev, edev, dn); + else + eeh_restore_device_bars(edev, dn); return NULL; } From 0b9e267d71d2e74d1108785928fd8c8c9dbf441e Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 27 Jun 2013 13:46:44 +0800 Subject: [PATCH 111/156] powerpc/powernv: Replace variables with flags We have 2 fields in "struct pnv_phb" to trace the states. The patch replace the fields with one and introduces flags for that. The patch doesn't impact the logic. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/eeh-ioda.c | 8 ++++---- arch/powerpc/platforms/powernv/pci.c | 4 ++-- arch/powerpc/platforms/powernv/pci.h | 7 +++++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index 84f3036511cb..85025d7e6396 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -132,7 +132,7 @@ static int ioda_eeh_post_init(struct pci_controller *hose) &ioda_eeh_dbgfs_ops); #endif - phb->eeh_enabled = 1; + phb->eeh_state |= PNV_EEH_STATE_ENABLED; } return 0; @@ -815,7 +815,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) * removed, we needn't take care of it any more. */ phb = hose->private_data; - if (phb->removed) + if (phb->eeh_state & PNV_EEH_STATE_REMOVED) continue; rc = opal_pci_next_error(phb->opal_id, @@ -850,7 +850,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { phb = hose->private_data; - phb->removed = 1; + phb->eeh_state |= PNV_EEH_STATE_REMOVED; } WARN(1, "EEH: dead IOC detected\n"); @@ -867,7 +867,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) WARN(1, "EEH: dead PHB#%x detected\n", hose->global_number); - phb->removed = 1; + phb->eeh_state |= PNV_EEH_STATE_REMOVED; ret = 3; goto out; } else if (severity == OPAL_EEH_SEV_PHB_FENCED) { diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 577cbeadb0ea..4c91e6dd1af2 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -309,7 +309,7 @@ static int pnv_pci_read_config(struct pci_bus *bus, if (phb_pe && (phb_pe->state & EEH_PE_ISOLATED)) return PCIBIOS_SUCCESSFUL; - if (phb->eeh_enabled) { + if (phb->eeh_state & PNV_EEH_STATE_ENABLED) { if (*val == EEH_IO_ERROR_VALUE(size)) { busdn = pci_bus_to_OF_node(bus); for (dn = busdn->child; dn; dn = dn->sibling) { @@ -359,7 +359,7 @@ static int pnv_pci_write_config(struct pci_bus *bus, /* Check if the PHB got frozen due to an error (no response) */ #ifdef CONFIG_EEH - if (!phb->eeh_enabled) + if (!(phb->eeh_state & PNV_EEH_STATE_ENABLED)) pnv_pci_config_check_eeh(phb, bus, bdfn); #else pnv_pci_config_check_eeh(phb, bus, bdfn); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 43906e3ab26f..40bdf0219b96 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -78,6 +78,10 @@ struct pnv_eeh_ops { int (*configure_bridge)(struct eeh_pe *pe); int (*next_error)(struct eeh_pe **pe); }; + +#define PNV_EEH_STATE_ENABLED (1 << 0) /* EEH enabled */ +#define PNV_EEH_STATE_REMOVED (1 << 1) /* PHB removed */ + #endif /* CONFIG_EEH */ struct pnv_phb { @@ -92,8 +96,7 @@ struct pnv_phb { #ifdef CONFIG_EEH struct pnv_eeh_ops *eeh_ops; - int eeh_enabled; - int removed; + int eeh_state; #endif #ifdef CONFIG_DEBUG_FS From 88b6d14b2bb48ea4f66fedfe671f98544395b305 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 27 Jun 2013 13:46:45 +0800 Subject: [PATCH 112/156] powerpc/eeh: Fix address catch for PowerNV On the PowerNV platform, the EEH address cache isn't built correctly because we skipped the EEH devices without binding PE. The patch fixes that. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/eeh_cache.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index 1d5d9a6ba740..858ebeaa2bac 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -194,7 +194,7 @@ static void __eeh_addr_cache_insert_dev(struct pci_dev *dev) } /* Skip any devices for which EEH is not enabled. */ - if (!edev->pe) { + if (!eeh_probe_mode_dev() && !edev->pe) { #ifdef DEBUG pr_info("PCI: skip building address cache for=%s - %s\n", pci_name(dev), dn->full_name); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index dc4ec7956967..c393bf59f113 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -999,6 +999,7 @@ static void pnv_pci_ioda_fixup(void) pnv_pci_ioda_create_dbgfs(); #ifdef CONFIG_EEH + eeh_probe_mode_set(EEH_PROBE_MODE_DEV); eeh_addr_cache_build(); eeh_init(); #endif From 56ca4fde90009094b1a46971de3879d5f2dd724e Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 27 Jun 2013 13:46:46 +0800 Subject: [PATCH 113/156] powerpc/eeh: Refactor the output message We needn't the the whole backtrace other than one-line message in the error reporting interrupt handler. For errors triggered by access PCI config space or MMIO, we replace "WARN(1, ...)" with pr_err() and dump_stack(). The patch also adds more output messages to indicate what EEH core is doing. Besides, some printk() are replaced with pr_warning(). Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/eeh.c | 9 ++++++-- arch/powerpc/kernel/eeh_driver.c | 23 ++++++++++++++++----- arch/powerpc/platforms/powernv/eeh-ioda.c | 25 +++++++++++++++-------- 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 416fb432d7e2..3a8f82fd9005 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -329,7 +329,9 @@ static int eeh_phb_check_failure(struct eeh_pe *pe) eeh_serialize_unlock(flags); eeh_send_failure_event(phb_pe); - WARN(1, "EEH: PHB failure detected\n"); + pr_err("EEH: PHB#%x failure detected\n", + phb_pe->phb->global_number); + dump_stack(); return 1; out: @@ -458,7 +460,10 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * a stack trace will help the device-driver authors figure * out what happened. So print that out. */ - WARN(1, "EEH: failure detected\n"); + pr_err("EEH: Frozen PE#%x detected on PHB#%x\n", + pe->addr, pe->phb->global_number); + dump_stack(); + return 1; dn_unlock: diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 0974e1326842..2b1ce17cae50 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -425,6 +425,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) * status ... if any child can't handle the reset, then the entire * slot is dlpar removed and added. */ + pr_info("EEH: Notify device drivers to shutdown\n"); eeh_pe_dev_traverse(pe, eeh_report_error, &result); /* Get the current PCI slot state. This can take a long time, @@ -432,7 +433,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) */ rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { - printk(KERN_WARNING "EEH: Permanent failure\n"); + pr_warning("EEH: Permanent failure\n"); goto hard_fail; } @@ -440,6 +441,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) * don't post the error log until after all dev drivers * have been informed. */ + pr_info("EEH: Collect temporary log\n"); eeh_slot_error_detail(pe, EEH_LOG_TEMP); /* If all device drivers were EEH-unaware, then shut @@ -447,15 +449,18 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) * go down willingly, without panicing the system. */ if (result == PCI_ERS_RESULT_NONE) { + pr_info("EEH: Reset with hotplug activity\n"); rc = eeh_reset_device(pe, frozen_bus); if (rc) { - printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc); + pr_warning("%s: Unable to reset, err=%d\n", + __func__, rc); goto hard_fail; } } /* If all devices reported they can proceed, then re-enable MMIO */ if (result == PCI_ERS_RESULT_CAN_RECOVER) { + pr_info("EEH: Enable I/O for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); if (rc < 0) @@ -463,6 +468,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { + pr_info("EEH: Notify device drivers to resume I/O\n"); result = PCI_ERS_RESULT_NONE; eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result); } @@ -470,6 +476,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) /* If all devices reported they can proceed, then re-enable DMA */ if (result == PCI_ERS_RESULT_CAN_RECOVER) { + pr_info("EEH: Enabled DMA for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); if (rc < 0) @@ -482,17 +489,22 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) /* If any device has a hard failure, then shut off everything. */ if (result == PCI_ERS_RESULT_DISCONNECT) { - printk(KERN_WARNING "EEH: Device driver gave up\n"); + pr_warning("EEH: Device driver gave up\n"); goto hard_fail; } /* If any device called out for a reset, then reset the slot */ if (result == PCI_ERS_RESULT_NEED_RESET) { + pr_info("EEH: Reset without hotplug activity\n"); rc = eeh_reset_device(pe, NULL); if (rc) { - printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc); + pr_warning("%s: Cannot reset, err=%d\n", + __func__, rc); goto hard_fail; } + + pr_info("EEH: Notify device drivers " + "the completion of reset\n"); result = PCI_ERS_RESULT_NONE; eeh_pe_dev_traverse(pe, eeh_report_reset, &result); } @@ -500,11 +512,12 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) /* All devices should claim they have recovered by now. */ if ((result != PCI_ERS_RESULT_RECOVERED) && (result != PCI_ERS_RESULT_NONE)) { - printk(KERN_WARNING "EEH: Not recovered\n"); + pr_warning("EEH: Not recovered\n"); goto hard_fail; } /* Tell all device drivers that they can resume operations */ + pr_info("EEH: Notify device driver to resume\n"); eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); return; diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index 85025d7e6396..0cd1c4a71755 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -853,11 +853,14 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) phb->eeh_state |= PNV_EEH_STATE_REMOVED; } - WARN(1, "EEH: dead IOC detected\n"); + pr_err("EEH: dead IOC detected\n"); ret = 4; goto out; - } else if (severity == OPAL_EEH_SEV_INF) + } else if (severity == OPAL_EEH_SEV_INF) { + pr_info("EEH: IOC informative error " + "detected\n"); ioda_eeh_hub_diag(hose); + } break; case OPAL_EEH_PHB_ERROR: @@ -865,8 +868,8 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) if (ioda_eeh_get_phb_pe(hose, pe)) break; - WARN(1, "EEH: dead PHB#%x detected\n", - hose->global_number); + pr_err("EEH: dead PHB#%x detected\n", + hose->global_number); phb->eeh_state |= PNV_EEH_STATE_REMOVED; ret = 3; goto out; @@ -874,20 +877,24 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) if (ioda_eeh_get_phb_pe(hose, pe)) break; - WARN(1, "EEH: fenced PHB#%x detected\n", - hose->global_number); + pr_err("EEH: fenced PHB#%x detected\n", + hose->global_number); ret = 2; goto out; - } else if (severity == OPAL_EEH_SEV_INF) + } else if (severity == OPAL_EEH_SEV_INF) { + pr_info("EEH: PHB#%x informative error " + "detected\n", + hose->global_number); ioda_eeh_phb_diag(hose); + } break; case OPAL_EEH_PE_ERROR: if (ioda_eeh_get_pe(hose, frozen_pe_no, pe)) break; - WARN(1, "EEH: Frozen PE#%x on PHB#%x detected\n", - (*pe)->addr, (*pe)->phb->global_number); + pr_err("EEH: Frozen PE#%x on PHB#%x detected\n", + (*pe)->addr, (*pe)->phb->global_number); ret = 1; goto out; } From eeb6361fdd3df59c1741522b3d8102f0f5efdd88 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 27 Jun 2013 13:46:47 +0800 Subject: [PATCH 114/156] powerpc/eeh: Avoid build warnings The patch is for avoiding following build warnings: The function .pnv_pci_ioda_fixup() references the function __init .eeh_init(). This is often because .pnv_pci_ioda_fixup lacks a __init The function .pnv_pci_ioda_fixup() references the function __init .eeh_addr_cache_build(). This is often because .pnv_pci_ioda_fixup lacks a __init Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/eeh.h | 4 ++-- arch/powerpc/kernel/eeh.c | 2 +- arch/powerpc/kernel/eeh_cache.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index dd65e311c93a..09a8743143f3 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -202,13 +202,13 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe); void *eeh_dev_init(struct device_node *dn, void *data); void eeh_dev_phb_init_dynamic(struct pci_controller *phb); -int __init eeh_init(void); +int eeh_init(void); int __init eeh_ops_register(struct eeh_ops *ops); int __exit eeh_ops_unregister(const char *name); unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val); int eeh_dev_check_failure(struct eeh_dev *edev); -void __init eeh_addr_cache_build(void); +void eeh_addr_cache_build(void); void eeh_add_device_tree_early(struct device_node *); void eeh_add_device_tree_late(struct pci_bus *); void eeh_add_sysfs_files(struct pci_bus *); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 3a8f82fd9005..39954fe941b8 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -756,7 +756,7 @@ int __exit eeh_ops_unregister(const char *name) * Even if force-off is set, the EEH hardware is still enabled, so that * newer systems can boot. */ -int __init eeh_init(void) +int eeh_init(void) { struct pci_controller *hose, *tmp; struct device_node *phb; diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index 858ebeaa2bac..ea9a94cc97d2 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -285,7 +285,7 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev) * Must be run late in boot process, after the pci controllers * have been scanned for devices (after all device resources are known). */ -void __init eeh_addr_cache_build(void) +void eeh_addr_cache_build(void) { struct device_node *dn; struct eeh_dev *edev; From 9bf41be6737327b7c06cd3f210a0cb599f4aa790 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 27 Jun 2013 13:46:48 +0800 Subject: [PATCH 115/156] powerpc/powernv: Use dev-node in PCI config accessors Currently, we're using the combo (PCI bus + devfn) in the PCI config accessors and PCI config accessors in EEH depends on them. However, it's not safe to refer the PCI bus which might have been removed during hotplug. So we're using device node in the PCI config accessors and the corresponding backends just reuse them. The patch also fix one potential risk: We possiblly have frozen PE during the early PCI probe time, but we haven't setup the PE mapping yet. So the errors should be counted to PE#0. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powernv/eeh-powernv.c | 44 +------ arch/powerpc/platforms/powernv/pci.c | 120 +++++++++++-------- arch/powerpc/platforms/powernv/pci.h | 4 + 3 files changed, 79 insertions(+), 89 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 9559115424d6..969cce73055a 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -314,46 +314,6 @@ static int powernv_eeh_configure_bridge(struct eeh_pe *pe) return ret; } -/** - * powernv_eeh_read_config - Read PCI config space - * @dn: device node - * @where: PCI address - * @size: size to read - * @val: return value - * - * Read config space from the speicifed device - */ -static int powernv_eeh_read_config(struct device_node *dn, int where, - int size, u32 *val) -{ - struct eeh_dev *edev = of_node_to_eeh_dev(dn); - struct pci_dev *dev = eeh_dev_to_pci_dev(edev); - struct pci_controller *hose = edev->phb; - - return hose->ops->read(dev->bus, dev->devfn, where, size, val); -} - -/** - * powernv_eeh_write_config - Write PCI config space - * @dn: device node - * @where: PCI address - * @size: size to write - * @val: value to be written - * - * Write config space to the specified device - */ -static int powernv_eeh_write_config(struct device_node *dn, int where, - int size, u32 val) -{ - struct eeh_dev *edev = of_node_to_eeh_dev(dn); - struct pci_dev *dev = eeh_dev_to_pci_dev(edev); - struct pci_controller *hose = edev->phb; - - hose = pci_bus_to_host(dev->bus); - - return hose->ops->write(dev->bus, dev->devfn, where, size, val); -} - /** * powernv_eeh_next_error - Retrieve next EEH error to handle * @pe: Affected PE @@ -389,8 +349,8 @@ static struct eeh_ops powernv_eeh_ops = { .wait_state = powernv_eeh_wait_state, .get_log = powernv_eeh_get_log, .configure_bridge = powernv_eeh_configure_bridge, - .read_config = powernv_eeh_read_config, - .write_config = powernv_eeh_write_config, + .read_config = pnv_pci_cfg_read, + .write_config = pnv_pci_cfg_write, .next_error = powernv_eeh_next_error }; diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 4c91e6dd1af2..a28d3b5e6393 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -231,47 +231,50 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) spin_unlock_irqrestore(&phb->lock, flags); } -static void pnv_pci_config_check_eeh(struct pnv_phb *phb, struct pci_bus *bus, - u32 bdfn) +static void pnv_pci_config_check_eeh(struct pnv_phb *phb, + struct device_node *dn) { s64 rc; u8 fstate; u16 pcierr; u32 pe_no; - /* Get PE# if we support IODA */ - pe_no = phb->bdfn_to_pe ? phb->bdfn_to_pe(phb, bus, bdfn & 0xff) : 0; + /* + * Get the PE#. During the PCI probe stage, we might not + * setup that yet. So all ER errors should be mapped to + * PE#0 + */ + pe_no = PCI_DN(dn)->pe_number; + if (pe_no == IODA_INVALID_PE) + pe_no = 0; /* Read freeze status */ rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, &fstate, &pcierr, NULL); if (rc) { - pr_warning("PCI %d: Failed to read EEH status for PE#%d," - " err %lld\n", phb->hose->global_number, pe_no, rc); + pr_warning("%s: Can't read EEH status (PE#%d) for " + "%s, err %lld\n", + __func__, pe_no, dn->full_name, rc); return; } - cfg_dbg(" -> EEH check, bdfn=%04x PE%d fstate=%x\n", - bdfn, pe_no, fstate); + cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n", + (PCI_DN(dn)->busno << 8) | (PCI_DN(dn)->devfn), + pe_no, fstate); if (fstate != 0) pnv_pci_handle_eeh_config(phb, pe_no); } -static int pnv_pci_read_config(struct pci_bus *bus, - unsigned int devfn, - int where, int size, u32 *val) +int pnv_pci_cfg_read(struct device_node *dn, + int where, int size, u32 *val) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn = PCI_DN(dn); + struct pnv_phb *phb = pdn->phb->private_data; + u32 bdfn = (pdn->busno << 8) | pdn->devfn; #ifdef CONFIG_EEH - struct device_node *busdn, *dn; struct eeh_pe *phb_pe = NULL; #endif - u32 bdfn = (((uint64_t)bus->number) << 8) | devfn; s64 rc; - if (hose == NULL) - return PCIBIOS_DEVICE_NOT_FOUND; - switch (size) { case 1: { u8 v8; @@ -295,8 +298,8 @@ static int pnv_pci_read_config(struct pci_bus *bus, default: return PCIBIOS_FUNC_NOT_SUPPORTED; } - cfg_dbg("pnv_pci_read_config bus: %x devfn: %x +%x/%x -> %08x\n", - bus->number, devfn, where, size, *val); + cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n", + __func__, pdn->busno, pdn->devfn, where, size, *val); /* * Check if the specified PE has been put into frozen @@ -305,44 +308,33 @@ static int pnv_pci_read_config(struct pci_bus *bus, * PHB-fatal errors. */ #ifdef CONFIG_EEH - phb_pe = eeh_phb_pe_get(hose); + phb_pe = eeh_phb_pe_get(pdn->phb); if (phb_pe && (phb_pe->state & EEH_PE_ISOLATED)) return PCIBIOS_SUCCESSFUL; if (phb->eeh_state & PNV_EEH_STATE_ENABLED) { - if (*val == EEH_IO_ERROR_VALUE(size)) { - busdn = pci_bus_to_OF_node(bus); - for (dn = busdn->child; dn; dn = dn->sibling) { - struct pci_dn *pdn = PCI_DN(dn); - - if (pdn && pdn->devfn == devfn && - eeh_dev_check_failure(of_node_to_eeh_dev(dn))) - return PCIBIOS_DEVICE_NOT_FOUND; - } - } + if (*val == EEH_IO_ERROR_VALUE(size) && + eeh_dev_check_failure(of_node_to_eeh_dev(dn))) + return PCIBIOS_DEVICE_NOT_FOUND; } else { - pnv_pci_config_check_eeh(phb, bus, bdfn); + pnv_pci_config_check_eeh(phb, dn); } #else - pnv_pci_config_check_eeh(phb, bus, bdfn); + pnv_pci_config_check_eeh(phb, dn); #endif return PCIBIOS_SUCCESSFUL; } -static int pnv_pci_write_config(struct pci_bus *bus, - unsigned int devfn, - int where, int size, u32 val) +int pnv_pci_cfg_write(struct device_node *dn, + int where, int size, u32 val) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; - u32 bdfn = (((uint64_t)bus->number) << 8) | devfn; + struct pci_dn *pdn = PCI_DN(dn); + struct pnv_phb *phb = pdn->phb->private_data; + u32 bdfn = (pdn->busno << 8) | pdn->devfn; - if (hose == NULL) - return PCIBIOS_DEVICE_NOT_FOUND; - - cfg_dbg("pnv_pci_write_config bus: %x devfn: %x +%x/%x -> %08x\n", - bus->number, devfn, where, size, val); + cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n", + pdn->busno, pdn->devfn, where, size, val); switch (size) { case 1: opal_pci_config_write_byte(phb->opal_id, bdfn, where, val); @@ -360,16 +352,50 @@ static int pnv_pci_write_config(struct pci_bus *bus, /* Check if the PHB got frozen due to an error (no response) */ #ifdef CONFIG_EEH if (!(phb->eeh_state & PNV_EEH_STATE_ENABLED)) - pnv_pci_config_check_eeh(phb, bus, bdfn); + pnv_pci_config_check_eeh(phb, dn); #else - pnv_pci_config_check_eeh(phb, bus, bdfn); + pnv_pci_config_check_eeh(phb, dn); #endif return PCIBIOS_SUCCESSFUL; } +static int pnv_pci_read_config(struct pci_bus *bus, + unsigned int devfn, + int where, int size, u32 *val) +{ + struct device_node *dn, *busdn = pci_bus_to_OF_node(bus); + struct pci_dn *pdn; + + for (dn = busdn->child; dn; dn = dn->sibling) { + pdn = PCI_DN(dn); + if (pdn && pdn->devfn == devfn) + return pnv_pci_cfg_read(dn, where, size, val); + } + + *val = 0xFFFFFFFF; + return PCIBIOS_DEVICE_NOT_FOUND; + +} + +static int pnv_pci_write_config(struct pci_bus *bus, + unsigned int devfn, + int where, int size, u32 val) +{ + struct device_node *dn, *busdn = pci_bus_to_OF_node(bus); + struct pci_dn *pdn; + + for (dn = busdn->child; dn; dn = dn->sibling) { + pdn = PCI_DN(dn); + if (pdn && pdn->devfn == devfn) + return pnv_pci_cfg_write(dn, where, size, val); + } + + return PCIBIOS_DEVICE_NOT_FOUND; +} + struct pci_ops pnv_pci_ops = { - .read = pnv_pci_read_config, + .read = pnv_pci_read_config, .write = pnv_pci_write_config, }; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 40bdf0219b96..d633c64e05a1 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -182,6 +182,10 @@ extern struct pci_ops pnv_pci_ops; extern struct pnv_eeh_ops ioda_eeh_ops; #endif +int pnv_pci_cfg_read(struct device_node *dn, + int where, int size, u32 *val); +int pnv_pci_cfg_write(struct device_node *dn, + int where, int size, u32 val); extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset); From 4bb297113433048169c30a32c1e58b6a1b61b621 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Sun, 30 Jun 2013 22:00:42 +0300 Subject: [PATCH 116/156] powerpc/windfarm: Fix overtemperature clearing With pm81/pm91/pm121, when the overtemperature state is entered, and when it remains on after skipped ticks, the driver will try to leave it too soon (immediately on the next tick). This is because the active FAILURE_OVERTEMP state is not visible in "new_failure" variable of the current tick. Furthermore, the driver will keep trying to clear condition in subsequent ticks as FAILURE_OVERTEMP remains set in the "last_failure" variable. These will start to trigger WARNINGS from windfarm core: [ 100.082735] windfarm: Clamping CPU frequency to minimum ! [ 100.108132] windfarm: Overtemp condition detected ! [ 101.952908] windfarm: Overtemp condition cleared ! [...] [ 102.980388] WARNING: at drivers/macintosh/windfarm_core.c:463 [...] [ 103.982227] WARNING: at drivers/macintosh/windfarm_core.c:463 [...] [ 105.030494] WARNING: at drivers/macintosh/windfarm_core.c:463 [...] [ 105.973666] WARNING: at drivers/macintosh/windfarm_core.c:463 [...] [ 106.977913] WARNING: at drivers/macintosh/windfarm_core.c:463 Fix by adding a helper global variable. We leave the overtemp state only after all failure bits have been cleared. I saw this error on iMac G5 iSight (pm121). Also pm81/pm91 are fixed based on the observation that these are almost identical/copy-pasted code. Signed-off-by: Aaro Koskinen Signed-off-by: Benjamin Herrenschmidt --- drivers/macintosh/windfarm_pm121.c | 6 +++++- drivers/macintosh/windfarm_pm81.c | 6 +++++- drivers/macintosh/windfarm_pm91.c | 6 +++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/macintosh/windfarm_pm121.c b/drivers/macintosh/windfarm_pm121.c index af605e915d41..7fe58b0ae8b4 100644 --- a/drivers/macintosh/windfarm_pm121.c +++ b/drivers/macintosh/windfarm_pm121.c @@ -276,6 +276,7 @@ static const char *loop_names[N_LOOPS] = { static unsigned int pm121_failure_state; static int pm121_readjust, pm121_skipping; +static bool pm121_overtemp; static s32 average_power; struct pm121_correction { @@ -847,6 +848,7 @@ static void pm121_tick(void) if (new_failure & FAILURE_OVERTEMP) { wf_set_overtemp(); pm121_skipping = 2; + pm121_overtemp = true; } /* We only clear the overtemp condition if overtemp is cleared @@ -855,8 +857,10 @@ static void pm121_tick(void) * the control loop levels, but we don't want to keep it clear * here in this case */ - if (new_failure == 0 && last_failure & FAILURE_OVERTEMP) + if (!pm121_failure_state && pm121_overtemp) { wf_clear_overtemp(); + pm121_overtemp = false; + } } diff --git a/drivers/macintosh/windfarm_pm81.c b/drivers/macintosh/windfarm_pm81.c index f84933ff3298..2a5e1b15b1d2 100644 --- a/drivers/macintosh/windfarm_pm81.c +++ b/drivers/macintosh/windfarm_pm81.c @@ -149,6 +149,7 @@ static int wf_smu_all_controls_ok, wf_smu_all_sensors_ok, wf_smu_started; static unsigned int wf_smu_failure_state; static int wf_smu_readjust, wf_smu_skipping; +static bool wf_smu_overtemp; /* * ****** System Fans Control Loop ****** @@ -593,6 +594,7 @@ static void wf_smu_tick(void) if (new_failure & FAILURE_OVERTEMP) { wf_set_overtemp(); wf_smu_skipping = 2; + wf_smu_overtemp = true; } /* We only clear the overtemp condition if overtemp is cleared @@ -601,8 +603,10 @@ static void wf_smu_tick(void) * the control loop levels, but we don't want to keep it clear * here in this case */ - if (new_failure == 0 && last_failure & FAILURE_OVERTEMP) + if (!wf_smu_failure_state && wf_smu_overtemp) { wf_clear_overtemp(); + wf_smu_overtemp = false; + } } static void wf_smu_new_control(struct wf_control *ct) diff --git a/drivers/macintosh/windfarm_pm91.c b/drivers/macintosh/windfarm_pm91.c index 2eb484f213c8..a8ac66cd3b13 100644 --- a/drivers/macintosh/windfarm_pm91.c +++ b/drivers/macintosh/windfarm_pm91.c @@ -76,6 +76,7 @@ static struct wf_control *cpufreq_clamp; /* Set to kick the control loop into life */ static int wf_smu_all_controls_ok, wf_smu_all_sensors_ok, wf_smu_started; +static bool wf_smu_overtemp; /* Failure handling.. could be nicer */ #define FAILURE_FAN 0x01 @@ -517,6 +518,7 @@ static void wf_smu_tick(void) if (new_failure & FAILURE_OVERTEMP) { wf_set_overtemp(); wf_smu_skipping = 2; + wf_smu_overtemp = true; } /* We only clear the overtemp condition if overtemp is cleared @@ -525,8 +527,10 @@ static void wf_smu_tick(void) * the control loop levels, but we don't want to keep it clear * here in this case */ - if (new_failure == 0 && last_failure & FAILURE_OVERTEMP) + if (!wf_smu_failure_state && wf_smu_overtemp) { wf_clear_overtemp(); + wf_smu_overtemp = false; + } } From 348c2298a6fd2b145e789739808d5e7598e275fc Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 27 Jun 2013 09:09:43 +0800 Subject: [PATCH 117/156] powerpc: Don't flush/invalidate the d/icache for an unknown relocation type For an unknown relocation type since the value of r4 is just the 8bit relocation type, the sum of r4 and r7 may yield an invalid memory address. For example: In normal case: r4 = c00xxxxx r7 = 40000000 r4 + r7 = 000xxxxx For an unknown relocation type: r4 = 000000xx r7 = 40000000 r4 + r7 = 400000xx 400000xx is an invalid memory address for a board which has just 512M memory. And for operations such as dcbst or icbi may cause bus error for an invalid memory address on some platforms and then cause the board reset. So we should skip the flush/invalidate the d/icache for an unknown relocation type. Signed-off-by: Kevin Hao Acked-by: Suzuki K. Poulose Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/reloc_32.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/reloc_32.S b/arch/powerpc/kernel/reloc_32.S index ef46ba6e094f..f366fedb0872 100644 --- a/arch/powerpc/kernel/reloc_32.S +++ b/arch/powerpc/kernel/reloc_32.S @@ -166,7 +166,7 @@ ha16: /* R_PPC_ADDR16_LO */ lo16: cmpwi r4, R_PPC_ADDR16_LO - bne nxtrela + bne unknown_type lwz r4, 0(r9) /* r_offset */ lwz r0, 8(r9) /* r_addend */ add r0, r0, r3 @@ -191,6 +191,7 @@ nxtrela: dcbst r4,r7 sync /* Ensure the data is flushed before icbi */ icbi r4,r7 +unknown_type: cmpwi r8, 0 /* relasz = 0 ? */ ble done add r9, r9, r6 /* move to next entry in the .rela table */ From 5524f3fc069b6435f9ff0db573e3a8b5082ef528 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 11 Jun 2013 13:57:05 -0600 Subject: [PATCH 118/156] powerpc/iommu: Remove unused pci_iommu_init() and pci_direct_iommu_init() pci_iommu_init() and pci_direct_iommu_init() are not referenced anywhere, so remove them. Signed-off-by: Bjorn Helgaas Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/iommu.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 98d14229f893..c34656a8925e 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -130,13 +130,6 @@ extern void iommu_init_early_pSeries(void); extern void iommu_init_early_dart(void); extern void iommu_init_early_pasemi(void); -#ifdef CONFIG_PCI -extern void pci_iommu_init(void); -extern void pci_direct_iommu_init(void); -#else -static inline void pci_iommu_init(void) { } -#endif - extern void alloc_dart_table(void); #if defined(CONFIG_PPC64) && defined(CONFIG_PM) static inline void iommu_save(void) From cc293bf7a9aa6fad68d107c79705732bd2fc57e3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 13 Jun 2013 19:37:30 -0700 Subject: [PATCH 119/156] powerpc/idle: Convert use of typedef ctl_table to struct ctl_table This typedef is unnecessary and should just be removed. Signed-off-by: Joe Perches Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/idle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 939ea7ef0dc8..d7216c9abda1 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -85,7 +85,7 @@ int powersave_nap; /* * Register the sysctl to set/clear powersave_nap. */ -static ctl_table powersave_nap_ctl_table[]={ +static struct ctl_table powersave_nap_ctl_table[] = { { .procname = "powersave-nap", .data = &powersave_nap, @@ -95,7 +95,7 @@ static ctl_table powersave_nap_ctl_table[]={ }, {} }; -static ctl_table powersave_nap_sysctl_root[] = { +static struct ctl_table powersave_nap_sysctl_root[] = { { .procname = "kernel", .mode = 0555, From 5eb969d0e8b8f38fca0b2c6c76f5dca01449664a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 13 Jun 2013 19:37:37 -0700 Subject: [PATCH 120/156] macintosh: Convert use of typedef ctl_table to struct ctl_table This typedef is unnecessary and should just be removed. Signed-off-by: Joe Perches Signed-off-by: Benjamin Herrenschmidt --- drivers/macintosh/mac_hid.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c index 6a82388505f0..80d30e8e3389 100644 --- a/drivers/macintosh/mac_hid.c +++ b/drivers/macintosh/mac_hid.c @@ -181,7 +181,7 @@ static void mac_hid_stop_emulation(void) mac_hid_destroy_emumouse(); } -static int mac_hid_toggle_emumouse(ctl_table *table, int write, +static int mac_hid_toggle_emumouse(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -214,7 +214,7 @@ static int mac_hid_toggle_emumouse(ctl_table *table, int write, } /* file(s) in /proc/sys/dev/mac_hid */ -static ctl_table mac_hid_files[] = { +static struct ctl_table mac_hid_files[] = { { .procname = "mouse_button_emulation", .data = &mouse_emulate_buttons, @@ -240,7 +240,7 @@ static ctl_table mac_hid_files[] = { }; /* dir in /proc/sys/dev */ -static ctl_table mac_hid_dir[] = { +static struct ctl_table mac_hid_dir[] = { { .procname = "mac_hid", .maxlen = 0, @@ -251,7 +251,7 @@ static ctl_table mac_hid_dir[] = { }; /* /proc/sys/dev itself, in case that is not there yet */ -static ctl_table mac_hid_root_dir[] = { +static struct ctl_table mac_hid_root_dir[] = { { .procname = "dev", .maxlen = 0, From 061d19f279f9bebbdb1ee48bef8c25e03de32ae2 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 24 Jun 2013 15:30:09 -0400 Subject: [PATCH 121/156] powerpc: Delete __cpuinit usage from all users The __cpuinit type of throwaway sections might have made sense some time ago when RAM was more constrained, but now the savings do not offset the cost and complications. For example, the fix in commit 5e427ec2d0 ("x86: Fix bit corruption at CPU resume time") is a good example of the nasty type of bugs that can be created with improper use of the various __init prefixes. After a discussion on LKML[1] it was decided that cpuinit should go the way of devinit and be phased out. Once all the users are gone, we can then finally remove the macros themselves from linux/init.h. This removes all the powerpc uses of the __cpuinit macros. There are no __CPUINIT users in assembly files in powerpc. [1] https://lkml.org/lkml/2013/5/20/589 Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Josh Boyer Cc: Matt Porter Cc: Kumar Gala Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Paul Gortmaker Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/rtas.h | 4 +-- arch/powerpc/include/asm/vdso.h | 2 +- arch/powerpc/kernel/cacheinfo.c | 36 +++++++++++++++----------- arch/powerpc/kernel/rtas.c | 4 +-- arch/powerpc/kernel/smp.c | 4 +-- arch/powerpc/kernel/sysfs.c | 6 ++--- arch/powerpc/kernel/time.c | 1 - arch/powerpc/kernel/vdso.c | 2 +- arch/powerpc/mm/44x_mmu.c | 6 ++--- arch/powerpc/mm/hash_utils_64.c | 2 +- arch/powerpc/mm/mmu_context_nohash.c | 6 ++--- arch/powerpc/mm/numa.c | 7 +++-- arch/powerpc/mm/tlb_nohash.c | 2 +- arch/powerpc/perf/core-book3s.c | 4 +-- arch/powerpc/platforms/44x/currituck.c | 4 +-- arch/powerpc/platforms/44x/iss4xx.c | 4 +-- arch/powerpc/platforms/85xx/smp.c | 6 ++--- arch/powerpc/platforms/powermac/smp.c | 2 +- arch/powerpc/platforms/powernv/smp.c | 2 +- 19 files changed, 54 insertions(+), 50 deletions(-) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 34fd70488d83..c7a8bfc9f6f5 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -350,8 +350,8 @@ static inline u32 rtas_config_addr(int busno, int devfn, int reg) (devfn << 8) | (reg & 0xff); } -extern void __cpuinit rtas_give_timebase(void); -extern void __cpuinit rtas_take_timebase(void); +extern void rtas_give_timebase(void); +extern void rtas_take_timebase(void); #ifdef CONFIG_PPC_RTAS static inline int page_is_rtas_user_buf(unsigned long pfn) diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h index 50f261bc3e95..0d9cecddf8a4 100644 --- a/arch/powerpc/include/asm/vdso.h +++ b/arch/powerpc/include/asm/vdso.h @@ -22,7 +22,7 @@ extern unsigned long vdso64_rt_sigtramp; extern unsigned long vdso32_sigtramp; extern unsigned long vdso32_rt_sigtramp; -int __cpuinit vdso_getcpu_init(void); +int vdso_getcpu_init(void); #else /* __ASSEMBLY__ */ diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index 92c6b008dd2b..9262cf2bec4b 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -131,7 +131,8 @@ static const char *cache_type_string(const struct cache *cache) return cache_type_info[cache->type].name; } -static void __cpuinit cache_init(struct cache *cache, int type, int level, struct device_node *ofnode) +static void cache_init(struct cache *cache, int type, int level, + struct device_node *ofnode) { cache->type = type; cache->level = level; @@ -140,7 +141,7 @@ static void __cpuinit cache_init(struct cache *cache, int type, int level, struc list_add(&cache->list, &cache_list); } -static struct cache *__cpuinit new_cache(int type, int level, struct device_node *ofnode) +static struct cache *new_cache(int type, int level, struct device_node *ofnode) { struct cache *cache; @@ -324,7 +325,8 @@ static bool cache_node_is_unified(const struct device_node *np) return of_get_property(np, "cache-unified", NULL); } -static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node *node, int level) +static struct cache *cache_do_one_devnode_unified(struct device_node *node, + int level) { struct cache *cache; @@ -335,7 +337,8 @@ static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node * return cache; } -static struct cache *__cpuinit cache_do_one_devnode_split(struct device_node *node, int level) +static struct cache *cache_do_one_devnode_split(struct device_node *node, + int level) { struct cache *dcache, *icache; @@ -357,7 +360,7 @@ err: return NULL; } -static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, int level) +static struct cache *cache_do_one_devnode(struct device_node *node, int level) { struct cache *cache; @@ -369,7 +372,8 @@ static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, in return cache; } -static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *node, int level) +static struct cache *cache_lookup_or_instantiate(struct device_node *node, + int level) { struct cache *cache; @@ -385,7 +389,7 @@ static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *n return cache; } -static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigger) +static void link_cache_lists(struct cache *smaller, struct cache *bigger) { while (smaller->next_local) { if (smaller->next_local == bigger) @@ -396,13 +400,13 @@ static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigg smaller->next_local = bigger; } -static void __cpuinit do_subsidiary_caches_debugcheck(struct cache *cache) +static void do_subsidiary_caches_debugcheck(struct cache *cache) { WARN_ON_ONCE(cache->level != 1); WARN_ON_ONCE(strcmp(cache->ofnode->type, "cpu")); } -static void __cpuinit do_subsidiary_caches(struct cache *cache) +static void do_subsidiary_caches(struct cache *cache) { struct device_node *subcache_node; int level = cache->level; @@ -423,7 +427,7 @@ static void __cpuinit do_subsidiary_caches(struct cache *cache) } } -static struct cache *__cpuinit cache_chain_instantiate(unsigned int cpu_id) +static struct cache *cache_chain_instantiate(unsigned int cpu_id) { struct device_node *cpu_node; struct cache *cpu_cache = NULL; @@ -448,7 +452,7 @@ out: return cpu_cache; } -static struct cache_dir *__cpuinit cacheinfo_create_cache_dir(unsigned int cpu_id) +static struct cache_dir *cacheinfo_create_cache_dir(unsigned int cpu_id) { struct cache_dir *cache_dir; struct device *dev; @@ -653,7 +657,7 @@ static struct kobj_type cache_index_type = { .default_attrs = cache_index_default_attrs, }; -static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) +static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) { const char *cache_name; const char *cache_type; @@ -696,7 +700,8 @@ static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *d kfree(buf); } -static void __cpuinit cacheinfo_create_index_dir(struct cache *cache, int index, struct cache_dir *cache_dir) +static void cacheinfo_create_index_dir(struct cache *cache, int index, + struct cache_dir *cache_dir) { struct cache_index_dir *index_dir; int rc; @@ -722,7 +727,8 @@ err: kfree(index_dir); } -static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache *cache_list) +static void cacheinfo_sysfs_populate(unsigned int cpu_id, + struct cache *cache_list) { struct cache_dir *cache_dir; struct cache *cache; @@ -740,7 +746,7 @@ static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache } } -void __cpuinit cacheinfo_cpu_online(unsigned int cpu_id) +void cacheinfo_cpu_online(unsigned int cpu_id) { struct cache *cache; diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 52add6f3e201..80b5ef403f68 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -1172,7 +1172,7 @@ int __init early_init_dt_scan_rtas(unsigned long node, static arch_spinlock_t timebase_lock; static u64 timebase = 0; -void __cpuinit rtas_give_timebase(void) +void rtas_give_timebase(void) { unsigned long flags; @@ -1189,7 +1189,7 @@ void __cpuinit rtas_give_timebase(void) local_irq_restore(flags); } -void __cpuinit rtas_take_timebase(void) +void rtas_take_timebase(void) { while (!timebase) barrier(); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index ee7ac5e6e28a..85398c71665c 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -480,7 +480,7 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) secondary_ti = current_set[cpu] = ti; } -int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) +int __cpu_up(unsigned int cpu, struct task_struct *tidle) { int rc, c; @@ -610,7 +610,7 @@ static struct device_node *cpu_to_l2cache(int cpu) } /* Activate a secondary processor. */ -__cpuinit void start_secondary(void *unused) +void start_secondary(void *unused) { unsigned int cpu = smp_processor_id(); struct device_node *l2_cache; diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index e68a84568b8b..27a90b99ef67 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -341,7 +341,7 @@ static struct device_attribute pa6t_attrs[] = { #endif /* HAS_PPC_PMC_PA6T */ #endif /* HAS_PPC_PMC_CLASSIC */ -static void __cpuinit register_cpu_online(unsigned int cpu) +static void register_cpu_online(unsigned int cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); struct device *s = &c->dev; @@ -502,7 +502,7 @@ ssize_t arch_cpu_release(const char *buf, size_t count) #endif /* CONFIG_HOTPLUG_CPU */ -static int __cpuinit sysfs_cpu_notify(struct notifier_block *self, +static int sysfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned int)(long)hcpu; @@ -522,7 +522,7 @@ static int __cpuinit sysfs_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata sysfs_cpu_nb = { +static struct notifier_block sysfs_cpu_nb = { .notifier_call = sysfs_cpu_notify, }; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 5fc29ad7e26f..65ab9e909377 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -631,7 +631,6 @@ static int __init get_freq(char *name, int cells, unsigned long *val) return found; } -/* should become __cpuinit when secondary_cpu_time_init also is */ void start_cpu_decrementer(void) { #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index d4f463ac65b1..1d9c92621b36 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -711,7 +711,7 @@ static void __init vdso_setup_syscall_map(void) } #ifdef CONFIG_PPC64 -int __cpuinit vdso_getcpu_init(void) +int vdso_getcpu_init(void) { unsigned long cpu, node, val; diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c index 2c9441ee6bb8..82b1ff759e26 100644 --- a/arch/powerpc/mm/44x_mmu.c +++ b/arch/powerpc/mm/44x_mmu.c @@ -41,7 +41,7 @@ int icache_44x_need_flush; unsigned long tlb_47x_boltmap[1024/8]; -static void __cpuinit ppc44x_update_tlb_hwater(void) +static void ppc44x_update_tlb_hwater(void) { extern unsigned int tlb_44x_patch_hwater_D[]; extern unsigned int tlb_44x_patch_hwater_I[]; @@ -134,7 +134,7 @@ static void __init ppc47x_update_boltmap(void) /* * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU */ -static void __cpuinit ppc47x_pin_tlb(unsigned int virt, unsigned int phys) +static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys) { unsigned int rA; int bolted; @@ -229,7 +229,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, } #ifdef CONFIG_SMP -void __cpuinit mmu_init_secondary(int cpu) +void mmu_init_secondary(int cpu) { unsigned long addr; unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 845231643987..6ecc38bd5b24 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -807,7 +807,7 @@ void __init early_init_mmu(void) } #ifdef CONFIG_SMP -void __cpuinit early_init_mmu_secondary(void) +void early_init_mmu_secondary(void) { /* Initialize hash table for that CPU */ if (!firmware_has_feature(FW_FEATURE_LPAR)) diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 810f8e4d74d4..af3d78e19302 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -332,8 +332,8 @@ void destroy_context(struct mm_struct *mm) #ifdef CONFIG_SMP -static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int mmu_context_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { unsigned int cpu = (unsigned int)(long)hcpu; @@ -366,7 +366,7 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata mmu_context_cpu_nb = { +static struct notifier_block mmu_context_cpu_nb = { .notifier_call = mmu_context_cpu_notify, }; diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 88c0425dc0a8..c792cd95e792 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -516,7 +516,7 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem, * Figure out to which domain a cpu belongs and stick it there. * Return the id of the domain used. */ -static int __cpuinit numa_setup_cpu(unsigned long lcpu) +static int numa_setup_cpu(unsigned long lcpu) { int nid = 0; struct device_node *cpu = of_get_cpu_node(lcpu, NULL); @@ -538,8 +538,7 @@ out: return nid; } -static int __cpuinit cpu_numa_callback(struct notifier_block *nfb, - unsigned long action, +static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned long lcpu = (unsigned long)hcpu; @@ -919,7 +918,7 @@ static void __init *careful_zallocation(int nid, unsigned long size, return ret; } -static struct notifier_block __cpuinitdata ppc64_numa_nb = { +static struct notifier_block ppc64_numa_nb = { .notifier_call = cpu_numa_callback, .priority = 1 /* Must run before sched domains notifier. */ }; diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 6888cad5103d..41cd68dee681 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -648,7 +648,7 @@ void __init early_init_mmu(void) __early_init_mmu(1); } -void __cpuinit early_init_mmu_secondary(void) +void early_init_mmu_secondary(void) { __early_init_mmu(0); } diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 29c6482890c8..af94a717c451 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -1786,7 +1786,7 @@ static void power_pmu_setup(int cpu) cpuhw->mmcr[0] = MMCR0_FC; } -static int __cpuinit +static int power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; @@ -1803,7 +1803,7 @@ power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu return NOTIFY_OK; } -int __cpuinit register_power_pmu(struct power_pmu *pmu) +int register_power_pmu(struct power_pmu *pmu) { if (ppmu) return -EBUSY; /* something's already registered */ diff --git a/arch/powerpc/platforms/44x/currituck.c b/arch/powerpc/platforms/44x/currituck.c index c52e1b3c9be5..7f1b71a01c6a 100644 --- a/arch/powerpc/platforms/44x/currituck.c +++ b/arch/powerpc/platforms/44x/currituck.c @@ -91,12 +91,12 @@ static void __init ppc47x_init_irq(void) } #ifdef CONFIG_SMP -static void __cpuinit smp_ppc47x_setup_cpu(int cpu) +static void smp_ppc47x_setup_cpu(int cpu) { mpic_setup_this_cpu(); } -static int __cpuinit smp_ppc47x_kick_cpu(int cpu) +static int smp_ppc47x_kick_cpu(int cpu) { struct device_node *cpunode = of_get_cpu_node(cpu, NULL); const u64 *spin_table_addr_prop; diff --git a/arch/powerpc/platforms/44x/iss4xx.c b/arch/powerpc/platforms/44x/iss4xx.c index a28a8629727e..4241bc825800 100644 --- a/arch/powerpc/platforms/44x/iss4xx.c +++ b/arch/powerpc/platforms/44x/iss4xx.c @@ -81,12 +81,12 @@ static void __init iss4xx_init_irq(void) } #ifdef CONFIG_SMP -static void __cpuinit smp_iss4xx_setup_cpu(int cpu) +static void smp_iss4xx_setup_cpu(int cpu) { mpic_setup_this_cpu(); } -static int __cpuinit smp_iss4xx_kick_cpu(int cpu) +static int smp_iss4xx_kick_cpu(int cpu) { struct device_node *cpunode = of_get_cpu_node(cpu, NULL); const u64 *spin_table_addr_prop; diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index 6a1759939c6b..5ced4f5bb2b2 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -99,7 +99,7 @@ static void mpc85xx_take_timebase(void) } #ifdef CONFIG_HOTPLUG_CPU -static void __cpuinit smp_85xx_mach_cpu_die(void) +static void smp_85xx_mach_cpu_die(void) { unsigned int cpu = smp_processor_id(); u32 tmp; @@ -141,7 +141,7 @@ static inline u32 read_spin_table_addr_l(void *spin_table) return in_be32(&((struct epapr_spin_table *)spin_table)->addr_l); } -static int __cpuinit smp_85xx_kick_cpu(int nr) +static int smp_85xx_kick_cpu(int nr) { unsigned long flags; const u64 *cpu_rel_addr; @@ -362,7 +362,7 @@ static void mpc85xx_smp_machine_kexec(struct kimage *image) } #endif /* CONFIG_KEXEC */ -static void __cpuinit smp_85xx_setup_cpu(int cpu_nr) +static void smp_85xx_setup_cpu(int cpu_nr) { if (smp_85xx_ops.probe == smp_mpic_probe) mpic_setup_this_cpu(); diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index bdb738a69e41..49c9f9501c21 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -885,7 +885,7 @@ static int smp_core99_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata smp_core99_cpu_nb = { +static struct notifier_block smp_core99_cpu_nb = { .notifier_call = smp_core99_cpu_notify, }; #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 77784aead1c2..89e3857af4e0 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -40,7 +40,7 @@ #define DBG(fmt...) #endif -static void __cpuinit pnv_smp_setup_cpu(int cpu) +static void pnv_smp_setup_cpu(int cpu) { if (cpu != boot_cpuid) xics_setup_cpu(); From 330dae19998072e97f550885a0087df6d6556816 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 30 Jun 2013 12:03:23 +0200 Subject: [PATCH 122/156] mac: Make cuda_init_via() __init cuda_init_via() is called from find_via_cuda() only, which is __init. Signed-off-by: Geert Uytterhoeven Signed-off-by: Benjamin Herrenschmidt --- drivers/macintosh/via-cuda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c index 86511c570dd8..d61f271d2207 100644 --- a/drivers/macintosh/via-cuda.c +++ b/drivers/macintosh/via-cuda.c @@ -259,7 +259,7 @@ cuda_probe(void) } while (0) static int -cuda_init_via(void) +__init cuda_init_via(void) { out_8(&via[DIRB], (in_8(&via[DIRB]) | TACK | TIP) & ~TREQ); /* TACK & TIP out */ out_8(&via[B], in_8(&via[B]) | TACK | TIP); /* negate them */ From cce606feb425093c8371089d392e336d186e125b Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Thu, 16 May 2013 18:20:26 +0800 Subject: [PATCH 123/156] powerpc: Set cpu sibling mask before online cpu It seems following race is possible: cpu0 cpux smp_init->cpu_up->_cpu_up __cpu_up kick_cpu(1) ------------------------------------------------------------------------- waiting online ... ... notify CPU_STARTING set cpux active set cpux online ------------------------------------------------------------------------- finish waiting online ... sched_init_smp init_sched_domains(cpu_active_mask) build_sched_domains set cpux sibling info ------------------------------------------------------------------------- Execution of cpu0 and cpux could be concurrent between two separator lines. So if the cpux sibling information was set too late (normally impossible, but could be triggered by adding some delay in start_secondary, after setting cpu online), build_sched_domains() running on cpu0 might see cpux active, with an empty sibling mask, then cause some bad address accessing like following: [ 0.099855] Unable to handle kernel paging request for data at address 0xc00000038518078f [ 0.099868] Faulting instruction address: 0xc0000000000b7a64 [ 0.099883] Oops: Kernel access of bad area, sig: 11 [#1] [ 0.099895] PREEMPT SMP NR_CPUS=16 DEBUG_PAGEALLOC NUMA pSeries [ 0.099922] Modules linked in: [ 0.099940] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.10.0-rc1-00120-gb973425-dirty #16 [ 0.099956] task: c0000001fed80000 ti: c0000001fed7c000 task.ti: c0000001fed7c000 [ 0.099971] NIP: c0000000000b7a64 LR: c0000000000b7a40 CTR: c0000000000b4934 [ 0.099985] REGS: c0000001fed7f760 TRAP: 0300 Not tainted (3.10.0-rc1-00120-gb973425-dirty) [ 0.099997] MSR: 8000000000009032 CR: 24272828 XER: 20000003 [ 0.100045] SOFTE: 1 [ 0.100053] CFAR: c000000000445ee8 [ 0.100064] DAR: c00000038518078f, DSISR: 40000000 [ 0.100073] GPR00: 0000000000000080 c0000001fed7f9e0 c000000000c84d48 0000000000000010 GPR04: 0000000000000010 0000000000000000 c0000001fc55e090 0000000000000000 GPR08: ffffffffffffffff c000000000b80b30 c000000000c962d8 00000003845ffc5f GPR12: 0000000000000000 c00000000f33d000 c00000000000b9e4 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000001 0000000000000000 GPR20: c000000000ccf750 0000000000000000 c000000000c94d48 c0000001fc504000 GPR24: c0000001fc504000 c0000001fecef848 c000000000c94d48 c000000000ccf000 GPR28: c0000001fc522090 0000000000000010 c0000001fecef848 c0000001fed7fae0 [ 0.100293] NIP [c0000000000b7a64] .get_group+0x84/0xc4 [ 0.100307] LR [c0000000000b7a40] .get_group+0x60/0xc4 [ 0.100318] Call Trace: [ 0.100332] [c0000001fed7f9e0] [c0000000000dbce4] .lock_is_held+0xa8/0xd0 (unreliable) [ 0.100354] [c0000001fed7fa70] [c0000000000bf62c] .build_sched_domains+0x728/0xd14 [ 0.100375] [c0000001fed7fbe0] [c000000000af67bc] .sched_init_smp+0x4fc/0x654 [ 0.100394] [c0000001fed7fce0] [c000000000adce24] .kernel_init_freeable+0x17c/0x30c [ 0.100413] [c0000001fed7fdb0] [c00000000000ba08] .kernel_init+0x24/0x12c [ 0.100431] [c0000001fed7fe30] [c000000000009f74] .ret_from_kernel_thread+0x5c/0x68 [ 0.100445] Instruction dump: [ 0.100456] 38800010 38a00000 4838e3f5 60000000 7c6307b4 2fbf0000 419e0040 3d220001 [ 0.100496] 78601f24 39491590 e93e0008 7d6a002a <7d69582a> f97f0000 7d4a002a e93e0010 [ 0.100559] ---[ end trace 31fd0ba7d8756001 ]--- This patch tries to move the sibling maps updating before notify_cpu_starting() and cpu online, and a write barrier there to make sure sibling maps are updated before active and online mask. Signed-off-by: Li Zhong Reviewed-by: Srivatsa S. Bhat Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/smp.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 85398c71665c..38b0ba65a735 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -637,12 +637,10 @@ void start_secondary(void *unused) vdso_getcpu_init(); #endif - notify_cpu_starting(cpu); - set_cpu_online(cpu, true); /* Update sibling maps */ base = cpu_first_thread_sibling(cpu); for (i = 0; i < threads_per_core; i++) { - if (cpu_is_offline(base + i)) + if (cpu_is_offline(base + i) && (cpu != base + i)) continue; cpumask_set_cpu(cpu, cpu_sibling_mask(base + i)); cpumask_set_cpu(base + i, cpu_sibling_mask(cpu)); @@ -667,6 +665,10 @@ void start_secondary(void *unused) } of_node_put(l2_cache); + smp_wmb(); + notify_cpu_starting(cpu); + set_cpu_online(cpu, true); + local_irq_enable(); cpu_startup_entry(CPUHP_ONLINE); From 7029705a9d0544186c29ae09708b3e5adb512835 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Tue, 21 May 2013 17:20:50 +0800 Subject: [PATCH 124/156] powerpc/nvram64: Need return the related error code on failure occurs When error occurs, need return the related error code to let upper caller know about it. ppc_md.nvram_size() can return the error code (e.g. core99_nvram_size() in 'arch/powerpc/platforms/powermac/nvram.c'). Also set ret value when only need it, so can save structions for normal cases. Signed-off-by: Chen Gang Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/nvram_64.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 48fbc2b97e95..8213ee1eb05a 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -84,22 +84,30 @@ static ssize_t dev_nvram_read(struct file *file, char __user *buf, char *tmp = NULL; ssize_t size; - ret = -ENODEV; - if (!ppc_md.nvram_size) + if (!ppc_md.nvram_size) { + ret = -ENODEV; goto out; + } - ret = 0; size = ppc_md.nvram_size(); - if (*ppos >= size || size < 0) + if (size < 0) { + ret = size; goto out; + } + + if (*ppos >= size) { + ret = 0; + goto out; + } count = min_t(size_t, count, size - *ppos); count = min(count, PAGE_SIZE); - ret = -ENOMEM; tmp = kmalloc(count, GFP_KERNEL); - if (!tmp) + if (!tmp) { + ret = -ENOMEM; goto out; + } ret = ppc_md.nvram_read(tmp, count, ppos); if (ret <= 0) From 91f5af2e6524f795c0ee5d2fdc3be40d0e427ae2 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 21 May 2013 20:45:40 +0200 Subject: [PATCH 125/156] macintosh/windfarm: Remove obsolete cleanup for clientdata A few new i2c-drivers came into the kernel which clear the clientdata-pointer on exit or error. This is obsolete meanwhile, the core will do it. Signed-off-by: Wolfram Sang Signed-off-by: Benjamin Herrenschmidt --- drivers/macintosh/windfarm_smu_sat.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/macintosh/windfarm_smu_sat.c b/drivers/macintosh/windfarm_smu_sat.c index d87f5ee04ca9..ad6223e88340 100644 --- a/drivers/macintosh/windfarm_smu_sat.c +++ b/drivers/macintosh/windfarm_smu_sat.c @@ -343,7 +343,6 @@ static int wf_sat_remove(struct i2c_client *client) wf_unregister_sensor(&sens->sens); } sat->i2c = NULL; - i2c_set_clientdata(client, NULL); kref_put(&sat->ref, wf_sat_release); return 0; From 8246aca7058f3f2c2ae503081777965cd8df7b90 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 20 Mar 2013 14:30:12 +0800 Subject: [PATCH 126/156] powerpc/smp: Section mismatch from smp_release_cpus to __initdata spinning_secondaries the smp_release_cpus is a normal funciton and called in normal environments, but it calls the __initdata spinning_secondaries. need modify spinning_secondaries to match smp_release_cpus. the related warning: (the linker report boot_paca.33377, but it should be spinning_secondaries) ----------------------------------------------------------------------------- WARNING: arch/powerpc/kernel/built-in.o(.text+0x23176): Section mismatch in reference from the function .smp_release_cpus() to the variable .init.data:boot_paca.33377 The function .smp_release_cpus() references the variable __initdata boot_paca.33377. This is often because .smp_release_cpus lacks a __initdata annotation or the annotation of boot_paca.33377 is wrong. WARNING: arch/powerpc/kernel/built-in.o(.text+0x231fe): Section mismatch in reference from the function .smp_release_cpus() to the variable .init.data:boot_paca.33377 The function .smp_release_cpus() references the variable __initdata boot_paca.33377. This is often because .smp_release_cpus lacks a __initdata annotation or the annotation of boot_paca.33377 is wrong. ----------------------------------------------------------------------------- Signed-off-by: Chen Gang CC: Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/setup_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index e379d3fd1694..389fb8077cc9 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -76,7 +76,7 @@ #endif int boot_cpuid = 0; -int __initdata spinning_secondaries; +int spinning_secondaries; u64 ppc64_pft_size; /* Pick defaults since we might want to patch instructions From ec207dcc91c62aeccb65b36c7764076f5e5aaddb Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 28 Jun 2013 21:12:14 +0800 Subject: [PATCH 127/156] powerpc/eeh: Update MAINTAINERS Update MAINTAINERS to reflect recent changes. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- MAINTAINERS | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5be702cc8449..ebeceeb9dfab 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3103,6 +3103,13 @@ M: Maxim Levitsky S: Maintained F: drivers/media/rc/ene_ir.* +ENHANCED ERROR HANDLING (EEH) +M: Gavin Shan +L: linuxppc-dev@lists.ozlabs.org +S: Supported +F: Documentation/powerpc/eeh-pci-error-recovery.txt +F: arch/powerpc/kernel/eeh*.c + EPSON S1D13XXX FRAMEBUFFER DRIVER M: Kristoffer Ericson S: Maintained @@ -6149,7 +6156,6 @@ M: Linas Vepstas L: linux-pci@vger.kernel.org S: Supported F: Documentation/PCI/pci-error-recovery.txt -F: Documentation/powerpc/eeh-pci-error-recovery.txt PCI SUBSYSTEM M: Bjorn Helgaas From dd023217e17e72b46fb4d49c7734c426938c3dba Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 24 Jun 2013 22:08:05 -0500 Subject: [PATCH 128/156] powerpc/numa: Do not update sysfs cpu registration from invalid context The topology update code that updates the cpu node registration in sysfs should not be called while in stop_machine(). The register/unregister calls take a lock and may sleep. This patch moves these calls outside of the call to stop_machine(). Signed-off-by: Nathan Fontenot CC: Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/numa.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index c792cd95e792..08397217e8ac 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1432,11 +1432,9 @@ static int update_cpu_topology(void *data) if (cpu != update->cpu) continue; - unregister_cpu_under_node(update->cpu, update->old_nid); unmap_cpu_from_node(update->cpu); map_cpu_to_node(update->cpu, update->new_nid); vdso_getcpu_init(); - register_cpu_under_node(update->cpu, update->new_nid); } return 0; @@ -1484,6 +1482,9 @@ int arch_update_cpu_topology(void) stop_machine(update_cpu_topology, &updates[0], &updated_cpus); for (ud = &updates[0]; ud; ud = ud->next) { + unregister_cpu_under_node(ud->cpu, ud->old_nid); + register_cpu_under_node(ud->cpu, ud->new_nid); + dev = get_cpu_device(ud->cpu); if (dev) kobject_uevent(&dev->kobj, KOBJ_CHANGE); From 1d567cb4bd42d560a7621cac6f6aebe87343689e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 25 Jun 2013 17:47:54 +1000 Subject: [PATCH 129/156] powerpc: Remove unreachable relocation on exception handlers We have relocation on exception handlers defined for h_data_storage and h_instr_storage. However we will never take relocation on exceptions for these because they can only come from a guest, and we never take relocation on exceptions when we transition from guest to host. We also have a handler for hmi_exception (Hypervisor Maintenance) which is defined in the architecture to never be delivered with relocation on, see see v2.07 Book III-S section 6.5. So remove the handlers, leaving a branch to self just to be double extra paranoid. Signed-off-by: Michael Ellerman CC: [v3.9+] Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/exceptions-64s.S | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index e783453f910d..dcadf03814e1 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -793,14 +793,10 @@ system_call_relon_pSeries: STD_RELON_EXCEPTION_PSERIES(0x4d00, 0xd00, single_step) . = 0x4e00 - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXGEN) - b h_data_storage_relon_hv + b . /* Can't happen, see v2.07 Book III-S section 6.5 */ . = 0x4e20 - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXGEN) - b h_instr_storage_relon_hv + b . /* Can't happen, see v2.07 Book III-S section 6.5 */ . = 0x4e40 SET_SCRATCH0(r13) @@ -808,9 +804,7 @@ system_call_relon_pSeries: b emulation_assist_relon_hv . = 0x4e60 - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXGEN) - b hmi_exception_relon_hv + b . /* Can't happen, see v2.07 Book III-S section 6.5 */ . = 0x4e80 SET_SCRATCH0(r13) @@ -1180,14 +1174,8 @@ tm_unavailable_common: __end_handlers: /* Equivalents to the above handlers for relocation-on interrupt vectors */ - STD_RELON_EXCEPTION_HV_OOL(0xe00, h_data_storage) - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe00) - STD_RELON_EXCEPTION_HV_OOL(0xe20, h_instr_storage) - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe20) STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe40) - STD_RELON_EXCEPTION_HV_OOL(0xe60, hmi_exception) - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe60) MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe80) From c9f69518e5f08170bc857984a077f693d63171df Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 25 Jun 2013 17:47:55 +1000 Subject: [PATCH 130/156] powerpc: Remove KVMTEST from RELON exception handlers KVMTEST is a macro which checks whether we are taking an exception from guest context, if so we branch out of line and eventually call into the KVM code to handle the switch. When running real guests on bare metal (HV KVM) the hardware ensures that we never take a relocation on exception when transitioning from guest to host. For PR KVM we disable relocation on exceptions ourself in kvmppc_core_init_vm(), as of commit a413f47 "Disable relocation on exceptions whenever PR KVM is active". So convert all the RELON macros to use NOTEST, and drop the remaining KVM_HANDLER() definitions we have for 0xe40 and 0xe80. Signed-off-by: Michael Ellerman CC: [v3.9+] Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/exception-64s.h | 8 ++++---- arch/powerpc/kernel/exceptions-64s.S | 2 -- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 8e5fae8beaf6..484cb094ad42 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -358,12 +358,12 @@ label##_relon_pSeries: \ /* No guest interrupts come through here */ \ SET_SCRATCH0(r13); /* save r13 */ \ EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label##_common, \ - EXC_STD, KVMTEST_PR, vec) + EXC_STD, NOTEST, vec) #define STD_RELON_EXCEPTION_PSERIES_OOL(vec, label) \ .globl label##_relon_pSeries; \ label##_relon_pSeries: \ - EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, vec); \ + EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec); \ EXCEPTION_RELON_PROLOG_PSERIES_1(label##_common, EXC_STD) #define STD_RELON_EXCEPTION_HV(loc, vec, label) \ @@ -374,12 +374,12 @@ label##_relon_hv: \ /* No guest interrupts come through here */ \ SET_SCRATCH0(r13); /* save r13 */ \ EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label##_common, \ - EXC_HV, KVMTEST, vec) + EXC_HV, NOTEST, vec) #define STD_RELON_EXCEPTION_HV_OOL(vec, label) \ .globl label##_relon_hv; \ label##_relon_hv: \ - EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, vec); \ + EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec); \ EXCEPTION_RELON_PROLOG_PSERIES_1(label##_common, EXC_HV) /* This associate vector numbers with bits in paca->irq_happened */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index dcadf03814e1..89eba1fb5bbe 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1175,9 +1175,7 @@ __end_handlers: /* Equivalents to the above handlers for relocation-on interrupt vectors */ STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe40) MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe80) STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor) STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable) From 021424a1fce335e05807fd770eb8e1da30a63eea Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 25 Jun 2013 17:47:56 +1000 Subject: [PATCH 131/156] powerpc: Rename and flesh out the facility unavailable exception handler The exception at 0xf60 is not the TM (Transactional Memory) unavailable exception, it is the "Facility Unavailable Exception", rename it as such. Flesh out the handler to acknowledge the fact that it can be called for many reasons, one of which is TM being unavailable. Use STD_EXCEPTION_COMMON() for the exception body, for some reason we had it open-coded, I've checked the generated code is identical. Signed-off-by: Michael Ellerman CC: [v3.10] Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/exceptions-64s.S | 21 ++++++------------ arch/powerpc/kernel/traps.c | 33 +++++++++++++++++++++------- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 89eba1fb5bbe..6ee74f8bae2f 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -341,10 +341,11 @@ vsx_unavailable_pSeries_1: EXCEPTION_PROLOG_0(PACA_EXGEN) b vsx_unavailable_pSeries +facility_unavailable_trampoline: . = 0xf60 SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXGEN) - b tm_unavailable_pSeries + b facility_unavailable_pSeries #ifdef CONFIG_CBE_RAS STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error) @@ -522,7 +523,7 @@ denorm_done: KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20) STD_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40) - STD_EXCEPTION_PSERIES_OOL(0xf60, tm_unavailable) + STD_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf60) /* @@ -829,11 +830,11 @@ vsx_unavailable_relon_pSeries_1: EXCEPTION_PROLOG_0(PACA_EXGEN) b vsx_unavailable_relon_pSeries -tm_unavailable_relon_pSeries_1: +facility_unavailable_relon_trampoline: . = 0x4f60 SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXGEN) - b tm_unavailable_relon_pSeries + b facility_unavailable_relon_pSeries STD_RELON_EXCEPTION_PSERIES(0x5300, 0x1300, instruction_breakpoint) #ifdef CONFIG_PPC_DENORMALISATION @@ -1159,15 +1160,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) bl .vsx_unavailable_exception b .ret_from_except - .align 7 - .globl tm_unavailable_common -tm_unavailable_common: - EXCEPTION_PROLOG_COMMON(0xf60, PACA_EXGEN) - bl .save_nvgprs - DISABLE_INTS - addi r3,r1,STACK_FRAME_OVERHEAD - bl .tm_unavailable_exception - b .ret_from_except + STD_EXCEPTION_COMMON(0xf60, facility_unavailable, .facility_unavailable_exception) .align 7 .globl __end_handlers @@ -1180,7 +1173,7 @@ __end_handlers: STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor) STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable) STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) - STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, tm_unavailable) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 300daf3e7ab0..11f448812dc1 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1286,25 +1286,42 @@ void vsx_unavailable_exception(struct pt_regs *regs) die("Unrecoverable VSX Unavailable Exception", regs, SIGABRT); } -void tm_unavailable_exception(struct pt_regs *regs) +void facility_unavailable_exception(struct pt_regs *regs) { + static char *facility_strings[] = { + "FPU", + "VMX/VSX", + "DSCR", + "PMU SPRs", + "BHRB", + "TM", + "AT", + "EBB", + "TAR", + }; + char *facility; + u64 value; + + value = mfspr(SPRN_FSCR) >> 56; + /* We restore the interrupt state now */ if (!arch_irq_disabled_regs(regs)) local_irq_enable(); - /* Currently we never expect a TMU exception. Catch - * this and kill the process! - */ - printk(KERN_EMERG "Unexpected TM unavailable exception at %lx " - "(msr %lx)\n", - regs->nip, regs->msr); + if (value < ARRAY_SIZE(facility_strings)) + facility = facility_strings[value]; + else + facility = "unknown"; + + pr_err("Facility '%s' unavailable, exception at 0x%lx, MSR=%lx\n", + facility, regs->nip, regs->msr); if (user_mode(regs)) { _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); return; } - die("Unexpected TM unavailable exception", regs, SIGABRT); + die("Unexpected facility unavailable exception", regs, SIGABRT); } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM From b14b6260efeee6eb8942c6e6420e31281892acb6 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 25 Jun 2013 17:47:57 +1000 Subject: [PATCH 132/156] powerpc: Wire up the HV facility unavailable exception Similar to the facility unavailble exception, except the facilities are controlled by HFSCR. Adapt the facility_unavailable_exception() so it can be called for either the regular or Hypervisor facility unavailable exceptions. Signed-off-by: Michael Ellerman CC: [v3.10] Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/exceptions-64s.S | 15 +++++++++++++++ arch/powerpc/kernel/traps.c | 16 ++++++++++++---- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 6ee74f8bae2f..359d949f4a4b 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -347,6 +347,12 @@ facility_unavailable_trampoline: EXCEPTION_PROLOG_0(PACA_EXGEN) b facility_unavailable_pSeries +hv_facility_unavailable_trampoline: + . = 0xf80 + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b facility_unavailable_hv + #ifdef CONFIG_CBE_RAS STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error) KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202) @@ -525,6 +531,8 @@ denorm_done: KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40) STD_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf60) + STD_EXCEPTION_HV_OOL(0xf82, facility_unavailable) + KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xf82) /* * An interrupt came in while soft-disabled. We set paca->irq_happened, then: @@ -836,6 +844,12 @@ facility_unavailable_relon_trampoline: EXCEPTION_PROLOG_0(PACA_EXGEN) b facility_unavailable_relon_pSeries +hv_facility_unavailable_relon_trampoline: + . = 0x4f80 + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b facility_unavailable_relon_hv + STD_RELON_EXCEPTION_PSERIES(0x5300, 0x1300, instruction_breakpoint) #ifdef CONFIG_PPC_DENORMALISATION . = 0x5500 @@ -1174,6 +1188,7 @@ __end_handlers: STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable) STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) + STD_RELON_EXCEPTION_HV_OOL(0xf80, facility_unavailable) #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 11f448812dc1..4a87fcbe3ba9 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1299,10 +1299,18 @@ void facility_unavailable_exception(struct pt_regs *regs) "EBB", "TAR", }; - char *facility; + char *facility, *prefix; u64 value; - value = mfspr(SPRN_FSCR) >> 56; + if (regs->trap == 0xf60) { + value = mfspr(SPRN_FSCR); + prefix = ""; + } else { + value = mfspr(SPRN_HFSCR); + prefix = "Hypervisor "; + } + + value = value >> 56; /* We restore the interrupt state now */ if (!arch_irq_disabled_regs(regs)) @@ -1313,8 +1321,8 @@ void facility_unavailable_exception(struct pt_regs *regs) else facility = "unknown"; - pr_err("Facility '%s' unavailable, exception at 0x%lx, MSR=%lx\n", - facility, regs->nip, regs->msr); + pr_err("%sFacility '%s' unavailable, exception at 0x%lx, MSR=%lx\n", + prefix, facility, regs->nip, regs->msr); if (user_mode(regs)) { _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); From d8bec4c9cd58f6d3679e09b7293851fb92ad7557 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 28 Jun 2013 18:15:10 +1000 Subject: [PATCH 133/156] powerpc/perf: Check that events only include valid bits on Power8 A mistake we have made in the past is that we pull out the fields we need from the event code, but don't check that there are no unknown bits set. This means that we can't ever assign meaning to those unknown bits in future. Although we have once again failed to do this at release, it is still early days for Power8 so I think we can still slip this in and get away with it. Signed-off-by: Michael Ellerman CC: [v3.10] Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/perf/power8-pmu.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c index f7d1c4fff303..84cdc6d892e3 100644 --- a/arch/powerpc/perf/power8-pmu.c +++ b/arch/powerpc/perf/power8-pmu.c @@ -109,6 +109,16 @@ #define EVENT_IS_MARKED (EVENT_MARKED_MASK << EVENT_MARKED_SHIFT) #define EVENT_PSEL_MASK 0xff /* PMCxSEL value */ +#define EVENT_VALID_MASK \ + ((EVENT_THRESH_MASK << EVENT_THRESH_SHIFT) | \ + (EVENT_SAMPLE_MASK << EVENT_SAMPLE_SHIFT) | \ + (EVENT_CACHE_SEL_MASK << EVENT_CACHE_SEL_SHIFT) | \ + (EVENT_PMC_MASK << EVENT_PMC_SHIFT) | \ + (EVENT_UNIT_MASK << EVENT_UNIT_SHIFT) | \ + (EVENT_COMBINE_MASK << EVENT_COMBINE_SHIFT) | \ + (EVENT_MARKED_MASK << EVENT_MARKED_SHIFT) | \ + EVENT_PSEL_MASK) + /* MMCRA IFM bits - POWER8 */ #define POWER8_MMCRA_IFM1 0x0000000040000000UL #define POWER8_MMCRA_IFM2 0x0000000080000000UL @@ -212,6 +222,9 @@ static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long mask = value = 0; + if (event & ~EVENT_VALID_MASK) + return -1; + pmc = (event >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK; unit = (event >> EVENT_UNIT_SHIFT) & EVENT_UNIT_MASK; cache = (event >> EVENT_CACHE_SEL_SHIFT) & EVENT_CACHE_SEL_MASK; From 378a6ee99e4a431ec84e4e61893445c041c93007 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 28 Jun 2013 18:15:11 +1000 Subject: [PATCH 134/156] powerpc/perf: Rework disable logic in pmu_disable() In pmu_disable() we disable the PMU by setting the FC (Freeze Counters) bit in MMCR0. In order to do this we have to read/modify/write MMCR0. It's possible that we read a value from MMCR0 which has PMAO (PMU Alert Occurred) set. When we write that value back it will cause an interrupt to occur. We will then end up in the PMU interrupt handler even though we are supposed to have just disabled the PMU. We can avoid this by making sure we never write PMAO back. We should not lose interrupts because when the PMU is re-enabled the overflowed values will cause another interrupt. We also reorder the clearing of SAMPLE_ENABLE so that is done after the PMU is frozen. Otherwise there is a small window between the clearing of SAMPLE_ENABLE and the setting of FC where we could take an interrupt and incorrectly see SAMPLE_ENABLE not set. This would for example change the logic in perf_read_regs(). Signed-off-by: Michael Ellerman CC: [v3.10] Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/perf/core-book3s.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index af94a717c451..5d502bf374ea 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -75,6 +75,7 @@ static unsigned int freeze_events_kernel = MMCR0_FCS; #define MMCR0_FCHV 0 #define MMCR0_PMCjCE MMCR0_PMCnCE +#define MMCR0_PMAO 0 #define SPRN_MMCRA SPRN_MMCR2 #define MMCRA_SAMPLE_ENABLE 0 @@ -852,7 +853,7 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0) static void power_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuhw; - unsigned long flags; + unsigned long flags, val; if (!ppmu) return; @@ -860,9 +861,6 @@ static void power_pmu_disable(struct pmu *pmu) cpuhw = &__get_cpu_var(cpu_hw_events); if (!cpuhw->disabled) { - cpuhw->disabled = 1; - cpuhw->n_added = 0; - /* * Check if we ever enabled the PMU on this cpu. */ @@ -871,6 +869,21 @@ static void power_pmu_disable(struct pmu *pmu) cpuhw->pmcs_enabled = 1; } + /* + * Set the 'freeze counters' bit, clear PMAO. + */ + val = mfspr(SPRN_MMCR0); + val |= MMCR0_FC; + val &= ~MMCR0_PMAO; + + /* + * The barrier is to make sure the mtspr has been + * executed and the PMU has frozen the events etc. + * before we return. + */ + write_mmcr0(cpuhw, val); + mb(); + /* * Disable instruction sampling if it was enabled */ @@ -880,14 +893,8 @@ static void power_pmu_disable(struct pmu *pmu) mb(); } - /* - * Set the 'freeze counters' bit. - * The barrier is to make sure the mtspr has been - * executed and the PMU has frozen the events - * before we return. - */ - write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC); - mb(); + cpuhw->disabled = 1; + cpuhw->n_added = 0; } local_irq_restore(flags); } From 7a7a41f9d5b28ac3a916b057a7d3cd3f435ee9a6 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 28 Jun 2013 18:15:12 +1000 Subject: [PATCH 135/156] powerpc/perf: Freeze PMC5/6 if we're not using them On Power8 we can freeze PMC5 and 6 if we're not using them. Normally they run all the time. As noticed by Anshuman, we should unfreeze them when we disable the PMU as there are legacy tools which expect them to run all the time. Signed-off-by: Michael Ellerman CC: [v3.10] Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/perf/core-book3s.c | 5 +++-- arch/powerpc/perf/power8-pmu.c | 4 ++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 4a9e408644fe..362142b69d5b 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -626,6 +626,7 @@ #define MMCR0_TRIGGER 0x00002000UL /* TRIGGER enable */ #define MMCR0_PMAO 0x00000080UL /* performance monitor alert has occurred, set to 0 after handling exception */ #define MMCR0_SHRFC 0x00000040UL /* SHRre freeze conditions between threads */ +#define MMCR0_FC56 0x00000010UL /* freeze counters 5 and 6 */ #define MMCR0_FCTI 0x00000008UL /* freeze counters in tags inactive mode */ #define MMCR0_FCTA 0x00000004UL /* freeze counters in tags active mode */ #define MMCR0_FCWAIT 0x00000002UL /* freeze counter in WAIT state */ diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 5d502bf374ea..517a1350b09c 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -75,6 +75,7 @@ static unsigned int freeze_events_kernel = MMCR0_FCS; #define MMCR0_FCHV 0 #define MMCR0_PMCjCE MMCR0_PMCnCE +#define MMCR0_FC56 0 #define MMCR0_PMAO 0 #define SPRN_MMCRA SPRN_MMCR2 @@ -870,11 +871,11 @@ static void power_pmu_disable(struct pmu *pmu) } /* - * Set the 'freeze counters' bit, clear PMAO. + * Set the 'freeze counters' bit, clear PMAO/FC56. */ val = mfspr(SPRN_MMCR0); val |= MMCR0_FC; - val &= ~MMCR0_PMAO; + val &= ~(MMCR0_PMAO | MMCR0_FC56); /* * The barrier is to make sure the mtspr has been diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c index 84cdc6d892e3..d59f5b2d4c2f 100644 --- a/arch/powerpc/perf/power8-pmu.c +++ b/arch/powerpc/perf/power8-pmu.c @@ -391,6 +391,10 @@ static int power8_compute_mmcr(u64 event[], int n_ev, if (pmc_inuse & 0x7c) mmcr[0] |= MMCR0_PMCjCE; + /* If we're not using PMC 5 or 6, freeze them */ + if (!(pmc_inuse & 0x60)) + mmcr[0] |= MMCR0_FC56; + mmcr[1] = mmcr1; mmcr[2] = mmcra; From 0a48843d6c5114cfa4a9540ee4d6af87628cec01 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 28 Jun 2013 18:15:13 +1000 Subject: [PATCH 136/156] powerpc/perf: Use existing out label in power_pmu_enable() In power_pmu_enable() we can use the existing out label to reduce the number of return paths. Signed-off-by: Michael Ellerman CC: [v3.10] Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/perf/core-book3s.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 517a1350b09c..1bb26d586e3c 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -919,12 +919,13 @@ static void power_pmu_enable(struct pmu *pmu) if (!ppmu) return; + local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_events); - if (!cpuhw->disabled) { - local_irq_restore(flags); - return; - } + if (!cpuhw->disabled) + goto out; + cpuhw->disabled = 0; /* From 4ea355b5368bde0574c12430df53334c4be3bdcf Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 28 Jun 2013 18:15:14 +1000 Subject: [PATCH 137/156] powerpc/perf: Don't enable if we have zero events In power_pmu_enable() we still enable the PMU even if we have zero events. This should have no effect but doesn't make much sense. Instead just return after telling the hypervisor that we are not using the PMCs. Signed-off-by: Michael Ellerman CC: [v3.10] Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/perf/core-book3s.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 1bb26d586e3c..c91dc43e04de 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -926,6 +926,11 @@ static void power_pmu_enable(struct pmu *pmu) if (!cpuhw->disabled) goto out; + if (cpuhw->n_events == 0) { + ppc_set_pmu_inuse(0); + goto out; + } + cpuhw->disabled = 0; /* @@ -937,8 +942,6 @@ static void power_pmu_enable(struct pmu *pmu) if (!cpuhw->n_added) { mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); - if (cpuhw->n_events == 0) - ppc_set_pmu_inuse(0); goto out_enable; } From 2ac138ca21ad26c988ce7c91d27327f85beb7519 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 28 Jun 2013 18:15:15 +1000 Subject: [PATCH 138/156] powerpc/perf: Drop MMCRA from thread_struct In commit 59affcd "Context switch more PMU related SPRs" I added more PMU SPRs to thread_struct, later modified in commit b11ae95. To add insult to injury it turns out we don't need to switch MMCRA as it's only user readable, and the value is recomputed by the PMU code. Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/processor.h | 1 - arch/powerpc/kernel/asm-offsets.c | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 9fe1129efa02..3f19df3cc7a3 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -289,7 +289,6 @@ struct thread_struct { unsigned long sier; unsigned long mmcr0; unsigned long mmcr2; - unsigned long mmcra; #endif }; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index dc684b1af1c4..c7e8afc2ead0 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -132,7 +132,6 @@ int main(void) DEFINE(THREAD_SIER, offsetof(struct thread_struct, sier)); DEFINE(THREAD_MMCR0, offsetof(struct thread_struct, mmcr0)); DEFINE(THREAD_MMCR2, offsetof(struct thread_struct, mmcr2)); - DEFINE(THREAD_MMCRA, offsetof(struct thread_struct, mmcra)); #endif #ifdef CONFIG_PPC_TRANSACTIONAL_MEM DEFINE(PACATMSCRATCH, offsetof(struct paca_struct, tm_scratch)); From 330a1eb7775ba876dbd46b9885556e57f705e3d4 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 28 Jun 2013 18:15:16 +1000 Subject: [PATCH 139/156] powerpc/perf: Core EBB support for 64-bit book3s Add support for EBB (Event Based Branches) on 64-bit book3s. See the included documentation for more details. EBBs are a feature which allows the hardware to branch directly to a specified user space address when a PMU event overflows. This can be used by programs for self-monitoring with no kernel involvement in the inner loop. Most of the logic is in the generic book3s code, primarily to avoid a proliferation of PMU callbacks. Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- Documentation/powerpc/00-INDEX | 2 + Documentation/powerpc/pmu-ebb.txt | 137 ++++++++++++++++ arch/powerpc/include/asm/perf_event_server.h | 6 + arch/powerpc/include/asm/processor.h | 3 +- arch/powerpc/include/asm/reg.h | 8 + arch/powerpc/include/asm/switch_to.h | 14 ++ arch/powerpc/kernel/process.c | 4 + arch/powerpc/perf/core-book3s.c | 161 +++++++++++++++++-- 8 files changed, 321 insertions(+), 14 deletions(-) create mode 100644 Documentation/powerpc/pmu-ebb.txt diff --git a/Documentation/powerpc/00-INDEX b/Documentation/powerpc/00-INDEX index dd9e92802ec0..05026ce1875e 100644 --- a/Documentation/powerpc/00-INDEX +++ b/Documentation/powerpc/00-INDEX @@ -14,6 +14,8 @@ hvcs.txt - IBM "Hypervisor Virtual Console Server" Installation Guide mpc52xx.txt - Linux 2.6.x on MPC52xx family +pmu-ebb.txt + - Description of the API for using the PMU with Event Based Branches. qe_firmware.txt - describes the layout of firmware binaries for the Freescale QUICC Engine and the code that parses and uploads the microcode therein. diff --git a/Documentation/powerpc/pmu-ebb.txt b/Documentation/powerpc/pmu-ebb.txt new file mode 100644 index 000000000000..73cd163dbfb8 --- /dev/null +++ b/Documentation/powerpc/pmu-ebb.txt @@ -0,0 +1,137 @@ +PMU Event Based Branches +======================== + +Event Based Branches (EBBs) are a feature which allows the hardware to +branch directly to a specified user space address when certain events occur. + +The full specification is available in Power ISA v2.07: + + https://www.power.org/documentation/power-isa-version-2-07/ + +One type of event for which EBBs can be configured is PMU exceptions. This +document describes the API for configuring the Power PMU to generate EBBs, +using the Linux perf_events API. + + +Terminology +----------- + +Throughout this document we will refer to an "EBB event" or "EBB events". This +just refers to a struct perf_event which has set the "EBB" flag in its +attr.config. All events which can be configured on the hardware PMU are +possible "EBB events". + + +Background +---------- + +When a PMU EBB occurs it is delivered to the currently running process. As such +EBBs can only sensibly be used by programs for self-monitoring. + +It is a feature of the perf_events API that events can be created on other +processes, subject to standard permission checks. This is also true of EBB +events, however unless the target process enables EBBs (via mtspr(BESCR)) no +EBBs will ever be delivered. + +This makes it possible for a process to enable EBBs for itself, but not +actually configure any events. At a later time another process can come along +and attach an EBB event to the process, which will then cause EBBs to be +delivered to the first process. It's not clear if this is actually useful. + + +When the PMU is configured for EBBs, all PMU interrupts are delivered to the +user process. This means once an EBB event is scheduled on the PMU, no non-EBB +events can be configured. This means that EBB events can not be run +concurrently with regular 'perf' commands, or any other perf events. + +It is however safe to run 'perf' commands on a process which is using EBBs. The +kernel will in general schedule the EBB event, and perf will be notified that +its events could not run. + +The exclusion between EBB events and regular events is implemented using the +existing "pinned" and "exclusive" attributes of perf_events. This means EBB +events will be given priority over other events, unless they are also pinned. +If an EBB event and a regular event are both pinned, then whichever is enabled +first will be scheduled and the other will be put in error state. See the +section below titled "Enabling an EBB event" for more information. + + +Creating an EBB event +--------------------- + +To request that an event is counted using EBB, the event code should have bit +63 set. + +EBB events must be created with a particular, and restrictive, set of +attributes - this is so that they interoperate correctly with the rest of the +perf_events subsystem. + +An EBB event must be created with the "pinned" and "exclusive" attributes set. +Note that if you are creating a group of EBB events, only the leader can have +these attributes set. + +An EBB event must NOT set any of the "inherit", "sample_period", "freq" or +"enable_on_exec" attributes. + +An EBB event must be attached to a task. This is specified to perf_event_open() +by passing a pid value, typically 0 indicating the current task. + +All events in a group must agree on whether they want EBB. That is all events +must request EBB, or none may request EBB. + +EBB events must specify the PMC they are to be counted on. This ensures +userspace is able to reliably determine which PMC the event is scheduled on. + + +Enabling an EBB event +--------------------- + +Once an EBB event has been successfully opened, it must be enabled with the +perf_events API. This can be achieved either via the ioctl() interface, or the +prctl() interface. + +However, due to the design of the perf_events API, enabling an event does not +guarantee that it has been scheduled on the PMU. To ensure that the EBB event +has been scheduled on the PMU, you must perform a read() on the event. If the +read() returns EOF, then the event has not been scheduled and EBBs are not +enabled. + +This behaviour occurs because the EBB event is pinned and exclusive. When the +EBB event is enabled it will force all other non-pinned events off the PMU. In +this case the enable will be successful. However if there is already an event +pinned on the PMU then the enable will not be successful. + + +Reading an EBB event +-------------------- + +It is possible to read() from an EBB event. However the results are +meaningless. Because interrupts are being delivered to the user process the +kernel is not able to count the event, and so will return a junk value. + + +Closing an EBB event +-------------------- + +When an EBB event is finished with, you can close it using close() as for any +regular event. If this is the last EBB event the PMU will be deconfigured and +no further PMU EBBs will be delivered. + + +EBB Handler +----------- + +The EBB handler is just regular userspace code, however it must be written in +the style of an interrupt handler. When the handler is entered all registers +are live (possibly) and so must be saved somehow before the handler can invoke +other code. + +It's up to the program how to handle this. For C programs a relatively simple +option is to create an interrupt frame on the stack and save registers there. + +Fork +---- + +EBB events are not inherited across fork. If the child process wishes to use +EBBs it should open a new event for itself. Similarly the EBB state in +BESCR/EBBHR/EBBRR is cleared across fork(). diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h index f265049dd7d6..2dd7bfc459be 100644 --- a/arch/powerpc/include/asm/perf_event_server.h +++ b/arch/powerpc/include/asm/perf_event_server.h @@ -60,6 +60,7 @@ struct power_pmu { #define PPMU_HAS_SSLOT 0x00000020 /* Has sampled slot in MMCRA */ #define PPMU_HAS_SIER 0x00000040 /* Has SIER */ #define PPMU_BHRB 0x00000080 /* has BHRB feature enabled */ +#define PPMU_EBB 0x00000100 /* supports event based branch */ /* * Values for flags to get_alternatives() @@ -68,6 +69,11 @@ struct power_pmu { #define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ #define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ +/* + * We use the event config bit 63 as a flag to request EBB. + */ +#define EVENT_CONFIG_EBB_SHIFT 63 + extern int register_power_pmu(struct power_pmu *); struct pt_regs; diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 3f19df3cc7a3..47a35b08b963 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -287,8 +287,9 @@ struct thread_struct { unsigned long siar; unsigned long sdar; unsigned long sier; - unsigned long mmcr0; unsigned long mmcr2; + unsigned mmcr0; + unsigned used_ebb; #endif }; diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 362142b69d5b..5d7d9c2a5473 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -621,6 +621,9 @@ #define MMCR0_PMXE 0x04000000UL /* performance monitor exception enable */ #define MMCR0_FCECE 0x02000000UL /* freeze ctrs on enabled cond or event */ #define MMCR0_TBEE 0x00400000UL /* time base exception enable */ +#define MMCR0_EBE 0x00100000UL /* Event based branch enable */ +#define MMCR0_PMCC 0x000c0000UL /* PMC control */ +#define MMCR0_PMCC_U6 0x00080000UL /* PMC1-6 are R/W by user (PR) */ #define MMCR0_PMC1CE 0x00008000UL /* PMC1 count enable*/ #define MMCR0_PMCjCE 0x00004000UL /* PMCj count enable*/ #define MMCR0_TRIGGER 0x00002000UL /* TRIGGER enable */ @@ -674,6 +677,11 @@ #define SIER_SIAR_VALID 0x0400000 /* SIAR contents valid */ #define SIER_SDAR_VALID 0x0200000 /* SDAR contents valid */ +/* When EBB is enabled, some of MMCR0/MMCR2/SIER are user accessible */ +#define MMCR0_USER_MASK (MMCR0_FC | MMCR0_PMXE | MMCR0_PMAO) +#define MMCR2_USER_MASK 0x4020100804020000UL /* (FC1P|FC2P|FC3P|FC4P|FC5P|FC6P) */ +#define SIER_USER_MASK 0x7fffffUL + #define SPRN_PA6T_MMCR0 795 #define PA6T_MMCR0_EN0 0x0000000000000001UL #define PA6T_MMCR0_EN1 0x0000000000000002UL diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index 200d763a0a67..49a13e0ef234 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -67,4 +67,18 @@ static inline void flush_spe_to_thread(struct task_struct *t) } #endif +static inline void clear_task_ebb(struct task_struct *t) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + /* EBB perf events are not inherited, so clear all EBB state. */ + t->thread.bescr = 0; + t->thread.mmcr2 = 0; + t->thread.mmcr0 = 0; + t->thread.siar = 0; + t->thread.sdar = 0; + t->thread.sier = 0; + t->thread.used_ebb = 0; +#endif +} + #endif /* _ASM_POWERPC_SWITCH_TO_H */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index b0f3e3f77e72..f8a76e6207bd 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -916,7 +916,11 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) flush_altivec_to_thread(src); flush_vsx_to_thread(src); flush_spe_to_thread(src); + *dst = *src; + + clear_task_ebb(dst); + return 0; } diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index c91dc43e04de..a3985aee77fe 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -77,6 +77,9 @@ static unsigned int freeze_events_kernel = MMCR0_FCS; #define MMCR0_PMCjCE MMCR0_PMCnCE #define MMCR0_FC56 0 #define MMCR0_PMAO 0 +#define MMCR0_EBE 0 +#define MMCR0_PMCC 0 +#define MMCR0_PMCC_U6 0 #define SPRN_MMCRA SPRN_MMCR2 #define MMCRA_SAMPLE_ENABLE 0 @@ -104,6 +107,15 @@ static inline int siar_valid(struct pt_regs *regs) return 1; } +static bool is_ebb_event(struct perf_event *event) { return false; } +static int ebb_event_check(struct perf_event *event) { return 0; } +static void ebb_event_add(struct perf_event *event) { } +static void ebb_switch_out(unsigned long mmcr0) { } +static unsigned long ebb_switch_in(bool ebb, unsigned long mmcr0) +{ + return mmcr0; +} + static inline void power_pmu_bhrb_enable(struct perf_event *event) {} static inline void power_pmu_bhrb_disable(struct perf_event *event) {} void power_pmu_flush_branch_stack(void) {} @@ -464,6 +476,89 @@ void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) return; } +static bool is_ebb_event(struct perf_event *event) +{ + /* + * This could be a per-PMU callback, but we'd rather avoid the cost. We + * check that the PMU supports EBB, meaning those that don't can still + * use bit 63 of the event code for something else if they wish. + */ + return (ppmu->flags & PPMU_EBB) && + ((event->attr.config >> EVENT_CONFIG_EBB_SHIFT) & 1); +} + +static int ebb_event_check(struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + + /* Event and group leader must agree on EBB */ + if (is_ebb_event(leader) != is_ebb_event(event)) + return -EINVAL; + + if (is_ebb_event(event)) { + if (!(event->attach_state & PERF_ATTACH_TASK)) + return -EINVAL; + + if (!leader->attr.pinned || !leader->attr.exclusive) + return -EINVAL; + + if (event->attr.inherit || event->attr.sample_period || + event->attr.enable_on_exec || event->attr.freq) + return -EINVAL; + } + + return 0; +} + +static void ebb_event_add(struct perf_event *event) +{ + if (!is_ebb_event(event) || current->thread.used_ebb) + return; + + /* + * IFF this is the first time we've added an EBB event, set + * PMXE in the user MMCR0 so we can detect when it's cleared by + * userspace. We need this so that we can context switch while + * userspace is in the EBB handler (where PMXE is 0). + */ + current->thread.used_ebb = 1; + current->thread.mmcr0 |= MMCR0_PMXE; +} + +static void ebb_switch_out(unsigned long mmcr0) +{ + if (!(mmcr0 & MMCR0_EBE)) + return; + + current->thread.siar = mfspr(SPRN_SIAR); + current->thread.sier = mfspr(SPRN_SIER); + current->thread.sdar = mfspr(SPRN_SDAR); + current->thread.mmcr0 = mmcr0 & MMCR0_USER_MASK; + current->thread.mmcr2 = mfspr(SPRN_MMCR2) & MMCR2_USER_MASK; +} + +static unsigned long ebb_switch_in(bool ebb, unsigned long mmcr0) +{ + if (!ebb) + goto out; + + /* Enable EBB and read/write to all 6 PMCs for userspace */ + mmcr0 |= MMCR0_EBE | MMCR0_PMCC_U6; + + /* Add any bits from the user reg, FC or PMAO */ + mmcr0 |= current->thread.mmcr0; + + /* Be careful not to set PMXE if userspace had it cleared */ + if (!(current->thread.mmcr0 & MMCR0_PMXE)) + mmcr0 &= ~MMCR0_PMXE; + + mtspr(SPRN_SIAR, current->thread.siar); + mtspr(SPRN_SIER, current->thread.sier); + mtspr(SPRN_SDAR, current->thread.sdar); + mtspr(SPRN_MMCR2, current->thread.mmcr2); +out: + return mmcr0; +} #endif /* CONFIG_PPC64 */ static void perf_event_interrupt(struct pt_regs *regs); @@ -734,6 +829,13 @@ static void power_pmu_read(struct perf_event *event) if (!event->hw.idx) return; + + if (is_ebb_event(event)) { + val = read_pmc(event->hw.idx); + local64_set(&event->hw.prev_count, val); + return; + } + /* * Performance monitor interrupts come even when interrupts * are soft-disabled, as long as interrupts are hard-enabled. @@ -854,7 +956,7 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0) static void power_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuhw; - unsigned long flags, val; + unsigned long flags, mmcr0, val; if (!ppmu) return; @@ -871,11 +973,11 @@ static void power_pmu_disable(struct pmu *pmu) } /* - * Set the 'freeze counters' bit, clear PMAO/FC56. + * Set the 'freeze counters' bit, clear EBE/PMCC/PMAO/FC56. */ - val = mfspr(SPRN_MMCR0); + val = mmcr0 = mfspr(SPRN_MMCR0); val |= MMCR0_FC; - val &= ~(MMCR0_PMAO | MMCR0_FC56); + val &= ~(MMCR0_EBE | MMCR0_PMCC | MMCR0_PMAO | MMCR0_FC56); /* * The barrier is to make sure the mtspr has been @@ -896,7 +998,10 @@ static void power_pmu_disable(struct pmu *pmu) cpuhw->disabled = 1; cpuhw->n_added = 0; + + ebb_switch_out(mmcr0); } + local_irq_restore(flags); } @@ -911,15 +1016,15 @@ static void power_pmu_enable(struct pmu *pmu) struct cpu_hw_events *cpuhw; unsigned long flags; long i; - unsigned long val; + unsigned long val, mmcr0; s64 left; unsigned int hwc_index[MAX_HWEVENTS]; int n_lim; int idx; + bool ebb; if (!ppmu) return; - local_irq_save(flags); cpuhw = &__get_cpu_var(cpu_hw_events); @@ -933,6 +1038,13 @@ static void power_pmu_enable(struct pmu *pmu) cpuhw->disabled = 0; + /* + * EBB requires an exclusive group and all events must have the EBB + * flag set, or not set, so we can just check a single event. Also we + * know we have at least one event. + */ + ebb = is_ebb_event(cpuhw->event[0]); + /* * If we didn't change anything, or only removed events, * no need to recalculate MMCR* settings and reset the PMCs. @@ -1008,25 +1120,34 @@ static void power_pmu_enable(struct pmu *pmu) ++n_lim; continue; } - val = 0; - if (event->hw.sample_period) { - left = local64_read(&event->hw.period_left); - if (left < 0x80000000L) - val = 0x80000000L - left; + + if (ebb) + val = local64_read(&event->hw.prev_count); + else { + val = 0; + if (event->hw.sample_period) { + left = local64_read(&event->hw.period_left); + if (left < 0x80000000L) + val = 0x80000000L - left; + } + local64_set(&event->hw.prev_count, val); } - local64_set(&event->hw.prev_count, val); + event->hw.idx = idx; if (event->hw.state & PERF_HES_STOPPED) val = 0; write_pmc(idx, val); + perf_event_update_userpage(event); } cpuhw->n_limited = n_lim; cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; out_enable: + mmcr0 = ebb_switch_in(ebb, cpuhw->mmcr[0]); + mb(); - write_mmcr0(cpuhw, cpuhw->mmcr[0]); + write_mmcr0(cpuhw, mmcr0); /* * Enable instruction sampling if necessary @@ -1124,6 +1245,8 @@ static int power_pmu_add(struct perf_event *event, int ef_flags) event->hw.config = cpuhw->events[n0]; nocheck: + ebb_event_add(event); + ++cpuhw->n_events; ++cpuhw->n_added; @@ -1484,6 +1607,11 @@ static int power_pmu_event_init(struct perf_event *event) } } + /* Extra checks for EBB */ + err = ebb_event_check(event); + if (err) + return err; + /* * If this is in a group, check if it can go on with all the * other hardware events in the group. We assume the event @@ -1522,6 +1650,13 @@ static int power_pmu_event_init(struct perf_event *event) event->hw.last_period = event->hw.sample_period; local64_set(&event->hw.period_left, event->hw.last_period); + /* + * For EBB events we just context switch the PMC value, we don't do any + * of the sample_period logic. We use hw.prev_count for this. + */ + if (is_ebb_event(event)) + local64_set(&event->hw.prev_count, 0); + /* * See if we need to reserve the PMU. * If no events are currently in use, then we have to take a From 4df489991182d3a9337c0a4b1563077c0004f1ba Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 28 Jun 2013 18:15:17 +1000 Subject: [PATCH 140/156] powerpc/perf: Add power8 EBB support Add logic to the power8 PMU code to support EBB. Future processors would also be expected to implement similar constraints. At that time we could possibly factor these out into common code. Finally mark the power8 PMU as supporting EBB, which is the actual enable switch which allows EBBs to be configured. Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/perf/power8-pmu.c | 45 +++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c index d59f5b2d4c2f..96a64d6a8bdf 100644 --- a/arch/powerpc/perf/power8-pmu.c +++ b/arch/powerpc/perf/power8-pmu.c @@ -31,9 +31,9 @@ * * 60 56 52 48 44 40 36 32 * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - * [ thresh_cmp ] [ thresh_ctl ] - * | - * thresh start/stop OR FAB match -* + * | [ thresh_cmp ] [ thresh_ctl ] + * | | + * *- EBB (Linux) thresh start/stop OR FAB match -* * * 28 24 20 16 12 8 4 0 * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | @@ -85,6 +85,7 @@ * */ +#define EVENT_EBB_MASK 1ull #define EVENT_THR_CMP_SHIFT 40 /* Threshold CMP value */ #define EVENT_THR_CMP_MASK 0x3ff #define EVENT_THR_CTL_SHIFT 32 /* Threshold control value (start/stop) */ @@ -117,6 +118,7 @@ (EVENT_UNIT_MASK << EVENT_UNIT_SHIFT) | \ (EVENT_COMBINE_MASK << EVENT_COMBINE_SHIFT) | \ (EVENT_MARKED_MASK << EVENT_MARKED_SHIFT) | \ + (EVENT_EBB_MASK << EVENT_CONFIG_EBB_SHIFT) | \ EVENT_PSEL_MASK) /* MMCRA IFM bits - POWER8 */ @@ -140,10 +142,10 @@ * * 28 24 20 16 12 8 4 0 * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - * [ ] [ sample ] [ ] [6] [5] [4] [3] [2] [1] - * | | - * L1 I/D qualifier -* | Count of events for each PMC. - * | p1, p2, p3, p4, p5, p6. + * | [ ] [ sample ] [ ] [6] [5] [4] [3] [2] [1] + * EBB -* | | + * | | Count of events for each PMC. + * L1 I/D qualifier -* | p1, p2, p3, p4, p5, p6. * nc - number of counters -* * * The PMC fields P1..P6, and NC, are adder fields. As we accumulate constraints @@ -159,6 +161,9 @@ #define CNST_THRESH_VAL(v) (((v) & EVENT_THRESH_MASK) << 32) #define CNST_THRESH_MASK CNST_THRESH_VAL(EVENT_THRESH_MASK) +#define CNST_EBB_VAL(v) (((v) & EVENT_EBB_MASK) << 24) +#define CNST_EBB_MASK CNST_EBB_VAL(EVENT_EBB_MASK) + #define CNST_L1_QUAL_VAL(v) (((v) & 3) << 22) #define CNST_L1_QUAL_MASK CNST_L1_QUAL_VAL(3) @@ -217,7 +222,7 @@ static inline bool event_is_fab_match(u64 event) static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp) { - unsigned int unit, pmc, cache; + unsigned int unit, pmc, cache, ebb; unsigned long mask, value; mask = value = 0; @@ -225,9 +230,13 @@ static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long if (event & ~EVENT_VALID_MASK) return -1; - pmc = (event >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK; - unit = (event >> EVENT_UNIT_SHIFT) & EVENT_UNIT_MASK; - cache = (event >> EVENT_CACHE_SEL_SHIFT) & EVENT_CACHE_SEL_MASK; + pmc = (event >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK; + unit = (event >> EVENT_UNIT_SHIFT) & EVENT_UNIT_MASK; + cache = (event >> EVENT_CACHE_SEL_SHIFT) & EVENT_CACHE_SEL_MASK; + ebb = (event >> EVENT_CONFIG_EBB_SHIFT) & EVENT_EBB_MASK; + + /* Clear the EBB bit in the event, so event checks work below */ + event &= ~(EVENT_EBB_MASK << EVENT_CONFIG_EBB_SHIFT); if (pmc) { if (pmc > 6) @@ -297,6 +306,18 @@ static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long value |= CNST_THRESH_VAL(event >> EVENT_THRESH_SHIFT); } + if (!pmc && ebb) + /* EBB events must specify the PMC */ + return -1; + + /* + * All events must agree on EBB, either all request it or none. + * EBB events are pinned & exclusive, so this should never actually + * hit, but we leave it as a fallback in case. + */ + mask |= CNST_EBB_VAL(ebb); + value |= CNST_EBB_MASK; + *maskp = mask; *valp = value; @@ -591,7 +612,7 @@ static struct power_pmu power8_pmu = { .get_constraint = power8_get_constraint, .get_alternatives = power8_get_alternatives, .disable_pmc = power8_disable_pmc, - .flags = PPMU_HAS_SSLOT | PPMU_HAS_SIER | PPMU_BHRB, + .flags = PPMU_HAS_SSLOT | PPMU_HAS_SIER | PPMU_BHRB | PPMU_EBB, .n_generic = ARRAY_SIZE(power8_generic_events), .generic_events = power8_generic_events, .attr_groups = power8_pmu_attr_groups, From 6e0b8bc965d25a8e0701eaca3fca5941b4f4b2b2 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 28 Jun 2013 18:15:18 +1000 Subject: [PATCH 141/156] powerpc/pseries: Inform the hypervisor we are using EBB regs On LPAR systems we need to inform the hypervisor that we are using the EBB registers. We do this by setting a bit in the Virtual Processor Area (VPA) - formerly known as the lppaca. For now we do this always, ie. we do not dynamically enable/disable. Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/lppaca.h | 3 ++- arch/powerpc/platforms/pseries/lpar.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h index b1e7f2af1016..9b12f88d4adb 100644 --- a/arch/powerpc/include/asm/lppaca.h +++ b/arch/powerpc/include/asm/lppaca.h @@ -66,7 +66,8 @@ struct lppaca { u8 reserved6[48]; u8 cede_latency_hint; - u8 reserved7[7]; + u8 ebb_regs_in_use; + u8 reserved7[6]; u8 dtl_enable_mask; /* Dispatch Trace Log mask */ u8 donate_dedicated_cpu; /* Donate dedicated CPU cycles */ u8 fpregs_in_use; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index fd0f2f2a9b90..02d6e21619bb 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -71,6 +71,9 @@ void vpa_init(int cpu) if (cpu_has_feature(CPU_FTR_ALTIVEC)) lppaca_of(cpu).vmxregs_in_use = 1; + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + lppaca_of(cpu).ebb_regs_in_use = 1; + addr = __pa(&lppaca_of(cpu)); ret = register_vpa(hwcpu, addr); From 74251fe21bfa9310ddba9e0436d1fcf389e602ee Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 1 Jul 2013 17:54:09 +1000 Subject: [PATCH 142/156] powerpc/powernv: Fix iommu initialization again So because those things always end up in trainwrecks... In 7846de406 we moved back the iommu initialization earlier, essentially undoing 37f02195b which was causing us endless trouble... except that in the meantime we had merged 959c9bdd58 (to workaround the original breakage) which is now ... broken :-) This fixes it by doing a partial revert of the latter (we keep the ppc_md. path which will be needed in the hotplug case, which happens also during some EEH error recovery situations). Signed-off-by: Benjamin Herrenschmidt CC: [v3.10] --- arch/powerpc/platforms/powernv/pci-ioda.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index c393bf59f113..49b57b9f835d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -443,6 +443,17 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev set_iommu_table_base(&pdev->dev, &pe->tce32_table); } +static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + set_iommu_table_base(&dev->dev, &pe->tce32_table); + if (dev->subordinate) + pnv_ioda_setup_bus_dma(pe, dev->subordinate); + } +} + static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, u64 *startp, u64 *endp) { @@ -599,6 +610,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, iommu_init_table(tbl, phb->hose->node); iommu_register_group(tbl, pci_domain_nr(pe->pbus), pe->pe_number); + if (pe->pdev) + set_iommu_table_base(&pe->pdev->dev, tbl); + else + pnv_ioda_setup_bus_dma(pe, pe->pbus); + return; fail: /* XXX Failure: Try to fallback to 64-bit only ? */ @@ -670,6 +686,11 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, } iommu_init_table(tbl, phb->hose->node); + if (pe->pdev) + set_iommu_table_base(&pe->pdev->dev, tbl); + else + pnv_ioda_setup_bus_dma(pe, pe->pbus); + return; fail: if (pe->tce32_seg >= 0) From 6bbbca735936e15b9431882eceddcf6dff76e03c Mon Sep 17 00:00:00 2001 From: Aruna Balakrishnaiah Date: Thu, 27 Jun 2013 14:02:56 +0530 Subject: [PATCH 143/156] pstore: Pass header size in the pstore write callback Header size is needed to distinguish between header and the dump data. Incorporate the addition of new argument (hsize) in the pstore write callback. Signed-off-by: Aruna Balakrishnaiah Acked-by: Kees Cook Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/nvram.c | 4 +++- drivers/acpi/apei/erst.c | 4 ++-- drivers/firmware/efi/efi-pstore.c | 2 +- fs/pstore/platform.c | 10 ++++++---- fs/pstore/ram.c | 3 ++- include/linux/pstore.h | 8 ++++---- 6 files changed, 18 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 14cc486709f6..3f0e7d67d747 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -502,6 +502,7 @@ static int nvram_pstore_open(struct pstore_info *psi) * @part: pstore writes data to registered buffer in parts, * part number will indicate the same. * @count: Indicates oops count + * @hsize: Size of header added by pstore * @size: number of bytes written to the registered buffer * @psi: registered pstore_info structure * @@ -512,7 +513,8 @@ static int nvram_pstore_open(struct pstore_info *psi) static int nvram_pstore_write(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, unsigned int part, int count, - size_t size, struct pstore_info *psi) + size_t hsize, size_t size, + struct pstore_info *psi) { int rc; struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf; diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 6d894bfd8b8f..a9cf96085f85 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -935,7 +935,7 @@ static ssize_t erst_reader(u64 *id, enum pstore_type_id *type, int *count, struct timespec *time, char **buf, struct pstore_info *psi); static int erst_writer(enum pstore_type_id type, enum kmsg_dump_reason reason, - u64 *id, unsigned int part, int count, + u64 *id, unsigned int part, int count, size_t hsize, size_t size, struct pstore_info *psi); static int erst_clearer(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi); @@ -1055,7 +1055,7 @@ out: } static int erst_writer(enum pstore_type_id type, enum kmsg_dump_reason reason, - u64 *id, unsigned int part, int count, + u64 *id, unsigned int part, int count, size_t hsize, size_t size, struct pstore_info *psi) { struct cper_pstore_record *rcd = (struct cper_pstore_record *) diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index 202d2c85ba2e..452800e005b6 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -104,7 +104,7 @@ static ssize_t efi_pstore_read(u64 *id, enum pstore_type_id *type, static int efi_pstore_write(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, - unsigned int part, int count, size_t size, + unsigned int part, int count, size_t hsize, size_t size, struct pstore_info *psi) { char name[DUMP_NAME_LEN]; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 86d1038b5a12..4637ec4169cd 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -159,7 +159,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, break; ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, - oopscount, hsize + len, psinfo); + oopscount, hsize, hsize + len, psinfo); if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; @@ -196,7 +196,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) spin_lock_irqsave(&psinfo->buf_lock, flags); } memcpy(psinfo->buf, s, c); - psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo); + psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; @@ -221,9 +221,11 @@ static void pstore_register_console(void) {} static int pstore_write_compat(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, unsigned int part, int count, - size_t size, struct pstore_info *psi) + size_t hsize, size_t size, + struct pstore_info *psi) { - return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi); + return psi->write_buf(type, reason, id, part, psinfo->buf, hsize, + size, psi); } /* diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 1376e5a8f0d6..c6bb77ca35b5 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -195,7 +195,8 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, unsigned int part, - const char *buf, size_t size, + const char *buf, + size_t hsize, size_t size, struct pstore_info *psi) { struct ramoops_context *cxt = psi->data; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 656699fcc7d7..4aa80ba830a2 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -58,12 +58,12 @@ struct pstore_info { struct pstore_info *psi); int (*write)(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, - unsigned int part, int count, size_t size, - struct pstore_info *psi); + unsigned int part, int count, size_t hsize, + size_t size, struct pstore_info *psi); int (*write_buf)(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, - unsigned int part, const char *buf, size_t size, - struct pstore_info *psi); + unsigned int part, const char *buf, size_t hsize, + size_t size, struct pstore_info *psi); int (*erase)(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi); From fbfe86fc0c53911cf243479f7d26c37dcb1243f1 Mon Sep 17 00:00:00 2001 From: Aruna Balakrishnaiah Date: Thu, 27 Jun 2013 14:03:04 +0530 Subject: [PATCH 144/156] powerpc/pseries: Re-organise the oops compression code nvram_compress() and zip_oops() is used by the nvram_pstore_write API to compress oops messages hence re-organise the functions accordingly to avoid forward declarations. Signed-off-by: Aruna Balakrishnaiah Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/nvram.c | 104 ++++++++++++------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 3f0e7d67d747..588bab5da8cb 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -486,6 +486,58 @@ static int clobbering_unread_rtas_event(void) NVRAM_RTAS_READ_TIMEOUT); } +/* Derived from logfs_compress() */ +static int nvram_compress(const void *in, void *out, size_t inlen, + size_t outlen) +{ + int err, ret; + + ret = -EIO; + err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS, + MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_deflateEnd(&stream); + if (err != Z_OK) + goto error; + + if (stream.total_out >= stream.total_in) + goto error; + + ret = stream.total_out; +error: + return ret; +} + +/* Compress the text from big_oops_buf into oops_buf. */ +static int zip_oops(size_t text_len) +{ + struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; + int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len, + oops_data_sz); + if (zipped_len < 0) { + pr_err("nvram: compression failed; returned %d\n", zipped_len); + pr_err("nvram: logging uncompressed oops/panic report\n"); + return -1; + } + oops_hdr->version = OOPS_HDR_VERSION; + oops_hdr->report_length = (u16) zipped_len; + oops_hdr->timestamp = get_seconds(); + return 0; +} + #ifdef CONFIG_PSTORE static int nvram_pstore_open(struct pstore_info *psi) { @@ -759,58 +811,6 @@ int __init pSeries_nvram_init(void) } -/* Derived from logfs_compress() */ -static int nvram_compress(const void *in, void *out, size_t inlen, - size_t outlen) -{ - int err, ret; - - ret = -EIO; - err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS, - MEM_LEVEL, Z_DEFAULT_STRATEGY); - if (err != Z_OK) - goto error; - - stream.next_in = in; - stream.avail_in = inlen; - stream.total_in = 0; - stream.next_out = out; - stream.avail_out = outlen; - stream.total_out = 0; - - err = zlib_deflate(&stream, Z_FINISH); - if (err != Z_STREAM_END) - goto error; - - err = zlib_deflateEnd(&stream); - if (err != Z_OK) - goto error; - - if (stream.total_out >= stream.total_in) - goto error; - - ret = stream.total_out; -error: - return ret; -} - -/* Compress the text from big_oops_buf into oops_buf. */ -static int zip_oops(size_t text_len) -{ - struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; - int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len, - oops_data_sz); - if (zipped_len < 0) { - pr_err("nvram: compression failed; returned %d\n", zipped_len); - pr_err("nvram: logging uncompressed oops/panic report\n"); - return -1; - } - oops_hdr->version = OOPS_HDR_VERSION; - oops_hdr->report_length = (u16) zipped_len; - oops_hdr->timestamp = get_seconds(); - return 0; -} - /* * This is our kmsg_dump callback, called after an oops or panic report * has been written to the printk buffer. We want to capture as much From 40847e56609c21692ebe593abbf72c1e7f4af206 Mon Sep 17 00:00:00 2001 From: Aruna Balakrishnaiah Date: Thu, 27 Jun 2013 14:03:13 +0530 Subject: [PATCH 145/156] powerpc/pseries: Support compression of oops text via pstore The patch set supports compression of oops messages while writing to NVRAM, this helps in capturing more of oops data to lnx,oops-log. The pstore file for oops messages will be in decompressed format making it readable. In case compression fails, the patch takes care of copying the header added by pstore and last oops_data_sz bytes of big_oops_buf to NVRAM so that we have recent oops messages in lnx,oops-log. In case decompression fails, it will result in absence of oops file but still have files (in /dev/pstore) for other partitions. Signed-off-by: Aruna Balakrishnaiah Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/nvram.c | 131 ++++++++++++++++++++++--- 1 file changed, 117 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 588bab5da8cb..9f8671a44551 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -539,6 +539,65 @@ static int zip_oops(size_t text_len) } #ifdef CONFIG_PSTORE +/* Derived from logfs_uncompress */ +int nvram_decompress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + err = zlib_inflateInit(&stream); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_inflateEnd(&stream); + if (err != Z_OK) + goto error; + + ret = stream.total_out; +error: + return ret; +} + +static int unzip_oops(char *oops_buf, char *big_buf) +{ + struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; + u64 timestamp = oops_hdr->timestamp; + char *big_oops_data = NULL; + char *oops_data_buf = NULL; + size_t big_oops_data_sz; + int unzipped_len; + + big_oops_data = big_buf + sizeof(struct oops_log_info); + big_oops_data_sz = big_oops_buf_sz - sizeof(struct oops_log_info); + oops_data_buf = oops_buf + sizeof(struct oops_log_info); + + unzipped_len = nvram_decompress(oops_data_buf, big_oops_data, + oops_hdr->report_length, + big_oops_data_sz); + + if (unzipped_len < 0) { + pr_err("nvram: decompression failed; returned %d\n", + unzipped_len); + return -1; + } + oops_hdr = (struct oops_log_info *)big_buf; + oops_hdr->version = OOPS_HDR_VERSION; + oops_hdr->report_length = (u16) unzipped_len; + oops_hdr->timestamp = timestamp; + return 0; +} + static int nvram_pstore_open(struct pstore_info *psi) { /* Reset the iterator to start reading partitions again */ @@ -569,6 +628,7 @@ static int nvram_pstore_write(enum pstore_type_id type, struct pstore_info *psi) { int rc; + unsigned int err_type = ERR_TYPE_KERNEL_PANIC; struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf; /* part 1 has the recent messages from printk buffer */ @@ -579,8 +639,30 @@ static int nvram_pstore_write(enum pstore_type_id type, oops_hdr->version = OOPS_HDR_VERSION; oops_hdr->report_length = (u16) size; oops_hdr->timestamp = get_seconds(); + + if (big_oops_buf) { + rc = zip_oops(size); + /* + * If compression fails copy recent log messages from + * big_oops_buf to oops_data. + */ + if (rc != 0) { + size_t diff = size - oops_data_sz + hsize; + + if (size > oops_data_sz) { + memcpy(oops_data, big_oops_buf, hsize); + memcpy(oops_data + hsize, big_oops_buf + diff, + oops_data_sz - hsize); + + oops_hdr->report_length = (u16) oops_data_sz; + } else + memcpy(oops_data, big_oops_buf, size); + } else + err_type = ERR_TYPE_KERNEL_PANIC_GZ; + } + rc = nvram_write_os_partition(&oops_log_partition, oops_buf, - (int) (sizeof(*oops_hdr) + size), ERR_TYPE_KERNEL_PANIC, + (int) (sizeof(*oops_hdr) + oops_hdr->report_length), err_type, count); if (rc != 0) @@ -602,10 +684,11 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, struct oops_log_info *oops_hdr; unsigned int err_type, id_no, size = 0; struct nvram_os_partition *part = NULL; - char *buff = NULL; - int sig = 0; + char *buff = NULL, *big_buff = NULL; + int rc, sig = 0; loff_t p; +read_partition: read_type++; switch (nvram_type_ids[read_type]) { @@ -668,6 +751,25 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) { oops_hdr = (struct oops_log_info *)buff; *buf = buff + sizeof(*oops_hdr); + + if (err_type == ERR_TYPE_KERNEL_PANIC_GZ) { + big_buff = kmalloc(big_oops_buf_sz, GFP_KERNEL); + if (!big_buff) + return -ENOMEM; + + rc = unzip_oops(buff, big_buff); + + if (rc != 0) { + kfree(buff); + kfree(big_buff); + goto read_partition; + } + + oops_hdr = (struct oops_log_info *)big_buff; + *buf = big_buff + sizeof(*oops_hdr); + kfree(buff); + } + time->tv_sec = oops_hdr->timestamp; time->tv_nsec = 0; return oops_hdr->report_length; @@ -689,17 +791,18 @@ static int nvram_pstore_init(void) { int rc = 0; - nvram_pstore_info.buf = oops_data; - nvram_pstore_info.bufsize = oops_data_sz; + if (big_oops_buf) { + nvram_pstore_info.buf = big_oops_buf; + nvram_pstore_info.bufsize = big_oops_buf_sz; + } else { + nvram_pstore_info.buf = oops_data; + nvram_pstore_info.bufsize = oops_data_sz; + } rc = pstore_register(&nvram_pstore_info); if (rc != 0) pr_err("nvram: pstore_register() failed, defaults to " "kmsg_dump; returned %d\n", rc); - else - /*TODO: Support compression when pstore is configured */ - pr_info("nvram: Compression of oops text supported only when " - "pstore is not configured"); return rc; } @@ -733,11 +836,6 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists) oops_data = oops_buf + sizeof(struct oops_log_info); oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info); - rc = nvram_pstore_init(); - - if (!rc) - return; - /* * Figure compression (preceded by elimination of each line's * severity prefix) will reduce the oops/panic report to at most @@ -761,6 +859,11 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists) stream.workspace = NULL; } + rc = nvram_pstore_init(); + + if (!rc) + return; + rc = kmsg_dump_register(&nvram_kmsg_dumper); if (rc != 0) { pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc); From e2a800beaca1f580945773e57d1a0e7cd37b1056 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Mon, 1 Jul 2013 14:19:50 +1000 Subject: [PATCH 146/156] powerpc/hw_brk: Fix off by one error when validating DAWR region end The Data Address Watchpoint Register (DAWR) on POWER8 can take a 512 byte range but this range must not cross a 512 byte boundary. Unfortunately we were off by one when calculating the end of the region, hence we were not allowing some breakpoint regions which were actually valid. This fixes this error. Signed-off-by: Michael Neuling Reported-by: Edjunior Barbosa Machado Cc: stable@vger.kernel.org # 3.9+ Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 1150ae7c22c3..f0b47d1a6b0e 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -176,7 +176,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) length_max = 512 ; /* 64 doublewords */ /* DAWR region can't cross 512 boundary */ if ((bp->attr.bp_addr >> 10) != - ((bp->attr.bp_addr + bp->attr.bp_len) >> 10)) + ((bp->attr.bp_addr + bp->attr.bp_len - 1) >> 10)) return -EINVAL; } if (info->len > From c039e3a8ddd52139d0f81711ecd757772f868b22 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 2 Jul 2013 08:13:52 +1000 Subject: [PATCH 147/156] powerpc: Handle both new style and old style reserve maps When Jeremy introduced the new device-tree based reserve map, he made the code in early_reserve_mem_dt() bail out if it found one, thus not reserving the initrd nor processing the old style map. I hit problems with variants of kexec that didn't put the initrd in the new style map either. While these could/will be fixed, I believe we should be safe here and rather reserve more than not enough. We could have a firmware passing stuff via the new style map, and in the middle, a kexec that knows nothing about it and adding other things to the old style map. I don't see a big issue with processing both and reserving everything that needs to be. memblock_reserve() supports overlaps fine these days. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/prom.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 9c753bc9885d..eb23ac92abb9 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -559,7 +559,7 @@ void __init early_init_dt_setup_initrd_arch(unsigned long start, } #endif -static bool __init early_reserve_mem_dt(void) +static void __init early_reserve_mem_dt(void) { unsigned long i, len, dt_root; const __be32 *prop; @@ -569,7 +569,9 @@ static bool __init early_reserve_mem_dt(void) prop = of_get_flat_dt_prop(dt_root, "reserved-ranges", &len); if (!prop) - return false; + return; + + DBG("Found new-style reserved-ranges\n"); /* Each reserved range is an (address,size) pair, 2 cells each, * totalling 4 cells per range. */ @@ -579,11 +581,11 @@ static bool __init early_reserve_mem_dt(void) base = of_read_number(prop + (i * 4) + 0, 2); size = of_read_number(prop + (i * 4) + 2, 2); - if (size) + if (size) { + DBG("reserving: %llx -> %llx\n", base, size); memblock_reserve(base, size); + } } - - return true; } static void __init early_reserve_mem(void) @@ -601,20 +603,16 @@ static void __init early_reserve_mem(void) self_size = initial_boot_params->totalsize; memblock_reserve(self_base, self_size); - /* - * Try looking for reserved-regions property in the DT first; if - * it's present, it'll contain all of the necessary reservation - * info - */ - if (early_reserve_mem_dt()) - return; + /* Look for the new "reserved-regions" property in the DT */ + early_reserve_mem_dt(); #ifdef CONFIG_BLK_DEV_INITRD - /* then reserve the initrd, if any */ - if (initrd_start && (initrd_end > initrd_start)) + /* Then reserve the initrd, if any */ + if (initrd_start && (initrd_end > initrd_start)) { memblock_reserve(_ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE), _ALIGN_UP(initrd_end, PAGE_SIZE) - _ALIGN_DOWN(initrd_start, PAGE_SIZE)); + } #endif /* CONFIG_BLK_DEV_INITRD */ #ifdef CONFIG_PPC32 @@ -626,6 +624,8 @@ static void __init early_reserve_mem(void) u32 base_32, size_32; u32 *reserve_map_32 = (u32 *)reserve_map; + DBG("Found old 32-bit reserve map\n"); + while (1) { base_32 = *(reserve_map_32++); size_32 = *(reserve_map_32++); @@ -640,6 +640,9 @@ static void __init early_reserve_mem(void) return; } #endif + DBG("Processing reserve map\n"); + + /* Handle the reserve map in the fdt blob if it exists */ while (1) { base = *(reserve_map++); size = *(reserve_map++); From 86d379690c3b005418fafc9afdcdfc731a043862 Mon Sep 17 00:00:00 2001 From: Hongtao Jia Date: Wed, 10 Apr 2013 10:52:55 +0800 Subject: [PATCH 148/156] powerpc/mpic: Add get_version API both for internal and external use MPIC version is useful information for both mpic_alloc() and mpic_init(). The patch provide an API to get MPIC version for reusing the code. Also, some other IP block may need MPIC version for their own use. The API for external use is also provided. Signed-off-by: Jia Hongtao Signed-off-by: Li Yang Signed-off-by: Scott Wood --- arch/powerpc/include/asm/mpic.h | 3 +++ arch/powerpc/sysdev/mpic.c | 32 +++++++++++++++++++++++++------- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/mpic.h b/arch/powerpc/include/asm/mpic.h index c0f9ef90f0b8..ea6bf7220da9 100644 --- a/arch/powerpc/include/asm/mpic.h +++ b/arch/powerpc/include/asm/mpic.h @@ -393,6 +393,9 @@ struct mpic #define MPIC_REGSET_STANDARD MPIC_REGSET(0) /* Original MPIC */ #define MPIC_REGSET_TSI108 MPIC_REGSET(1) /* Tsi108/109 PIC */ +/* Get the version of primary MPIC */ +extern u32 fsl_mpic_primary_get_version(void); + /* Allocate the controller structure and setup the linux irq descs * for the range if interrupts passed in. No HW initialization is * actually performed. diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 3cc2f9159ab1..1a4e19c6a688 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -1173,10 +1173,33 @@ static struct irq_domain_ops mpic_host_ops = { .xlate = mpic_host_xlate, }; +static u32 fsl_mpic_get_version(struct mpic *mpic) +{ + u32 brr1; + + if (!(mpic->flags & MPIC_FSL)) + return 0; + + brr1 = _mpic_read(mpic->reg_type, &mpic->thiscpuregs, + MPIC_FSL_BRR1); + + return brr1 & MPIC_FSL_BRR1_VER; +} + /* * Exported functions */ +u32 fsl_mpic_primary_get_version(void) +{ + struct mpic *mpic = mpic_primary; + + if (mpic) + return fsl_mpic_get_version(mpic); + + return 0; +} + struct mpic * __init mpic_alloc(struct device_node *node, phys_addr_t phys_addr, unsigned int flags, @@ -1323,7 +1346,6 @@ struct mpic * __init mpic_alloc(struct device_node *node, mpic_map(mpic, mpic->paddr, &mpic->tmregs, MPIC_INFO(TIMER_BASE), 0x1000); if (mpic->flags & MPIC_FSL) { - u32 brr1; int ret; /* @@ -1334,9 +1356,7 @@ struct mpic * __init mpic_alloc(struct device_node *node, mpic_map(mpic, mpic->paddr, &mpic->thiscpuregs, MPIC_CPU_THISBASE, 0x1000); - brr1 = _mpic_read(mpic->reg_type, &mpic->thiscpuregs, - MPIC_FSL_BRR1); - fsl_version = brr1 & MPIC_FSL_BRR1_VER; + fsl_version = fsl_mpic_get_version(mpic); /* Error interrupt mask register (EIMR) is required for * handling individual device error interrupts. EIMR @@ -1526,9 +1546,7 @@ void __init mpic_init(struct mpic *mpic) mpic_cpu_write(MPIC_INFO(CPU_CURRENT_TASK_PRI), 0xf); if (mpic->flags & MPIC_FSL) { - u32 brr1 = _mpic_read(mpic->reg_type, &mpic->thiscpuregs, - MPIC_FSL_BRR1); - u32 version = brr1 & MPIC_FSL_BRR1_VER; + u32 version = fsl_mpic_get_version(mpic); /* * Timer group B is present at the latest in MPIC 3.1 (e.g. From 2dd1c132d5c22677843658f5736f0de1cdf57bc1 Mon Sep 17 00:00:00 2001 From: Chunhe Lan Date: Fri, 19 Apr 2013 16:13:18 +0800 Subject: [PATCH 149/156] powerpc/fsl: Enable CONFIG_E1000E in mpc85xx_smp_defconfig On the most boards of Freescale platform, they use the PCI-Express Intel(R) PRO/1000 gigabit ethernet card to work. So enable the corresponding driver for it. Signed-off-by: Chunhe Lan Signed-off-by: Scott Wood --- arch/powerpc/configs/mpc85xx_smp_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/configs/mpc85xx_smp_defconfig b/arch/powerpc/configs/mpc85xx_smp_defconfig index 165e6b32baef..152fa05b15e4 100644 --- a/arch/powerpc/configs/mpc85xx_smp_defconfig +++ b/arch/powerpc/configs/mpc85xx_smp_defconfig @@ -131,6 +131,7 @@ CONFIG_DUMMY=y CONFIG_FS_ENET=y CONFIG_UCC_GETH=y CONFIG_GIANFAR=y +CONFIG_E1000E=y CONFIG_MARVELL_PHY=y CONFIG_DAVICOM_PHY=y CONFIG_CICADA_PHY=y From 7601f59765a48bef329a429101eabcb0e2503634 Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Thu, 18 Apr 2013 07:26:11 +0200 Subject: [PATCH 150/156] powerpc/8xx: Erroneous double irq_eoi() on CPM IRQ in MPC8xx irq_eoi() is already called by generic_handle_irq() so it shall not be called a again Signed-off-by: Christophe Leroy Signed-off-by: Scott Wood --- arch/powerpc/platforms/8xx/m8xx_setup.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c b/arch/powerpc/platforms/8xx/m8xx_setup.c index 806cbbd86ec6..587a2828b06c 100644 --- a/arch/powerpc/platforms/8xx/m8xx_setup.c +++ b/arch/powerpc/platforms/8xx/m8xx_setup.c @@ -219,19 +219,12 @@ void mpc8xx_restart(char *cmd) static void cpm_cascade(unsigned int irq, struct irq_desc *desc) { - struct irq_chip *chip; - int cascade_irq; - - if ((cascade_irq = cpm_get_irq()) >= 0) { - struct irq_desc *cdesc = irq_to_desc(cascade_irq); + struct irq_chip *chip = irq_desc_get_chip(desc); + int cascade_irq = cpm_get_irq(); + if (cascade_irq >= 0) generic_handle_irq(cascade_irq); - chip = irq_desc_get_chip(cdesc); - chip->irq_eoi(&cdesc->irq_data); - } - - chip = irq_desc_get_chip(desc); chip->irq_eoi(&desc->irq_data); } From 9837b43c5f3514e5d28f65f1513f4dc6759d2810 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 11 Apr 2013 09:32:34 +0800 Subject: [PATCH 151/156] powerpc/85xx: enable coreint for all the 64bit boards With the patch 7230c564 (powerpc: Rework lazy-interrupt handling), it seems that the coreint works pretty well on the 85xx 64bit kernel. So use the coreint by default for these boards. Signed-off-by: Kevin Hao Signed-off-by: Scott Wood --- arch/powerpc/platforms/85xx/p5020_ds.c | 5 ----- arch/powerpc/platforms/85xx/p5040_ds.c | 5 ----- arch/powerpc/platforms/85xx/t4240_qds.c | 5 ----- 3 files changed, 15 deletions(-) diff --git a/arch/powerpc/platforms/85xx/p5020_ds.c b/arch/powerpc/platforms/85xx/p5020_ds.c index 753a42c29d4d..39cfa4044e6c 100644 --- a/arch/powerpc/platforms/85xx/p5020_ds.c +++ b/arch/powerpc/platforms/85xx/p5020_ds.c @@ -75,12 +75,7 @@ define_machine(p5020_ds) { #ifdef CONFIG_PCI .pcibios_fixup_bus = fsl_pcibios_fixup_bus, #endif -/* coreint doesn't play nice with lazy EE, use legacy mpic for now */ -#ifdef CONFIG_PPC64 - .get_irq = mpic_get_irq, -#else .get_irq = mpic_get_coreint_irq, -#endif .restart = fsl_rstcr_restart, .calibrate_decr = generic_calibrate_decr, .progress = udbg_progress, diff --git a/arch/powerpc/platforms/85xx/p5040_ds.c b/arch/powerpc/platforms/85xx/p5040_ds.c index 11381851828e..f70e74cddf97 100644 --- a/arch/powerpc/platforms/85xx/p5040_ds.c +++ b/arch/powerpc/platforms/85xx/p5040_ds.c @@ -66,12 +66,7 @@ define_machine(p5040_ds) { #ifdef CONFIG_PCI .pcibios_fixup_bus = fsl_pcibios_fixup_bus, #endif -/* coreint doesn't play nice with lazy EE, use legacy mpic for now */ -#ifdef CONFIG_PPC64 - .get_irq = mpic_get_irq, -#else .get_irq = mpic_get_coreint_irq, -#endif .restart = fsl_rstcr_restart, .calibrate_decr = generic_calibrate_decr, .progress = udbg_progress, diff --git a/arch/powerpc/platforms/85xx/t4240_qds.c b/arch/powerpc/platforms/85xx/t4240_qds.c index 5998e9f33304..91ead6b1b8af 100644 --- a/arch/powerpc/platforms/85xx/t4240_qds.c +++ b/arch/powerpc/platforms/85xx/t4240_qds.c @@ -75,12 +75,7 @@ define_machine(t4240_qds) { #ifdef CONFIG_PCI .pcibios_fixup_bus = fsl_pcibios_fixup_bus, #endif -/* coreint doesn't play nice with lazy EE, use legacy mpic for now */ -#ifdef CONFIG_PPC64 - .get_irq = mpic_get_irq, -#else .get_irq = mpic_get_coreint_irq, -#endif .restart = fsl_rstcr_restart, .calibrate_decr = generic_calibrate_decr, .progress = udbg_progress, From 5ff04b7287d87c1db74f47360365905ed9a97ff7 Mon Sep 17 00:00:00 2001 From: "Dongsheng.wang@freescale.com" Date: Tue, 9 Apr 2013 10:22:29 +0800 Subject: [PATCH 152/156] powerpc/mpic: add irq_set_wake support Add irq_set_wake support. Just add IRQF_NO_SUSPEND to desc->action->flag. So the wake up interrupt will not be disable in suspend_device_irqs. Signed-off-by: Wang Dongsheng Signed-off-by: Scott Wood --- arch/powerpc/sysdev/mpic.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 1a4e19c6a688..4635d11f2dc2 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -920,6 +920,22 @@ int mpic_set_irq_type(struct irq_data *d, unsigned int flow_type) return IRQ_SET_MASK_OK_NOCOPY; } +static int mpic_irq_set_wake(struct irq_data *d, unsigned int on) +{ + struct irq_desc *desc = container_of(d, struct irq_desc, irq_data); + struct mpic *mpic = mpic_from_irq_data(d); + + if (!(mpic->flags & MPIC_FSL)) + return -ENXIO; + + if (on) + desc->action->flags |= IRQF_NO_SUSPEND; + else + desc->action->flags &= ~IRQF_NO_SUSPEND; + + return 0; +} + void mpic_set_vector(unsigned int virq, unsigned int vector) { struct mpic *mpic = mpic_from_irq(virq); @@ -957,6 +973,7 @@ static struct irq_chip mpic_irq_chip = { .irq_unmask = mpic_unmask_irq, .irq_eoi = mpic_end_irq, .irq_set_type = mpic_set_irq_type, + .irq_set_wake = mpic_irq_set_wake, }; #ifdef CONFIG_SMP @@ -971,6 +988,7 @@ static struct irq_chip mpic_tm_chip = { .irq_mask = mpic_mask_tm, .irq_unmask = mpic_unmask_tm, .irq_eoi = mpic_end_irq, + .irq_set_wake = mpic_irq_set_wake, }; #ifdef CONFIG_MPIC_U3_HT_IRQS From 36ca09be6ff77a4e5b54b8b68ed7be7aa184250b Mon Sep 17 00:00:00 2001 From: "Dongsheng.wang@freescale.com" Date: Tue, 9 Apr 2013 10:22:30 +0800 Subject: [PATCH 153/156] powerpc/mpic: add global timer support The MPIC global timer is a hardware timer inside the Freescale PIC complying with OpenPIC standard. When the specified interval times out, the hardware timer generates an interrupt. The driver currently is only tested on fsl chip, but it can potentially support other global timers complying to OpenPIC standard. The two independent groups of global timer on fsl chip, group A and group B, are identical in their functionality, except that they appear at different locations within the PIC register map. The hardware timer can be cascaded to create timers larger than the default 31-bit global timers. Timer cascade fields allow configuration of up to two 63-bit timers. But These two groups of timers cannot be cascaded together. It can be used as a wakeup source for low power modes. It also could be used as periodical timer for protocols, drivers and etc. Signed-off-by: Wang Dongsheng Signed-off-by: Li Yang Signed-off-by: Scott Wood --- arch/powerpc/include/asm/mpic_timer.h | 46 ++ arch/powerpc/platforms/Kconfig | 12 + arch/powerpc/sysdev/Makefile | 1 + arch/powerpc/sysdev/mpic_timer.c | 593 ++++++++++++++++++++++++++ 4 files changed, 652 insertions(+) create mode 100644 arch/powerpc/include/asm/mpic_timer.h create mode 100644 arch/powerpc/sysdev/mpic_timer.c diff --git a/arch/powerpc/include/asm/mpic_timer.h b/arch/powerpc/include/asm/mpic_timer.h new file mode 100644 index 000000000000..0e23cd4ac8aa --- /dev/null +++ b/arch/powerpc/include/asm/mpic_timer.h @@ -0,0 +1,46 @@ +/* + * arch/powerpc/include/asm/mpic_timer.h + * + * Header file for Mpic Global Timer + * + * Copyright 2013 Freescale Semiconductor, Inc. + * + * Author: Wang Dongsheng + * Li Yang + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#ifndef __MPIC_TIMER__ +#define __MPIC_TIMER__ + +#include +#include + +struct mpic_timer { + void *dev; + struct cascade_priv *cascade_handle; + unsigned int num; + unsigned int irq; +}; + +#ifdef CONFIG_MPIC_TIMER +struct mpic_timer *mpic_request_timer(irq_handler_t fn, void *dev, + const struct timeval *time); +void mpic_start_timer(struct mpic_timer *handle); +void mpic_stop_timer(struct mpic_timer *handle); +void mpic_get_remain_time(struct mpic_timer *handle, struct timeval *time); +void mpic_free_timer(struct mpic_timer *handle); +#else +struct mpic_timer *mpic_request_timer(irq_handler_t fn, void *dev, + const struct timeval *time) { return NULL; } +void mpic_start_timer(struct mpic_timer *handle) { } +void mpic_stop_timer(struct mpic_timer *handle) { } +void mpic_get_remain_time(struct mpic_timer *handle, struct timeval *time) { } +void mpic_free_timer(struct mpic_timer *handle) { } +#endif + +#endif diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index bed8c607588c..7d07d9e8e1dd 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -86,6 +86,18 @@ config MPIC bool default n +config MPIC_TIMER + bool "MPIC Global Timer" + depends on MPIC && FSL_SOC + default n + help + The MPIC global timer is a hardware timer inside the + Freescale PIC complying with OpenPIC standard. When the + specified interval times out, the hardware timer generates + an interrupt. The driver currently is only tested on fsl + chip, but it can potentially support other global timers + complying with the OpenPIC standard. + config PPC_EPAPR_HV_PIC bool default n diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile index 99464a7bdb3b..2eb72c1a4291 100644 --- a/arch/powerpc/sysdev/Makefile +++ b/arch/powerpc/sysdev/Makefile @@ -4,6 +4,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) mpic-msi-obj-$(CONFIG_PCI_MSI) += mpic_msi.o mpic_u3msi.o mpic_pasemi_msi.o obj-$(CONFIG_MPIC) += mpic.o $(mpic-msi-obj-y) +obj-$(CONFIG_MPIC_TIMER) += mpic_timer.o mpic-msgr-obj-$(CONFIG_MPIC_MSGR) += mpic_msgr.o obj-$(CONFIG_MPIC) += mpic.o $(mpic-msi-obj-y) $(mpic-msgr-obj-y) obj-$(CONFIG_PPC_EPAPR_HV_PIC) += ehv_pic.o diff --git a/arch/powerpc/sysdev/mpic_timer.c b/arch/powerpc/sysdev/mpic_timer.c new file mode 100644 index 000000000000..c06db92a4fb1 --- /dev/null +++ b/arch/powerpc/sysdev/mpic_timer.c @@ -0,0 +1,593 @@ +/* + * MPIC timer driver + * + * Copyright 2013 Freescale Semiconductor, Inc. + * Author: Dongsheng Wang + * Li Yang + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define FSL_GLOBAL_TIMER 0x1 + +/* Clock Ratio + * Divide by 64 0x00000300 + * Divide by 32 0x00000200 + * Divide by 16 0x00000100 + * Divide by 8 0x00000000 (Hardware default div) + */ +#define MPIC_TIMER_TCR_CLKDIV 0x00000300 + +#define MPIC_TIMER_TCR_ROVR_OFFSET 24 + +#define TIMER_STOP 0x80000000 +#define TIMERS_PER_GROUP 4 +#define MAX_TICKS (~0U >> 1) +#define MAX_TICKS_CASCADE (~0U) +#define TIMER_OFFSET(num) (1 << (TIMERS_PER_GROUP - 1 - num)) + +/* tv_usec should be less than ONE_SECOND, otherwise use tv_sec */ +#define ONE_SECOND 1000000 + +struct timer_regs { + u32 gtccr; + u32 res0[3]; + u32 gtbcr; + u32 res1[3]; + u32 gtvpr; + u32 res2[3]; + u32 gtdr; + u32 res3[3]; +}; + +struct cascade_priv { + u32 tcr_value; /* TCR register: CASC & ROVR value */ + unsigned int cascade_map; /* cascade map */ + unsigned int timer_num; /* cascade control timer */ +}; + +struct timer_group_priv { + struct timer_regs __iomem *regs; + struct mpic_timer timer[TIMERS_PER_GROUP]; + struct list_head node; + unsigned int timerfreq; + unsigned int idle; + unsigned int flags; + spinlock_t lock; + void __iomem *group_tcr; +}; + +static struct cascade_priv cascade_timer[] = { + /* cascade timer 0 and 1 */ + {0x1, 0xc, 0x1}, + /* cascade timer 1 and 2 */ + {0x2, 0x6, 0x2}, + /* cascade timer 2 and 3 */ + {0x4, 0x3, 0x3} +}; + +static LIST_HEAD(timer_group_list); + +static void convert_ticks_to_time(struct timer_group_priv *priv, + const u64 ticks, struct timeval *time) +{ + u64 tmp_sec; + + time->tv_sec = (__kernel_time_t)div_u64(ticks, priv->timerfreq); + tmp_sec = (u64)time->tv_sec * (u64)priv->timerfreq; + + time->tv_usec = (__kernel_suseconds_t) + div_u64((ticks - tmp_sec) * 1000000, priv->timerfreq); + + return; +} + +/* the time set by the user is converted to "ticks" */ +static int convert_time_to_ticks(struct timer_group_priv *priv, + const struct timeval *time, u64 *ticks) +{ + u64 max_value; /* prevent u64 overflow */ + u64 tmp = 0; + + u64 tmp_sec; + u64 tmp_ms; + u64 tmp_us; + + max_value = div_u64(ULLONG_MAX, priv->timerfreq); + + if (time->tv_sec > max_value || + (time->tv_sec == max_value && time->tv_usec > 0)) + return -EINVAL; + + tmp_sec = (u64)time->tv_sec * (u64)priv->timerfreq; + tmp += tmp_sec; + + tmp_ms = time->tv_usec / 1000; + tmp_ms = div_u64((u64)tmp_ms * (u64)priv->timerfreq, 1000); + tmp += tmp_ms; + + tmp_us = time->tv_usec % 1000; + tmp_us = div_u64((u64)tmp_us * (u64)priv->timerfreq, 1000000); + tmp += tmp_us; + + *ticks = tmp; + + return 0; +} + +/* detect whether there is a cascade timer available */ +static struct mpic_timer *detect_idle_cascade_timer( + struct timer_group_priv *priv) +{ + struct cascade_priv *casc_priv; + unsigned int map; + unsigned int array_size = ARRAY_SIZE(cascade_timer); + unsigned int num; + unsigned int i; + unsigned long flags; + + casc_priv = cascade_timer; + for (i = 0; i < array_size; i++) { + spin_lock_irqsave(&priv->lock, flags); + map = casc_priv->cascade_map & priv->idle; + if (map == casc_priv->cascade_map) { + num = casc_priv->timer_num; + priv->timer[num].cascade_handle = casc_priv; + + /* set timer busy */ + priv->idle &= ~casc_priv->cascade_map; + spin_unlock_irqrestore(&priv->lock, flags); + return &priv->timer[num]; + } + spin_unlock_irqrestore(&priv->lock, flags); + casc_priv++; + } + + return NULL; +} + +static int set_cascade_timer(struct timer_group_priv *priv, u64 ticks, + unsigned int num) +{ + struct cascade_priv *casc_priv; + u32 tcr; + u32 tmp_ticks; + u32 rem_ticks; + + /* set group tcr reg for cascade */ + casc_priv = priv->timer[num].cascade_handle; + if (!casc_priv) + return -EINVAL; + + tcr = casc_priv->tcr_value | + (casc_priv->tcr_value << MPIC_TIMER_TCR_ROVR_OFFSET); + setbits32(priv->group_tcr, tcr); + + tmp_ticks = div_u64_rem(ticks, MAX_TICKS_CASCADE, &rem_ticks); + + out_be32(&priv->regs[num].gtccr, 0); + out_be32(&priv->regs[num].gtbcr, tmp_ticks | TIMER_STOP); + + out_be32(&priv->regs[num - 1].gtccr, 0); + out_be32(&priv->regs[num - 1].gtbcr, rem_ticks); + + return 0; +} + +static struct mpic_timer *get_cascade_timer(struct timer_group_priv *priv, + u64 ticks) +{ + struct mpic_timer *allocated_timer; + + /* Two cascade timers: Support the maximum time */ + const u64 max_ticks = (u64)MAX_TICKS * (u64)MAX_TICKS_CASCADE; + int ret; + + if (ticks > max_ticks) + return NULL; + + /* detect idle timer */ + allocated_timer = detect_idle_cascade_timer(priv); + if (!allocated_timer) + return NULL; + + /* set ticks to timer */ + ret = set_cascade_timer(priv, ticks, allocated_timer->num); + if (ret < 0) + return NULL; + + return allocated_timer; +} + +static struct mpic_timer *get_timer(const struct timeval *time) +{ + struct timer_group_priv *priv; + struct mpic_timer *timer; + + u64 ticks; + unsigned int num; + unsigned int i; + unsigned long flags; + int ret; + + list_for_each_entry(priv, &timer_group_list, node) { + ret = convert_time_to_ticks(priv, time, &ticks); + if (ret < 0) + return NULL; + + if (ticks > MAX_TICKS) { + if (!(priv->flags & FSL_GLOBAL_TIMER)) + return NULL; + + timer = get_cascade_timer(priv, ticks); + if (!timer) + continue; + + return timer; + } + + for (i = 0; i < TIMERS_PER_GROUP; i++) { + /* one timer: Reverse allocation */ + num = TIMERS_PER_GROUP - 1 - i; + spin_lock_irqsave(&priv->lock, flags); + if (priv->idle & (1 << i)) { + /* set timer busy */ + priv->idle &= ~(1 << i); + /* set ticks & stop timer */ + out_be32(&priv->regs[num].gtbcr, + ticks | TIMER_STOP); + out_be32(&priv->regs[num].gtccr, 0); + priv->timer[num].cascade_handle = NULL; + spin_unlock_irqrestore(&priv->lock, flags); + return &priv->timer[num]; + } + spin_unlock_irqrestore(&priv->lock, flags); + } + } + + return NULL; +} + +/** + * mpic_start_timer - start hardware timer + * @handle: the timer to be started. + * + * It will do ->fn(->dev) callback from the hardware interrupt at + * the ->timeval point in the future. + */ +void mpic_start_timer(struct mpic_timer *handle) +{ + struct timer_group_priv *priv = container_of(handle, + struct timer_group_priv, timer[handle->num]); + + clrbits32(&priv->regs[handle->num].gtbcr, TIMER_STOP); +} +EXPORT_SYMBOL(mpic_start_timer); + +/** + * mpic_stop_timer - stop hardware timer + * @handle: the timer to be stoped + * + * The timer periodically generates an interrupt. Unless user stops the timer. + */ +void mpic_stop_timer(struct mpic_timer *handle) +{ + struct timer_group_priv *priv = container_of(handle, + struct timer_group_priv, timer[handle->num]); + struct cascade_priv *casc_priv; + + setbits32(&priv->regs[handle->num].gtbcr, TIMER_STOP); + + casc_priv = priv->timer[handle->num].cascade_handle; + if (casc_priv) { + out_be32(&priv->regs[handle->num].gtccr, 0); + out_be32(&priv->regs[handle->num - 1].gtccr, 0); + } else { + out_be32(&priv->regs[handle->num].gtccr, 0); + } +} +EXPORT_SYMBOL(mpic_stop_timer); + +/** + * mpic_get_remain_time - get timer time + * @handle: the timer to be selected. + * @time: time for timer + * + * Query timer remaining time. + */ +void mpic_get_remain_time(struct mpic_timer *handle, struct timeval *time) +{ + struct timer_group_priv *priv = container_of(handle, + struct timer_group_priv, timer[handle->num]); + struct cascade_priv *casc_priv; + + u64 ticks; + u32 tmp_ticks; + + casc_priv = priv->timer[handle->num].cascade_handle; + if (casc_priv) { + tmp_ticks = in_be32(&priv->regs[handle->num].gtccr); + ticks = ((u64)tmp_ticks & UINT_MAX) * (u64)MAX_TICKS_CASCADE; + tmp_ticks = in_be32(&priv->regs[handle->num - 1].gtccr); + ticks += tmp_ticks; + } else { + ticks = in_be32(&priv->regs[handle->num].gtccr); + } + + convert_ticks_to_time(priv, ticks, time); +} +EXPORT_SYMBOL(mpic_get_remain_time); + +/** + * mpic_free_timer - free hardware timer + * @handle: the timer to be removed. + * + * Free the timer. + * + * Note: can not be used in interrupt context. + */ +void mpic_free_timer(struct mpic_timer *handle) +{ + struct timer_group_priv *priv = container_of(handle, + struct timer_group_priv, timer[handle->num]); + + struct cascade_priv *casc_priv; + unsigned long flags; + + mpic_stop_timer(handle); + + casc_priv = priv->timer[handle->num].cascade_handle; + + free_irq(priv->timer[handle->num].irq, priv->timer[handle->num].dev); + + spin_lock_irqsave(&priv->lock, flags); + if (casc_priv) { + u32 tcr; + tcr = casc_priv->tcr_value | (casc_priv->tcr_value << + MPIC_TIMER_TCR_ROVR_OFFSET); + clrbits32(priv->group_tcr, tcr); + priv->idle |= casc_priv->cascade_map; + priv->timer[handle->num].cascade_handle = NULL; + } else { + priv->idle |= TIMER_OFFSET(handle->num); + } + spin_unlock_irqrestore(&priv->lock, flags); +} +EXPORT_SYMBOL(mpic_free_timer); + +/** + * mpic_request_timer - get a hardware timer + * @fn: interrupt handler function + * @dev: callback function of the data + * @time: time for timer + * + * This executes the "request_irq", returning NULL + * else "handle" on success. + */ +struct mpic_timer *mpic_request_timer(irq_handler_t fn, void *dev, + const struct timeval *time) +{ + struct mpic_timer *allocated_timer; + int ret; + + if (list_empty(&timer_group_list)) + return NULL; + + if (!(time->tv_sec + time->tv_usec) || + time->tv_sec < 0 || time->tv_usec < 0) + return NULL; + + if (time->tv_usec > ONE_SECOND) + return NULL; + + allocated_timer = get_timer(time); + if (!allocated_timer) + return NULL; + + ret = request_irq(allocated_timer->irq, fn, + IRQF_TRIGGER_LOW, "global-timer", dev); + if (ret) { + mpic_free_timer(allocated_timer); + return NULL; + } + + allocated_timer->dev = dev; + + return allocated_timer; +} +EXPORT_SYMBOL(mpic_request_timer); + +static int timer_group_get_freq(struct device_node *np, + struct timer_group_priv *priv) +{ + u32 div; + + if (priv->flags & FSL_GLOBAL_TIMER) { + struct device_node *dn; + + dn = of_find_compatible_node(NULL, NULL, "fsl,mpic"); + if (dn) { + of_property_read_u32(dn, "clock-frequency", + &priv->timerfreq); + of_node_put(dn); + } + } + + if (priv->timerfreq <= 0) + return -EINVAL; + + if (priv->flags & FSL_GLOBAL_TIMER) { + div = (1 << (MPIC_TIMER_TCR_CLKDIV >> 8)) * 8; + priv->timerfreq /= div; + } + + return 0; +} + +static int timer_group_get_irq(struct device_node *np, + struct timer_group_priv *priv) +{ + const u32 all_timer[] = { 0, TIMERS_PER_GROUP }; + const u32 *p; + u32 offset; + u32 count; + + unsigned int i; + unsigned int j; + unsigned int irq_index = 0; + unsigned int irq; + int len; + + p = of_get_property(np, "fsl,available-ranges", &len); + if (p && len % (2 * sizeof(u32)) != 0) { + pr_err("%s: malformed available-ranges property.\n", + np->full_name); + return -EINVAL; + } + + if (!p) { + p = all_timer; + len = sizeof(all_timer); + } + + len /= 2 * sizeof(u32); + + for (i = 0; i < len; i++) { + offset = p[i * 2]; + count = p[i * 2 + 1]; + for (j = 0; j < count; j++) { + irq = irq_of_parse_and_map(np, irq_index); + if (!irq) { + pr_err("%s: irq parse and map failed.\n", + np->full_name); + return -EINVAL; + } + + /* Set timer idle */ + priv->idle |= TIMER_OFFSET((offset + j)); + priv->timer[offset + j].irq = irq; + priv->timer[offset + j].num = offset + j; + irq_index++; + } + } + + return 0; +} + +static void timer_group_init(struct device_node *np) +{ + struct timer_group_priv *priv; + unsigned int i = 0; + int ret; + + priv = kzalloc(sizeof(struct timer_group_priv), GFP_KERNEL); + if (!priv) { + pr_err("%s: cannot allocate memory for group.\n", + np->full_name); + return; + } + + if (of_device_is_compatible(np, "fsl,mpic-global-timer")) + priv->flags |= FSL_GLOBAL_TIMER; + + priv->regs = of_iomap(np, i++); + if (!priv->regs) { + pr_err("%s: cannot ioremap timer register address.\n", + np->full_name); + goto out; + } + + if (priv->flags & FSL_GLOBAL_TIMER) { + priv->group_tcr = of_iomap(np, i++); + if (!priv->group_tcr) { + pr_err("%s: cannot ioremap tcr address.\n", + np->full_name); + goto out; + } + } + + ret = timer_group_get_freq(np, priv); + if (ret < 0) { + pr_err("%s: cannot get timer frequency.\n", np->full_name); + goto out; + } + + ret = timer_group_get_irq(np, priv); + if (ret < 0) { + pr_err("%s: cannot get timer irqs.\n", np->full_name); + goto out; + } + + spin_lock_init(&priv->lock); + + /* Init FSL timer hardware */ + if (priv->flags & FSL_GLOBAL_TIMER) + setbits32(priv->group_tcr, MPIC_TIMER_TCR_CLKDIV); + + list_add_tail(&priv->node, &timer_group_list); + + return; + +out: + if (priv->regs) + iounmap(priv->regs); + + if (priv->group_tcr) + iounmap(priv->group_tcr); + + kfree(priv); +} + +static void mpic_timer_resume(void) +{ + struct timer_group_priv *priv; + + list_for_each_entry(priv, &timer_group_list, node) { + /* Init FSL timer hardware */ + if (priv->flags & FSL_GLOBAL_TIMER) + setbits32(priv->group_tcr, MPIC_TIMER_TCR_CLKDIV); + } +} + +static const struct of_device_id mpic_timer_ids[] = { + { .compatible = "fsl,mpic-global-timer", }, + {}, +}; + +static struct syscore_ops mpic_timer_syscore_ops = { + .resume = mpic_timer_resume, +}; + +static int __init mpic_timer_init(void) +{ + struct device_node *np = NULL; + + for_each_matching_node(np, mpic_timer_ids) + timer_group_init(np); + + register_syscore_ops(&mpic_timer_syscore_ops); + + if (list_empty(&timer_group_list)) + return -ENODEV; + + return 0; +} +subsys_initcall(mpic_timer_init); From 9e6f31a9dbc58f6a5661f8dc8c919441b8d3ced4 Mon Sep 17 00:00:00 2001 From: "Dongsheng.wang@freescale.com" Date: Tue, 9 Apr 2013 10:22:31 +0800 Subject: [PATCH 154/156] powerpc/mpic: create mpic subsystem object Register a mpic subsystem at /sys/devices/system/ Signed-off-by: Wang Dongsheng Signed-off-by: Scott Wood --- arch/powerpc/include/asm/mpic.h | 2 ++ arch/powerpc/sysdev/mpic.c | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/arch/powerpc/include/asm/mpic.h b/arch/powerpc/include/asm/mpic.h index ea6bf7220da9..4a1ac9fbf186 100644 --- a/arch/powerpc/include/asm/mpic.h +++ b/arch/powerpc/include/asm/mpic.h @@ -339,6 +339,8 @@ struct mpic #endif }; +extern struct bus_type mpic_subsys; + /* * MPIC flags (passed to mpic_alloc) * diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 4635d11f2dc2..1be54faf60dd 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -48,6 +48,12 @@ #define DBG(fmt...) #endif +struct bus_type mpic_subsys = { + .name = "mpic", + .dev_name = "mpic", +}; +EXPORT_SYMBOL_GPL(mpic_subsys); + static struct mpic *mpics; static struct mpic *mpic_primary; static DEFINE_RAW_SPINLOCK(mpic_lock); @@ -2035,6 +2041,8 @@ static struct syscore_ops mpic_syscore_ops = { static int mpic_init_sys(void) { register_syscore_ops(&mpic_syscore_ops); + subsys_system_register(&mpic_subsys, NULL); + return 0; } From a63b3bc7db32b63bfe5f48fa8582f931db81c86e Mon Sep 17 00:00:00 2001 From: "Dongsheng.wang@freescale.com" Date: Tue, 9 Apr 2013 10:22:32 +0800 Subject: [PATCH 155/156] powerpc/fsl: add MPIC timer wakeup support The driver provides a way to wake up the system by the MPIC timer. For example, echo 5 > /sys/devices/system/mpic/timer_wakeup echo standby > /sys/power/state After 5 seconds the MPIC timer will generate an interrupt to wake up the system. Signed-off-by: Wang Dongsheng Signed-off-by: Zhao Chenhui Signed-off-by: Li Yang --- arch/powerpc/platforms/Kconfig | 9 ++ arch/powerpc/sysdev/Makefile | 1 + arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c | 161 ++++++++++++++++++++ 3 files changed, 171 insertions(+) create mode 100644 arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index 7d07d9e8e1dd..0847ee6cd616 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -98,6 +98,15 @@ config MPIC_TIMER chip, but it can potentially support other global timers complying with the OpenPIC standard. +config FSL_MPIC_TIMER_WAKEUP + tristate "Freescale MPIC global timer wakeup driver" + depends on FSL_SOC && MPIC_TIMER && PM + default n + help + The driver provides a way to wake up the system by MPIC + timer. + e.g. "echo 5 > /sys/devices/system/mpic/timer_wakeup" + config PPC_EPAPR_HV_PIC bool default n diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile index 2eb72c1a4291..f67ac900d870 100644 --- a/arch/powerpc/sysdev/Makefile +++ b/arch/powerpc/sysdev/Makefile @@ -5,6 +5,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) mpic-msi-obj-$(CONFIG_PCI_MSI) += mpic_msi.o mpic_u3msi.o mpic_pasemi_msi.o obj-$(CONFIG_MPIC) += mpic.o $(mpic-msi-obj-y) obj-$(CONFIG_MPIC_TIMER) += mpic_timer.o +obj-$(CONFIG_FSL_MPIC_TIMER_WAKEUP) += fsl_mpic_timer_wakeup.o mpic-msgr-obj-$(CONFIG_MPIC_MSGR) += mpic_msgr.o obj-$(CONFIG_MPIC) += mpic.o $(mpic-msi-obj-y) $(mpic-msgr-obj-y) obj-$(CONFIG_PPC_EPAPR_HV_PIC) += ehv_pic.o diff --git a/arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c b/arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c new file mode 100644 index 000000000000..1707bf04dec6 --- /dev/null +++ b/arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c @@ -0,0 +1,161 @@ +/* + * MPIC timer wakeup driver + * + * Copyright 2013 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +struct fsl_mpic_timer_wakeup { + struct mpic_timer *timer; + struct work_struct free_work; +}; + +static struct fsl_mpic_timer_wakeup *fsl_wakeup; +static DEFINE_MUTEX(sysfs_lock); + +static void fsl_free_resource(struct work_struct *ws) +{ + struct fsl_mpic_timer_wakeup *wakeup = + container_of(ws, struct fsl_mpic_timer_wakeup, free_work); + + mutex_lock(&sysfs_lock); + + if (wakeup->timer) { + disable_irq_wake(wakeup->timer->irq); + mpic_free_timer(wakeup->timer); + } + + wakeup->timer = NULL; + mutex_unlock(&sysfs_lock); +} + +static irqreturn_t fsl_mpic_timer_irq(int irq, void *dev_id) +{ + struct fsl_mpic_timer_wakeup *wakeup = dev_id; + + schedule_work(&wakeup->free_work); + + return wakeup->timer ? IRQ_HANDLED : IRQ_NONE; +} + +static ssize_t fsl_timer_wakeup_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct timeval interval; + int val = 0; + + mutex_lock(&sysfs_lock); + if (fsl_wakeup->timer) { + mpic_get_remain_time(fsl_wakeup->timer, &interval); + val = interval.tv_sec + 1; + } + mutex_unlock(&sysfs_lock); + + return sprintf(buf, "%d\n", val); +} + +static ssize_t fsl_timer_wakeup_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct timeval interval; + int ret; + + interval.tv_usec = 0; + if (kstrtol(buf, 0, &interval.tv_sec)) + return -EINVAL; + + mutex_lock(&sysfs_lock); + + if (fsl_wakeup->timer) { + disable_irq_wake(fsl_wakeup->timer->irq); + mpic_free_timer(fsl_wakeup->timer); + fsl_wakeup->timer = NULL; + } + + if (!interval.tv_sec) { + mutex_unlock(&sysfs_lock); + return count; + } + + fsl_wakeup->timer = mpic_request_timer(fsl_mpic_timer_irq, + fsl_wakeup, &interval); + if (!fsl_wakeup->timer) { + mutex_unlock(&sysfs_lock); + return -EINVAL; + } + + ret = enable_irq_wake(fsl_wakeup->timer->irq); + if (ret) { + mpic_free_timer(fsl_wakeup->timer); + fsl_wakeup->timer = NULL; + mutex_unlock(&sysfs_lock); + + return ret; + } + + mpic_start_timer(fsl_wakeup->timer); + + mutex_unlock(&sysfs_lock); + + return count; +} + +static struct device_attribute mpic_attributes = __ATTR(timer_wakeup, 0644, + fsl_timer_wakeup_show, fsl_timer_wakeup_store); + +static int __init fsl_wakeup_sys_init(void) +{ + int ret; + + fsl_wakeup = kzalloc(sizeof(struct fsl_mpic_timer_wakeup), GFP_KERNEL); + if (!fsl_wakeup) + return -ENOMEM; + + INIT_WORK(&fsl_wakeup->free_work, fsl_free_resource); + + ret = device_create_file(mpic_subsys.dev_root, &mpic_attributes); + if (ret) + kfree(fsl_wakeup); + + return ret; +} + +static void __exit fsl_wakeup_sys_exit(void) +{ + device_remove_file(mpic_subsys.dev_root, &mpic_attributes); + + mutex_lock(&sysfs_lock); + + if (fsl_wakeup->timer) { + disable_irq_wake(fsl_wakeup->timer->irq); + mpic_free_timer(fsl_wakeup->timer); + } + + kfree(fsl_wakeup); + + mutex_unlock(&sysfs_lock); +} + +module_init(fsl_wakeup_sys_init); +module_exit(fsl_wakeup_sys_exit); + +MODULE_DESCRIPTION("Freescale MPIC global timer wakeup driver"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Wang Dongsheng "); From 1d8b368ab4aacfc3f864655baad4d31a3028ec1a Mon Sep 17 00:00:00 2001 From: Aruna Balakrishnaiah Date: Tue, 2 Jul 2013 11:06:54 +0530 Subject: [PATCH 156/156] pstore: Add hsize argument in write_buf call of pstore_ftrace_call Incorporate the addition of hsize argument in write_buf callback of pstore. This was forgotten in 6bbbca735936e15b9431882eceddcf6dff76e03c pstore: Pass header size in the pstore write callback Causing a build failure when ftrace and pstore are enabled. Signed-off-by: Aruna Balakrishnaiah Signed-off-by: Benjamin Herrenschmidt --- fs/pstore/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 43b12807a51d..76a4eeb92982 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -44,7 +44,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, rec.parent_ip = parent_ip; pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, - sizeof(rec), psinfo); + 0, sizeof(rec), psinfo); local_irq_restore(flags); }