GHC optimization issue

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
5 messages Options
Reply | Threaded
Open this post in threaded view
|

GHC optimization issue

Joel Reymont
Folks,

I have been trying to improve my byte swapping routines as part of my  
effort to speed up serialization. I then tried to look at the core  
output from GHC to see what it was converting my code into. Brandon  
(skew on #haskell) helped me code a TH version but then I went with a  
regular CPP version instead.

The point of contention is that the logical approach to optimization  
does not produce expected results. The only difference between the  
following two core outputs is the presence or absence of specialize  
and inline pragmas. The code is the same.

The version without pragmas inlines foo1 into

Main.foo1 :: GHC.Word.Word16
[GlobalId]
[Str: DmdType]
Main.foo1 = case GHC.Word.Word16 GHC.Word.$wshift3 __word 3855 (-8)
             of ww1_a2eo { __DEFAULT ->
             case GHC.Word.Word16 GHC.Word.$wshift3 __word 15 8 of  
ww_a2f4 { __DEFAULT ->
             GHC.Word.W16# (GHC.Prim.narrow16Word#  
(GHC.Prim.plusWord# ww1_a2eo ww_a2f4))
             }
             }

Whereas the version _with_ pragmas produces a function call:

Main.foo1 :: GHC.Word.Word16
[GlobalId]
[Str: DmdType]
Main.foo1 = case GHC.Word.Word16 Swap1.$w$sswap16 __word 3855
             of ww1_s1Ia { __DEFAULT ->
             GHC.Word.W16# ww1_s1Ia
             }

Is there a reasonable explanation?

Both versions compiled thusly: ghc --make Foo.hs -O -ddump-simpl > foo

Swap:

{-# OPTIONS_GHC -fglasgow-exts -cpp #-}
module Swap1
(
swap16
)
where

import Data.Word
import Data.Int
import Data.Bits

#define BIG_ENDIAN 1

{-# SPECIALIZE swap16 :: Word16 -> Word16 #-}
{-# SPECIALIZE swap16 :: Int16 -> Int16 #-}
{-# INLINE swap16 #-}

swap16 :: Bits a => a -> a
#ifdef BIG_ENDIAN
swap16 v = (v `shiftR` 8) + ((v .&. 0xFF) `shiftL` 8)
#else
swap16 v = v
#endif

Foo:

module Main where

import Data.Word
import Swap1

foo1 :: Word16
foo1 = swap16 0x0f0f

main = putStrLn $ show foo1

Core output for the version WITH pragmas, see the version w/o pragmas  
way below

==================== Tidy Core Rules ====================
"SPEC Swap1.swap16" __forall {$dBits_X1Co :: {Data.Bits.Bits  
GHC.Int.Int16}}
                       Swap1.swap16 @ GHC.Int.Int16 $dBits_X1Co
                       = Swap1.$sswap16 ;
"SPEC Swap1.swap16" __forall {$dBits_X1Cv :: {Data.Bits.Bits  
GHC.Word.Word16}}
                       Swap1.swap16 @ GHC.Word.Word16 $dBits_X1Cv
                       = Swap1.$sswap161 ;


==================== Tidy Core ====================
Main.foo1 :: GHC.Word.Word16
[GlobalId]
[Str: DmdType]
Main.foo1 = case GHC.Word.Word16 Swap1.$w$sswap16 __word 3855
             of ww1_s1Ia { __DEFAULT ->
             GHC.Word.W16# ww1_s1Ia
             }

==================== Tidy Core ====================
Swap1.$w$sswap16 :: GHC.Prim.Word# -> GHC.Prim.Word#
[GlobalId]
[Arity 1
NoCafRefs
Str: DmdType L]
Swap1.$w$sswap16 = \ (ww_s1I5 :: GHC.Prim.Word#) ->
                      case GHC.Prim.Word# GHC.Word.$wshift3 ww_s1I5  
(-8) of ww1_a1Gx { __DEFAULT ->
                      case GHC.Prim.Word# GHC.Word.$wshift3  
(GHC.Prim.and# ww_s1I5 __word 255) 8
                      of ww11_a1Hd { __DEFAULT ->
                      GHC.Prim.narrow16Word# (GHC.Prim.plusWord#  
ww1_a1Gx ww11_a1Hd)
                      }
                      }

Swap1.$w$sswap161 :: GHC.Prim.Int# -> GHC.Prim.Int#
[GlobalId]
[Arity 1
NoCafRefs
Str: DmdType L]
Swap1.$w$sswap161 = \ (ww_s1HV :: GHC.Prim.Int#) ->
                       case GHC.Prim.Int# GHC.Int.$wshift2 ww_s1HV  
(-8) of ww1_a1Fk { __DEFAULT ->
                       case GHC.Prim.Int# GHC.Int.$wshift2
                                            (GHC.Prim.word2Int#  
(GHC.Prim.and# (GHC.Prim.int2Word# ww_s1HV) __word 255))
                                            8
                       of ww11_a1G5 { __DEFAULT ->
                       GHC.Prim.narrow16Int# (GHC.Prim.+# ww1_a1Fk  
ww11_a1G5)
                       }
                       }

Swap1.$sswap16 :: GHC.Int.Int16 -> GHC.Int.Int16
[GlobalId]
[Arity 1
Worker Swap1.$w$sswap161
NoCafRefs
Str: DmdType U(L)m]
Swap1.$sswap16 = __inline_me (\ (w_s1HT :: GHC.Int.Int16) ->
                                 case GHC.Int.Int16 w_s1HT of w1_X1I5  
{ GHC.Int.I16# ww_s1HV ->
                                 case GHC.Int.Int16 Swap1.$w$sswap161  
ww_s1HV of ww1_s1I0 { __DEFAULT ->
                                 GHC.Int.I16# ww1_s1I0
                                 }
                                 })

Swap1.$sswap161 :: GHC.Word.Word16 -> GHC.Word.Word16
[GlobalId]
[Arity 1
Worker Swap1.$w$sswap16
NoCafRefs
Str: DmdType U(L)m]
Swap1.$sswap161 = __inline_me (\ (w_s1I3 :: GHC.Word.Word16) ->
                                  case GHC.Word.Word16 w_s1I3 of  
w1_X1Ih { GHC.Word.W16# ww_s1I5 ->
                                  case GHC.Word.Word16 Swap1.$w
$sswap16 ww_s1I5 of ww1_s1Ia { __DEFAULT ->
                                  GHC.Word.W16# ww1_s1Ia
                                  }
                                  })

Swap1.swap16 :: forall a_a1aQ. (Data.Bits.Bits a_a1aQ) => a_a1aQ ->  
a_a1aQ
[GlobalId]
[Arity 1
NoCafRefs
Str: DmdType L]
Swap1.swap16 = __inline_me (\ (@ a_a1dB)
                               ($dBits_a1ip :: {Data.Bits.Bits  
a_a1dB}) ->
                               let {
                                 $dNum_s1BO :: {GHC.Num.Num a_a1dB}
                                 [Str: DmdType {a1ip->U
(SAAAAAAAAAAAAAAAAA)}]
                                 $dNum_s1BO = Data.Bits.$p1Bits @  
a_a1dB $dBits_a1ip } in
                               let {
                                 lit_s1BN :: a_a1dB
                                 [Str: DmdType {a1ip->U
(SAAAAAAAAAAAAAAAAA) s1BO->U(AAAAAAAAS)}]
                                 lit_s1BN = GHC.Num.fromInteger @  
a_a1dB $dNum_s1BO (GHC.Num.S# 255)
                               } in
                                 \ (v_a1aW :: a_a1dB) ->
                                   GHC.Num.+
                                     @ a_a1dB
                                     $dNum_s1BO
                                     (Data.Bits.shiftR @ a_a1dB  
$dBits_a1ip v_a1aW (GHC.Base.I# 8))
                                     (Data.Bits.shiftL
                                        @ a_a1dB
                                        $dBits_a1ip
                                        (Data.Bits..&. @ a_a1dB  
$dBits_a1ip v_a1aW lit_s1BN)
                                        (GHC.Base.I# 8)))


Core output without pragmas:

Main.foo1 :: GHC.Word.Word16
[GlobalId]
[Str: DmdType]
Main.foo1 = case GHC.Word.Word16 GHC.Word.$wshift3 __word 3855 (-8)
             of ww1_a2eo { __DEFAULT ->
             case GHC.Word.Word16 GHC.Word.$wshift3 __word 15 8 of  
ww_a2f4 { __DEFAULT ->
             GHC.Word.W16# (GHC.Prim.narrow16Word#  
(GHC.Prim.plusWord# ww1_a2eo ww_a2f4))
             }
             }

---
Swap1.swap16 :: forall a_a1aQ. (Data.Bits.Bits a_a1aQ) => a_a1aQ ->  
a_a1aQ
[GlobalId]
[Arity 1
NoCafRefs
Str: DmdType L]
Swap1.swap16 = \ (@ a_a1dB) ($dBits_a1ip :: {Data.Bits.Bits a_a1dB}) ->
                  let {
                    $dNum_s1BK :: {GHC.Num.Num a_a1dB}
                    [Str: DmdType]
                    $dNum_s1BK = case {GHC.Num.Num a_a1dB} $dBits_a1ip
                                 of tpl_B1 { Data.Bits.:DBits tpl1_B2  
tpl2_B3 tpl3_B4 tpl4_B5 tpl5_B6 tpl6_B7 tpl7_B8 tpl8_B9 tpl9_Ba  
tpl10_Bb tpl11_Bc tpl12_Bd tpl13_Be tpl14_Bf tpl15_Bg tpl16_Bh  
tpl17_Bi tpl18_Bj ->
                                 tpl1_B2
                                 } } in
                  let {
                    lit_s1BM :: a_a1dB
                    [Str: DmdType]
                    lit_s1BM = case a_a1dB $dNum_s1BK
                               of tpl_B1 { GHC.Num.:DNum tpl1_B2  
tpl2_B3 tpl3_B4 tpl4_B5 tpl5_B6 tpl6_B7 tpl7_B8 tpl8_B9 tpl9_Ba ->
                               tpl9_Ba Swap1.lvl1
                               }
                  } in
                    \ (v_a1aW :: a_a1dB) ->
                      tpl3_B4
                        (case a_a1dB $dBits_a1ip
                         of tpl10_Xy { Data.Bits.:DBits tpl11_XB  
tpl12_XD tpl13_XF tpl14_XH tpl15_XJ tpl16_XL tpl17_XN tpl18_XP  
tpl19_XR tpl20_Bb tpl21_Bc tpl22_Bd tpl23_Be tpl24_Bf tpl25_Bg  
tpl26_Bh tpl27_Bi tpl28_Bj ->
                         tpl26_Bh v_a1aW Swap1.lvl
                         })
                        (case a_a1dB $dBits_a1ip
                         of tpl10_Xy { Data.Bits.:DBits tpl11_XB  
tpl12_XD tpl13_XF tpl14_XH tpl15_XJ tpl16_XL tpl17_XN tpl18_XP  
tpl19_XR tpl20_Bb tpl21_Bc tpl22_Bd tpl23_Be tpl24_Bf tpl25_Bg  
tpl26_Bh tpl27_Bi tpl28_Bj ->
                         tpl25_Bg (tpl12_XD v_a1aW lit_s1BM) Swap1.lvl
                         })

--
http://wagerlabs.com/





_______________________________________________
Haskell-Cafe mailing list
[hidden email]
http://www.haskell.org/mailman/listinfo/haskell-cafe
Reply | Threaded
Open this post in threaded view
|

Re: GHC Optimization Issue

Brandon Moore
I think what's happening here is that the rewrite rule SPECIALIZE is
producing turns the call to swap16 into a call to the specialized
verision before any inlining happens[1], and then the specialized
version can't be inlined[2].

Manually writing out a copy of swap16 at type Word16 -> Word16, marking
it INLINE and setting up a rewite rule for specialization gives you the
more efficient code:

changing the definition to swap16 to this:

swap16 :: Bits a => a -> a
swap16 v = (v `shiftR` 8) + ((v .&. 0xFF) `shiftL` 8)

{- # RULES "swap16/Word16" forall (v :: Int16) . swap16 v = swap16Word16
v #-}
{- # INLINE swap16Word16 #-}

swap16Word16 :: Word16 -> Word16
swap16Word16 v = (v `shiftR` 8) + ((v .&. 0xFF) `shiftL` 8)

gives this Core for foo:
Main.foo :: GHC.Word.Word16
[GlobalId]
[Str: DmdType]
Main.foo = case GHC.Word.Word16 GHC.Word.$wshift3 __word 3855 (-8)
            of ww1 { __DEFAULT ->
            case GHC.Word.Word16 GHC.Word.$wshift3 __word 15 8 of ww {
__DEFAULT ->
            GHC.Word.W16# (GHC.Prim.narrow16Word# (GHC.Prim.plusWord#
ww1 ww))
            }
            }

and for swapWord16:

Swap.swap16Word16 :: GHC.Word.Word16 -> GHC.Word.Word16
[GlobalId]
[Arity 1
  NoCafRefs
  Str: DmdType U(L)m]
Swap.swap16Word16 = __inline_me (\ (v :: GHC.Word.Word16) ->
                                    GHC.Word.+2
                                      (GHC.Word.$dmshiftR2 v
(GHC.Base.I# 8))
                                      (GHC.Word.shift2 (GHC.Word..&.2 v
Swap.lit) (GHC.Base.I# 8)))


Removing the INLINE directive for swap16Word16 leaves this code for foo,
with a call to the worker function for swap16Word16:

Main.foo :: GHC.Word.Word16
[GlobalId]
[Str: DmdType]
Main.foo = case GHC.Word.Word16 Swap.$wswap16Word16 __word 3855
            of ww1 { __DEFAULT ->
            GHC.Word.W16# ww1
            }

and these definitions for swap16Word16:

Swap.$wswap16Word16 :: GHC.Prim.Word# -> GHC.Prim.Word#
[GlobalId]
[Arity 1
  NoCafRefs
  Str: DmdType L]
Swap.$wswap16Word16 = \ (ww :: GHC.Prim.Word#) ->
                         case GHC.Prim.Word# GHC.Word.$wshift3 ww (-8)
of ww1 { __DEFAULT ->
                         case GHC.Prim.Word# GHC.Word.$wshift3
(GHC.Prim.and# ww __word 255) 8
                         of ww11 { __DEFAULT ->
                         GHC.Prim.narrow16Word# (GHC.Prim.plusWord# ww1
ww11)
                         }
                         }

Swap.swap16Word16 :: GHC.Word.Word16 -> GHC.Word.Word16
[GlobalId]
[Arity 1
  Worker Swap.$wswap16Word16
  NoCafRefs
  Str: DmdType U(L)m]
Swap.swap16Word16 = __inline_me (\ (w :: GHC.Word.Word16) ->
                                    case GHC.Word.Word16 w of w1 {
GHC.Word.W16# ww ->
                                    case GHC.Word.Word16
Swap.$wswap16Word16 ww of ww1 { __DEFAULT ->
                                    GHC.Word.W16# ww1
                                    }
                                    })

It looks like marking the function INLINE supresses the worker/wrapper
split, and when there is not INLINE and the function get split, only the
wrapper is eligible for inlining.

Brandon Moore

[1] (section 10.7.2 of the 6.4.1 User's guide says "In the earlier
phases of compilation, GHC inlines nothing that appears on the LHS of a
rule, because once you have substituted for something you can't match
against it (given the simple minded matching).")

[2] Maybe because the worker isn't annotation with __inline_me in the
Core? Maybe from the aside in 7.10.5 about and INLINE pragma preventing
any inlining in the RHS of the thing marked inline?
_______________________________________________
Haskell-Cafe mailing list
[hidden email]
http://www.haskell.org/mailman/listinfo/haskell-cafe
Reply | Threaded
Open this post in threaded view
|

RE: GHC optimization issue

Simon Peyton Jones
In reply to this post by Joel Reymont
(Brief because I'm at home.)

Use INLINE (alone) to include swap at its call sites.  (That gives
perfect per-call-site specialisation.)

Use SPECIALISE (alone) to make a specialised copy of swap, which is then
not duplicated at each call site, but instead is called.

No need to use them both.

The fact that GHC inlines the (big) unspecialised version and not the
(apparently smaller) specialised version, is something we should look
into.

Simon

| -----Original Message-----
| From: Joel Reymont [mailto:[hidden email]]
| Sent: 22 December 2005 14:09
| To: Haskell-Cafe Cafe
| Cc: Simon Peyton-Jones; Simon Marlow
| Subject: GHC optimization issue
|
| Folks,
|
| I have been trying to improve my byte swapping routines as part of my
| effort to speed up serialization. I then tried to look at the core
| output from GHC to see what it was converting my code into. Brandon
| (skew on #haskell) helped me code a TH version but then I went with a
| regular CPP version instead.
|
| The point of contention is that the logical approach to optimization
| does not produce expected results. The only difference between the
| following two core outputs is the presence or absence of specialize
| and inline pragmas. The code is the same.
|
| The version without pragmas inlines foo1 into
|
| Main.foo1 :: GHC.Word.Word16
| [GlobalId]
| [Str: DmdType]
| Main.foo1 = case GHC.Word.Word16 GHC.Word.$wshift3 __word 3855 (-8)
|              of ww1_a2eo { __DEFAULT ->
|              case GHC.Word.Word16 GHC.Word.$wshift3 __word 15 8 of
| ww_a2f4 { __DEFAULT ->
|              GHC.Word.W16# (GHC.Prim.narrow16Word#
| (GHC.Prim.plusWord# ww1_a2eo ww_a2f4))
|              }
|              }
|
| Whereas the version _with_ pragmas produces a function call:
|
| Main.foo1 :: GHC.Word.Word16
| [GlobalId]
| [Str: DmdType]
| Main.foo1 = case GHC.Word.Word16 Swap1.$w$sswap16 __word 3855
|              of ww1_s1Ia { __DEFAULT ->
|              GHC.Word.W16# ww1_s1Ia
|              }
|
| Is there a reasonable explanation?
|
| Both versions compiled thusly: ghc --make Foo.hs -O -ddump-simpl > foo
|
| Swap:
|
| {-# OPTIONS_GHC -fglasgow-exts -cpp #-}
| module Swap1
| (
| swap16
| )
| where
|
| import Data.Word
| import Data.Int
| import Data.Bits
|
| #define BIG_ENDIAN 1
|
| {-# SPECIALIZE swap16 :: Word16 -> Word16 #-}
| {-# SPECIALIZE swap16 :: Int16 -> Int16 #-}
| {-# INLINE swap16 #-}
|
| swap16 :: Bits a => a -> a
| #ifdef BIG_ENDIAN
| swap16 v = (v `shiftR` 8) + ((v .&. 0xFF) `shiftL` 8)
| #else
| swap16 v = v
| #endif
|
| Foo:
|
| module Main where
|
| import Data.Word
| import Swap1
|
| foo1 :: Word16
| foo1 = swap16 0x0f0f
|
| main = putStrLn $ show foo1
|
| Core output for the version WITH pragmas, see the version w/o pragmas
| way below
|
| ==================== Tidy Core Rules ====================
| "SPEC Swap1.swap16" __forall {$dBits_X1Co :: {Data.Bits.Bits
| GHC.Int.Int16}}
|                        Swap1.swap16 @ GHC.Int.Int16 $dBits_X1Co
|                        = Swap1.$sswap16 ;
| "SPEC Swap1.swap16" __forall {$dBits_X1Cv :: {Data.Bits.Bits
| GHC.Word.Word16}}
|                        Swap1.swap16 @ GHC.Word.Word16 $dBits_X1Cv
|                        = Swap1.$sswap161 ;
|
|
| ==================== Tidy Core ====================
| Main.foo1 :: GHC.Word.Word16
| [GlobalId]
| [Str: DmdType]
| Main.foo1 = case GHC.Word.Word16 Swap1.$w$sswap16 __word 3855
|              of ww1_s1Ia { __DEFAULT ->
|              GHC.Word.W16# ww1_s1Ia
|              }
|
| ==================== Tidy Core ====================
| Swap1.$w$sswap16 :: GHC.Prim.Word# -> GHC.Prim.Word#
| [GlobalId]
| [Arity 1
| NoCafRefs
| Str: DmdType L]
| Swap1.$w$sswap16 = \ (ww_s1I5 :: GHC.Prim.Word#) ->
|                       case GHC.Prim.Word# GHC.Word.$wshift3 ww_s1I5
| (-8) of ww1_a1Gx { __DEFAULT ->
|                       case GHC.Prim.Word# GHC.Word.$wshift3
| (GHC.Prim.and# ww_s1I5 __word 255) 8
|                       of ww11_a1Hd { __DEFAULT ->
|                       GHC.Prim.narrow16Word# (GHC.Prim.plusWord#
| ww1_a1Gx ww11_a1Hd)
|                       }
|                       }
|
| Swap1.$w$sswap161 :: GHC.Prim.Int# -> GHC.Prim.Int#
| [GlobalId]
| [Arity 1
| NoCafRefs
| Str: DmdType L]
| Swap1.$w$sswap161 = \ (ww_s1HV :: GHC.Prim.Int#) ->
|                        case GHC.Prim.Int# GHC.Int.$wshift2 ww_s1HV
| (-8) of ww1_a1Fk { __DEFAULT ->
|                        case GHC.Prim.Int# GHC.Int.$wshift2
|                                             (GHC.Prim.word2Int#
| (GHC.Prim.and# (GHC.Prim.int2Word# ww_s1HV) __word 255))
|                                             8
|                        of ww11_a1G5 { __DEFAULT ->
|                        GHC.Prim.narrow16Int# (GHC.Prim.+# ww1_a1Fk
| ww11_a1G5)
|                        }
|                        }
|
| Swap1.$sswap16 :: GHC.Int.Int16 -> GHC.Int.Int16
| [GlobalId]
| [Arity 1
| Worker Swap1.$w$sswap161
| NoCafRefs
| Str: DmdType U(L)m]
| Swap1.$sswap16 = __inline_me (\ (w_s1HT :: GHC.Int.Int16) ->
|                                  case GHC.Int.Int16 w_s1HT of w1_X1I5
| { GHC.Int.I16# ww_s1HV ->
|                                  case GHC.Int.Int16 Swap1.$w$sswap161
| ww_s1HV of ww1_s1I0 { __DEFAULT ->
|                                  GHC.Int.I16# ww1_s1I0
|                                  }
|                                  })
|
| Swap1.$sswap161 :: GHC.Word.Word16 -> GHC.Word.Word16
| [GlobalId]
| [Arity 1
| Worker Swap1.$w$sswap16
| NoCafRefs
| Str: DmdType U(L)m]
| Swap1.$sswap161 = __inline_me (\ (w_s1I3 :: GHC.Word.Word16) ->
|                                   case GHC.Word.Word16 w_s1I3 of
| w1_X1Ih { GHC.Word.W16# ww_s1I5 ->
|                                   case GHC.Word.Word16 Swap1.$w
| $sswap16 ww_s1I5 of ww1_s1Ia { __DEFAULT ->
|                                   GHC.Word.W16# ww1_s1Ia
|                                   }
|                                   })
|
| Swap1.swap16 :: forall a_a1aQ. (Data.Bits.Bits a_a1aQ) => a_a1aQ ->
| a_a1aQ
| [GlobalId]
| [Arity 1
| NoCafRefs
| Str: DmdType L]
| Swap1.swap16 = __inline_me (\ (@ a_a1dB)
|                                ($dBits_a1ip :: {Data.Bits.Bits
| a_a1dB}) ->
|                                let {
|                                  $dNum_s1BO :: {GHC.Num.Num a_a1dB}
|                                  [Str: DmdType {a1ip->U
| (SAAAAAAAAAAAAAAAAA)}]
|                                  $dNum_s1BO = Data.Bits.$p1Bits @
| a_a1dB $dBits_a1ip } in
|                                let {
|                                  lit_s1BN :: a_a1dB
|                                  [Str: DmdType {a1ip->U
| (SAAAAAAAAAAAAAAAAA) s1BO->U(AAAAAAAAS)}]
|                                  lit_s1BN = GHC.Num.fromInteger @
| a_a1dB $dNum_s1BO (GHC.Num.S# 255)
|                                } in
|                                  \ (v_a1aW :: a_a1dB) ->
|                                    GHC.Num.+
|                                      @ a_a1dB
|                                      $dNum_s1BO
|                                      (Data.Bits.shiftR @ a_a1dB
| $dBits_a1ip v_a1aW (GHC.Base.I# 8))
|                                      (Data.Bits.shiftL
|                                         @ a_a1dB
|                                         $dBits_a1ip
|                                         (Data.Bits..&. @ a_a1dB
| $dBits_a1ip v_a1aW lit_s1BN)
|                                         (GHC.Base.I# 8)))
|
|
| Core output without pragmas:
|
| Main.foo1 :: GHC.Word.Word16
| [GlobalId]
| [Str: DmdType]
| Main.foo1 = case GHC.Word.Word16 GHC.Word.$wshift3 __word 3855 (-8)
|              of ww1_a2eo { __DEFAULT ->
|              case GHC.Word.Word16 GHC.Word.$wshift3 __word 15 8 of
| ww_a2f4 { __DEFAULT ->
|              GHC.Word.W16# (GHC.Prim.narrow16Word#
| (GHC.Prim.plusWord# ww1_a2eo ww_a2f4))
|              }
|              }
|
| ---
| Swap1.swap16 :: forall a_a1aQ. (Data.Bits.Bits a_a1aQ) => a_a1aQ ->
| a_a1aQ
| [GlobalId]
| [Arity 1
| NoCafRefs
| Str: DmdType L]
| Swap1.swap16 = \ (@ a_a1dB) ($dBits_a1ip :: {Data.Bits.Bits a_a1dB})
->
|                   let {
|                     $dNum_s1BK :: {GHC.Num.Num a_a1dB}
|                     [Str: DmdType]
|                     $dNum_s1BK = case {GHC.Num.Num a_a1dB} $dBits_a1ip
|                                  of tpl_B1 { Data.Bits.:DBits tpl1_B2
| tpl2_B3 tpl3_B4 tpl4_B5 tpl5_B6 tpl6_B7 tpl7_B8 tpl8_B9 tpl9_Ba
| tpl10_Bb tpl11_Bc tpl12_Bd tpl13_Be tpl14_Bf tpl15_Bg tpl16_Bh
| tpl17_Bi tpl18_Bj ->
|                                  tpl1_B2
|                                  } } in
|                   let {
|                     lit_s1BM :: a_a1dB
|                     [Str: DmdType]
|                     lit_s1BM = case a_a1dB $dNum_s1BK
|                                of tpl_B1 { GHC.Num.:DNum tpl1_B2
| tpl2_B3 tpl3_B4 tpl4_B5 tpl5_B6 tpl6_B7 tpl7_B8 tpl8_B9 tpl9_Ba ->
|                                tpl9_Ba Swap1.lvl1
|                                }
|                   } in
|                     \ (v_a1aW :: a_a1dB) ->
|                       tpl3_B4
|                         (case a_a1dB $dBits_a1ip
|                          of tpl10_Xy { Data.Bits.:DBits tpl11_XB
| tpl12_XD tpl13_XF tpl14_XH tpl15_XJ tpl16_XL tpl17_XN tpl18_XP
| tpl19_XR tpl20_Bb tpl21_Bc tpl22_Bd tpl23_Be tpl24_Bf tpl25_Bg
| tpl26_Bh tpl27_Bi tpl28_Bj ->
|                          tpl26_Bh v_a1aW Swap1.lvl
|                          })
|                         (case a_a1dB $dBits_a1ip
|                          of tpl10_Xy { Data.Bits.:DBits tpl11_XB
| tpl12_XD tpl13_XF tpl14_XH tpl15_XJ tpl16_XL tpl17_XN tpl18_XP
| tpl19_XR tpl20_Bb tpl21_Bc tpl22_Bd tpl23_Be tpl24_Bf tpl25_Bg
| tpl26_Bh tpl27_Bi tpl28_Bj ->
|                          tpl25_Bg (tpl12_XD v_a1aW lit_s1BM) Swap1.lvl
|                          })
|
| --
| http://wagerlabs.com/
|
|
|
|

_______________________________________________
Haskell-Cafe mailing list
[hidden email]
http://www.haskell.org/mailman/listinfo/haskell-cafe
Reply | Threaded
Open this post in threaded view
|

Re: GHC optimization issue

Joel Reymont
How expensive is a GHC function call?

How do I make a decision whether to specialize or to inline?

On Dec 22, 2005, at 10:06 PM, Simon Peyton-Jones wrote:

> (Brief because I'm at home.)
>
> Use INLINE (alone) to include swap at its call sites.  (That gives
> perfect per-call-site specialisation.)
>
> Use SPECIALISE (alone) to make a specialised copy of swap, which is  
> then
> not duplicated at each call site, but instead is called.

--
http://wagerlabs.com/





_______________________________________________
Haskell-Cafe mailing list
[hidden email]
http://www.haskell.org/mailman/listinfo/haskell-cafe
Reply | Threaded
Open this post in threaded view
|

RE: GHC optimization issue

Simon Peyton Jones
In reply to this post by Joel Reymont
I did look into this a little.  There are several things going on

* GHC doesn't really expect you to use INLINE and SPECIALISE together.
INLINE says to inline a copy of the function at every call site, which
is the best possible form of specialisation, so SPECIALISE is a bit
redundant if you are happy to do that.  (Incidentally, INLINE is ignored
for recursive functions, whereas SPECIALISE works fine.)

* If you have both INLINE and SPECIALISE, GHC makes a specialised copy
of the function, and the specialised copy takes precedence. So a call at
the specialised type turns into a call to the specialised function.   (A
call at any other type would not use the specialisation, and so the
un-specialised function would be inlined.)  Moreover, the specialised
copies are themselves not marked INLINE.

* GHC's HEAD in CVS (i.e 6.5) has a new feature
        {#- SPECIALISE INLINE f :: <type> #-}
which tells GHC to specialise f and inline the specialised copy (only).
Ordinary, un-specialised calls will not be inlined.

* When you delete the pragmas, GHC uses a size-based heuristic to decide
when to inline a function at a call site.  Oddly, this heuristic decides
(just) to inline the un-specialised function, but when it's specialised
GHC thinks it's bigger and doesn't inline it.    You can increase the
threshold for inlining with -funfolding-use-threshold=20, say.
       
http://www.haskell.org/ghc/docs/latest/html/users_guide/options-optimise
.html#options-f
This really is a bit odd, but I don't want to fiddle with GHC's inlining
heuristics without doing lots of benchmark runs etc.  Actually, it'd be
quite an easy thing for others to try fiddling with, and I bet there are
performance gains to be had.

Simon

| -----Original Message-----
| From: Joel Reymont [mailto:[hidden email]]
| Sent: 22 December 2005 14:09
| To: Haskell-Cafe Cafe
| Cc: Simon Peyton-Jones; Simon Marlow
| Subject: GHC optimization issue
|
| Folks,
|
| I have been trying to improve my byte swapping routines as part of my
| effort to speed up serialization. I then tried to look at the core
| output from GHC to see what it was converting my code into. Brandon
| (skew on #haskell) helped me code a TH version but then I went with a
| regular CPP version instead.
|
| The point of contention is that the logical approach to optimization
| does not produce expected results. The only difference between the
| following two core outputs is the presence or absence of specialize
| and inline pragmas. The code is the same.
|
| The version without pragmas inlines foo1 into
|
| Main.foo1 :: GHC.Word.Word16
| [GlobalId]
| [Str: DmdType]
| Main.foo1 = case GHC.Word.Word16 GHC.Word.$wshift3 __word 3855 (-8)
|              of ww1_a2eo { __DEFAULT ->
|              case GHC.Word.Word16 GHC.Word.$wshift3 __word 15 8 of
| ww_a2f4 { __DEFAULT ->
|              GHC.Word.W16# (GHC.Prim.narrow16Word#
| (GHC.Prim.plusWord# ww1_a2eo ww_a2f4))
|              }
|              }
|
| Whereas the version _with_ pragmas produces a function call:
|
| Main.foo1 :: GHC.Word.Word16
| [GlobalId]
| [Str: DmdType]
| Main.foo1 = case GHC.Word.Word16 Swap1.$w$sswap16 __word 3855
|              of ww1_s1Ia { __DEFAULT ->
|              GHC.Word.W16# ww1_s1Ia
|              }
|
| Is there a reasonable explanation?
|
| Both versions compiled thusly: ghc --make Foo.hs -O -ddump-simpl > foo
|
| Swap:
|
| {-# OPTIONS_GHC -fglasgow-exts -cpp #-}
| module Swap1
| (
| swap16
| )
| where
|
| import Data.Word
| import Data.Int
| import Data.Bits
|
| #define BIG_ENDIAN 1
|
| {-# SPECIALIZE swap16 :: Word16 -> Word16 #-}
| {-# SPECIALIZE swap16 :: Int16 -> Int16 #-}
| {-# INLINE swap16 #-}
|
| swap16 :: Bits a => a -> a
| #ifdef BIG_ENDIAN
| swap16 v = (v `shiftR` 8) + ((v .&. 0xFF) `shiftL` 8)
| #else
| swap16 v = v
| #endif
|
| Foo:
|
| module Main where
|
| import Data.Word
| import Swap1
|
| foo1 :: Word16
| foo1 = swap16 0x0f0f
|
| main = putStrLn $ show foo1
|
| Core output for the version WITH pragmas, see the version w/o pragmas
| way below
|
| ==================== Tidy Core Rules ====================
| "SPEC Swap1.swap16" __forall {$dBits_X1Co :: {Data.Bits.Bits
| GHC.Int.Int16}}
|                        Swap1.swap16 @ GHC.Int.Int16 $dBits_X1Co
|                        = Swap1.$sswap16 ;
| "SPEC Swap1.swap16" __forall {$dBits_X1Cv :: {Data.Bits.Bits
| GHC.Word.Word16}}
|                        Swap1.swap16 @ GHC.Word.Word16 $dBits_X1Cv
|                        = Swap1.$sswap161 ;
|
|
| ==================== Tidy Core ====================
| Main.foo1 :: GHC.Word.Word16
| [GlobalId]
| [Str: DmdType]
| Main.foo1 = case GHC.Word.Word16 Swap1.$w$sswap16 __word 3855
|              of ww1_s1Ia { __DEFAULT ->
|              GHC.Word.W16# ww1_s1Ia
|              }
|
| ==================== Tidy Core ====================
| Swap1.$w$sswap16 :: GHC.Prim.Word# -> GHC.Prim.Word#
| [GlobalId]
| [Arity 1
| NoCafRefs
| Str: DmdType L]
| Swap1.$w$sswap16 = \ (ww_s1I5 :: GHC.Prim.Word#) ->
|                       case GHC.Prim.Word# GHC.Word.$wshift3 ww_s1I5
| (-8) of ww1_a1Gx { __DEFAULT ->
|                       case GHC.Prim.Word# GHC.Word.$wshift3
| (GHC.Prim.and# ww_s1I5 __word 255) 8
|                       of ww11_a1Hd { __DEFAULT ->
|                       GHC.Prim.narrow16Word# (GHC.Prim.plusWord#
| ww1_a1Gx ww11_a1Hd)
|                       }
|                       }
|
| Swap1.$w$sswap161 :: GHC.Prim.Int# -> GHC.Prim.Int#
| [GlobalId]
| [Arity 1
| NoCafRefs
| Str: DmdType L]
| Swap1.$w$sswap161 = \ (ww_s1HV :: GHC.Prim.Int#) ->
|                        case GHC.Prim.Int# GHC.Int.$wshift2 ww_s1HV
| (-8) of ww1_a1Fk { __DEFAULT ->
|                        case GHC.Prim.Int# GHC.Int.$wshift2
|                                             (GHC.Prim.word2Int#
| (GHC.Prim.and# (GHC.Prim.int2Word# ww_s1HV) __word 255))
|                                             8
|                        of ww11_a1G5 { __DEFAULT ->
|                        GHC.Prim.narrow16Int# (GHC.Prim.+# ww1_a1Fk
| ww11_a1G5)
|                        }
|                        }
|
| Swap1.$sswap16 :: GHC.Int.Int16 -> GHC.Int.Int16
| [GlobalId]
| [Arity 1
| Worker Swap1.$w$sswap161
| NoCafRefs
| Str: DmdType U(L)m]
| Swap1.$sswap16 = __inline_me (\ (w_s1HT :: GHC.Int.Int16) ->
|                                  case GHC.Int.Int16 w_s1HT of w1_X1I5
| { GHC.Int.I16# ww_s1HV ->
|                                  case GHC.Int.Int16 Swap1.$w$sswap161
| ww_s1HV of ww1_s1I0 { __DEFAULT ->
|                                  GHC.Int.I16# ww1_s1I0
|                                  }
|                                  })
|
| Swap1.$sswap161 :: GHC.Word.Word16 -> GHC.Word.Word16
| [GlobalId]
| [Arity 1
| Worker Swap1.$w$sswap16
| NoCafRefs
| Str: DmdType U(L)m]
| Swap1.$sswap161 = __inline_me (\ (w_s1I3 :: GHC.Word.Word16) ->
|                                   case GHC.Word.Word16 w_s1I3 of
| w1_X1Ih { GHC.Word.W16# ww_s1I5 ->
|                                   case GHC.Word.Word16 Swap1.$w
| $sswap16 ww_s1I5 of ww1_s1Ia { __DEFAULT ->
|                                   GHC.Word.W16# ww1_s1Ia
|                                   }
|                                   })
|
| Swap1.swap16 :: forall a_a1aQ. (Data.Bits.Bits a_a1aQ) => a_a1aQ ->
| a_a1aQ
| [GlobalId]
| [Arity 1
| NoCafRefs
| Str: DmdType L]
| Swap1.swap16 = __inline_me (\ (@ a_a1dB)
|                                ($dBits_a1ip :: {Data.Bits.Bits
| a_a1dB}) ->
|                                let {
|                                  $dNum_s1BO :: {GHC.Num.Num a_a1dB}
|                                  [Str: DmdType {a1ip->U
| (SAAAAAAAAAAAAAAAAA)}]
|                                  $dNum_s1BO = Data.Bits.$p1Bits @
| a_a1dB $dBits_a1ip } in
|                                let {
|                                  lit_s1BN :: a_a1dB
|                                  [Str: DmdType {a1ip->U
| (SAAAAAAAAAAAAAAAAA) s1BO->U(AAAAAAAAS)}]
|                                  lit_s1BN = GHC.Num.fromInteger @
| a_a1dB $dNum_s1BO (GHC.Num.S# 255)
|                                } in
|                                  \ (v_a1aW :: a_a1dB) ->
|                                    GHC.Num.+
|                                      @ a_a1dB
|                                      $dNum_s1BO
|                                      (Data.Bits.shiftR @ a_a1dB
| $dBits_a1ip v_a1aW (GHC.Base.I# 8))
|                                      (Data.Bits.shiftL
|                                         @ a_a1dB
|                                         $dBits_a1ip
|                                         (Data.Bits..&. @ a_a1dB
| $dBits_a1ip v_a1aW lit_s1BN)
|                                         (GHC.Base.I# 8)))
|
|
| Core output without pragmas:
|
| Main.foo1 :: GHC.Word.Word16
| [GlobalId]
| [Str: DmdType]
| Main.foo1 = case GHC.Word.Word16 GHC.Word.$wshift3 __word 3855 (-8)
|              of ww1_a2eo { __DEFAULT ->
|              case GHC.Word.Word16 GHC.Word.$wshift3 __word 15 8 of
| ww_a2f4 { __DEFAULT ->
|              GHC.Word.W16# (GHC.Prim.narrow16Word#
| (GHC.Prim.plusWord# ww1_a2eo ww_a2f4))
|              }
|              }
|
| ---
| Swap1.swap16 :: forall a_a1aQ. (Data.Bits.Bits a_a1aQ) => a_a1aQ ->
| a_a1aQ
| [GlobalId]
| [Arity 1
| NoCafRefs
| Str: DmdType L]
| Swap1.swap16 = \ (@ a_a1dB) ($dBits_a1ip :: {Data.Bits.Bits a_a1dB})
->
|                   let {
|                     $dNum_s1BK :: {GHC.Num.Num a_a1dB}
|                     [Str: DmdType]
|                     $dNum_s1BK = case {GHC.Num.Num a_a1dB} $dBits_a1ip
|                                  of tpl_B1 { Data.Bits.:DBits tpl1_B2
| tpl2_B3 tpl3_B4 tpl4_B5 tpl5_B6 tpl6_B7 tpl7_B8 tpl8_B9 tpl9_Ba
| tpl10_Bb tpl11_Bc tpl12_Bd tpl13_Be tpl14_Bf tpl15_Bg tpl16_Bh
| tpl17_Bi tpl18_Bj ->
|                                  tpl1_B2
|                                  } } in
|                   let {
|                     lit_s1BM :: a_a1dB
|                     [Str: DmdType]
|                     lit_s1BM = case a_a1dB $dNum_s1BK
|                                of tpl_B1 { GHC.Num.:DNum tpl1_B2
| tpl2_B3 tpl3_B4 tpl4_B5 tpl5_B6 tpl6_B7 tpl7_B8 tpl8_B9 tpl9_Ba ->
|                                tpl9_Ba Swap1.lvl1
|                                }
|                   } in
|                     \ (v_a1aW :: a_a1dB) ->
|                       tpl3_B4
|                         (case a_a1dB $dBits_a1ip
|                          of tpl10_Xy { Data.Bits.:DBits tpl11_XB
| tpl12_XD tpl13_XF tpl14_XH tpl15_XJ tpl16_XL tpl17_XN tpl18_XP
| tpl19_XR tpl20_Bb tpl21_Bc tpl22_Bd tpl23_Be tpl24_Bf tpl25_Bg
| tpl26_Bh tpl27_Bi tpl28_Bj ->
|                          tpl26_Bh v_a1aW Swap1.lvl
|                          })
|                         (case a_a1dB $dBits_a1ip
|                          of tpl10_Xy { Data.Bits.:DBits tpl11_XB
| tpl12_XD tpl13_XF tpl14_XH tpl15_XJ tpl16_XL tpl17_XN tpl18_XP
| tpl19_XR tpl20_Bb tpl21_Bc tpl22_Bd tpl23_Be tpl24_Bf tpl25_Bg
| tpl26_Bh tpl27_Bi tpl28_Bj ->
|                          tpl25_Bg (tpl12_XD v_a1aW lit_s1BM) Swap1.lvl
|                          })
|
| --
| http://wagerlabs.com/
|
|
|
|

_______________________________________________
Haskell-Cafe mailing list
[hidden email]
http://www.haskell.org/mailman/listinfo/haskell-cafe