cmd/compile, cmd/internal: fine-grained fiddling with loop alignment

This appears to be useful only on amd64; it was specifically
benchmarked on Apple Silicon and produced no benefit there.
This CL adds the assembly instruction `PCALIGNMAX align,amount`,
which aligns to `align` if that can be achieved with `amount`
or fewer bytes of padding. (An `amount` of 0 means padding is
never emitted, but the enclosing function is still aligned.)

Specifically, padding is emitted when low-order-address-bits +
amount is greater than or equal to align; thus, `PCALIGNMAX 64,63`
is the same as `PCALIGN 64`, and `PCALIGNMAX 64,0` never emits
any alignment padding but still causes the function itself
to be aligned to (at least) 64 bytes.
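
For illustration, a sketch of the padding rule (this mirrors the
semantics of the new obj.AlignmentPaddingLength; the standalone
helper below is illustrative only, not part of this CL):

	// pcalignmaxPad returns the number of padding bytes that
	// PCALIGNMAX align,amount emits at offset pc, assuming align
	// is a power of two and 0 <= amount < align.
	func pcalignmaxPad(pc, align, amount int64) int64 {
		lob := pc & (align - 1) // low-order address bits; 0 = already aligned
		if lob+amount >= align {
			return align - lob // alignment reachable within amount bytes of padding
		}
		return 0
	}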

Change-Id: Id51a056f1672f8095e8f755e01f72836c9686aa3
Reviewed-on: https://go-review.googlesource.com/c/go/+/577935
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
diff --git a/src/cmd/compile/internal/base/debug.go b/src/cmd/compile/internal/base/debug.go
index 672e390..c1b62f2 100644
--- a/src/cmd/compile/internal/base/debug.go
+++ b/src/cmd/compile/internal/base/debug.go
@@ -16,6 +16,7 @@
 // The -d option takes a comma-separated list of settings.
 // Each setting is name=value; for ints, name is short for name=1.
 type DebugFlags struct {
+	AlignHot              int    `help:"enable hot block alignment (currently requires -pgo)" concurrent:"ok"`
 	Append                int    `help:"print information about append compilation"`
 	Checkptr              int    `help:"instrument unsafe pointer conversions\n0: instrumentation disabled\n1: conversions involving unsafe.Pointer are instrumented\n2: conversions to unsafe.Pointer force heap allocation" concurrent:"ok"`
 	Closure               int    `help:"print information about closure compilation"`
diff --git a/src/cmd/compile/internal/base/flag.go b/src/cmd/compile/internal/base/flag.go
index 0d3c7c2..f514ce1 100644
--- a/src/cmd/compile/internal/base/flag.go
+++ b/src/cmd/compile/internal/base/flag.go
@@ -178,6 +178,7 @@
 
 	Debug.ConcurrentOk = true
 	Debug.MaxShapeLen = 500
+	Debug.AlignHot = 1
 	Debug.InlFuncsWithClosures = 1
 	Debug.InlStaticInit = 1
 	Debug.PGOInline = 1
diff --git a/src/cmd/compile/internal/gc/compile.go b/src/cmd/compile/internal/gc/compile.go
index 0f57f8c..159fd29 100644
--- a/src/cmd/compile/internal/gc/compile.go
+++ b/src/cmd/compile/internal/gc/compile.go
@@ -14,6 +14,7 @@
 	"cmd/compile/internal/ir"
 	"cmd/compile/internal/liveness"
 	"cmd/compile/internal/objw"
+	"cmd/compile/internal/pgoir"
 	"cmd/compile/internal/ssagen"
 	"cmd/compile/internal/staticinit"
 	"cmd/compile/internal/types"
@@ -112,7 +113,7 @@
 // compileFunctions compiles all functions in compilequeue.
 // It fans out nBackendWorkers to do the work
 // and waits for them to complete.
-func compileFunctions() {
+func compileFunctions(profile *pgoir.Profile) {
 	if race.Enabled {
 		// Randomize compilation order to try to shake out races.
 		tmp := make([]*ir.Func, len(compilequeue))
@@ -179,7 +180,7 @@
 		for _, fn := range fns {
 			fn := fn
 			queue(func(worker int) {
-				ssagen.Compile(fn, worker)
+				ssagen.Compile(fn, worker, profile)
 				compile(fn.Closures)
 				wg.Done()
 			})
diff --git a/src/cmd/compile/internal/gc/main.go b/src/cmd/compile/internal/gc/main.go
index 7ab64f4..41f5e43 100644
--- a/src/cmd/compile/internal/gc/main.go
+++ b/src/cmd/compile/internal/gc/main.go
@@ -303,7 +303,7 @@
 		// as late as possible to maximize how much work we can batch and
 		// process concurrently.
 		if len(compilequeue) != 0 {
-			compileFunctions()
+			compileFunctions(profile)
 			continue
 		}
 
diff --git a/src/cmd/compile/internal/inline/inl.go b/src/cmd/compile/internal/inline/inl.go
index 931f795..ed3d3d4 100644
--- a/src/cmd/compile/internal/inline/inl.go
+++ b/src/cmd/compile/internal/inline/inl.go
@@ -61,6 +61,9 @@
 	// TODO(prattmic): Make this non-global.
 	candHotCalleeMap = make(map[*pgoir.IRNode]struct{})
 
+	// Set of functions that contain hot call sites.
+	hasHotCall = make(map[*ir.Func]struct{})
+
 	// List of all hot call sites. CallSiteInfo.Callee is always nil.
 	// TODO(prattmic): Make this non-global.
 	candHotEdgeMap = make(map[pgoir.CallSiteInfo]struct{})
@@ -78,6 +81,22 @@
 	inlineHotMaxBudget int32 = 2000
 )
 
+// IsPgoHotFunc reports whether fn is a hot callee according to profile,
+// that is, whether it appears in candHotCalleeMap.
+func IsPgoHotFunc(fn *ir.Func, profile *pgoir.Profile) bool {
+	if profile == nil {
+		return false
+	}
+	if n, ok := profile.WeightedCG.IRNodes[ir.LinkFuncName(fn)]; ok {
+		_, ok := candHotCalleeMap[n]
+		return ok
+	}
+	return false
+}
+
+// HasPgoHotInline reports whether a call site that PGO considers hot
+// has been inlined into fn.
+func HasPgoHotInline(fn *ir.Func) bool {
+	_, has := hasHotCall[fn]
+	return has
+}
+
 // PGOInlinePrologue records the hot callsites from ir-graph.
 func PGOInlinePrologue(p *pgoir.Profile) {
 	if base.Debug.PGOInlineCDFThreshold != "" {
@@ -228,14 +247,10 @@
 func inlineBudget(fn *ir.Func, profile *pgoir.Profile, relaxed bool, verbose bool) int32 {
 	// Update the budget for profile-guided inlining.
 	budget := int32(inlineMaxBudget)
-	if profile != nil {
-		if n, ok := profile.WeightedCG.IRNodes[ir.LinkFuncName(fn)]; ok {
-			if _, ok := candHotCalleeMap[n]; ok {
-				budget = inlineHotMaxBudget
-				if verbose {
-					fmt.Printf("hot-node enabled increased budget=%v for func=%v\n", budget, ir.PkgFuncName(fn))
-				}
-			}
+	if IsPgoHotFunc(fn, profile) {
+		budget = inlineHotMaxBudget
+		if verbose {
+			fmt.Printf("hot-node enabled increased budget=%v for func=%v\n", budget, ir.PkgFuncName(fn))
 		}
 	}
 	if relaxed {
@@ -580,7 +595,7 @@
 			// Check whether we'd actually inline this call. Set
 			// log == false since we aren't actually doing inlining
 			// yet.
-			if ok, _ := canInlineCallExpr(v.curFunc, n, callee, v.isBigFunc, false); ok {
+			if ok, _, _ := canInlineCallExpr(v.curFunc, n, callee, v.isBigFunc, false); ok {
 				// mkinlcall would inline this call [1], so use
 				// the cost of the inline body as the cost of
 				// the call, as that is what will actually
@@ -873,10 +888,11 @@
 // inlineCostOK returns true if call n from caller to callee is cheap enough to
 // inline. bigCaller indicates that caller is a big function.
 //
-// In addition to the "cost OK" boolean, it also returns the "max
-// cost" limit used to make the decision (which may differ depending
-// on func size), and the score assigned to this specific callsite.
-func inlineCostOK(n *ir.CallExpr, caller, callee *ir.Func, bigCaller bool) (bool, int32, int32) {
+// In addition to the "cost OK" boolean, it also returns
+//   - the "max cost" limit used to make the decision (which may differ depending on func size)
+//   - the score assigned to this specific callsite
+//   - whether the inlined function is "hot" according to PGO.
+func inlineCostOK(n *ir.CallExpr, caller, callee *ir.Func, bigCaller bool) (bool, int32, int32, bool) {
 	maxCost := int32(inlineMaxBudget)
 	if bigCaller {
 		// We use this to restrict inlining into very big functions.
@@ -892,19 +908,21 @@
 		}
 	}
 
+	lineOffset := pgoir.NodeLineOffset(n, caller)
+	csi := pgoir.CallSiteInfo{LineOffset: lineOffset, Caller: caller}
+	_, hot := candHotEdgeMap[csi]
+
 	if metric <= maxCost {
 		// Simple case. Function is already cheap enough.
-		return true, 0, metric
+		return true, 0, metric, hot
 	}
 
 	// We'll also allow inlining of hot functions below inlineHotMaxBudget,
 	// but only in small functions.
 
-	lineOffset := pgoir.NodeLineOffset(n, caller)
-	csi := pgoir.CallSiteInfo{LineOffset: lineOffset, Caller: caller}
-	if _, ok := candHotEdgeMap[csi]; !ok {
+	if !hot {
 		// Cold
-		return false, maxCost, metric
+		return false, maxCost, metric, false
 	}
 
 	// Hot
@@ -913,49 +931,50 @@
 		if base.Debug.PGODebug > 0 {
 			fmt.Printf("hot-big check disallows inlining for call %s (cost %d) at %v in big function %s\n", ir.PkgFuncName(callee), callee.Inl.Cost, ir.Line(n), ir.PkgFuncName(caller))
 		}
-		return false, maxCost, metric
+		return false, maxCost, metric, false
 	}
 
 	if metric > inlineHotMaxBudget {
-		return false, inlineHotMaxBudget, metric
+		return false, inlineHotMaxBudget, metric, false
 	}
 
 	if !base.PGOHash.MatchPosWithInfo(n.Pos(), "inline", nil) {
 		// De-selected by PGO Hash.
-		return false, maxCost, metric
+		return false, maxCost, metric, false
 	}
 
 	if base.Debug.PGODebug > 0 {
 		fmt.Printf("hot-budget check allows inlining for call %s (cost %d) at %v in function %s\n", ir.PkgFuncName(callee), callee.Inl.Cost, ir.Line(n), ir.PkgFuncName(caller))
 	}
 
-	return true, 0, metric
+	return true, 0, metric, hot
 }
 
 // canInlineCallExpr returns true if the call n from caller to callee
-// can be inlined, plus the score computed for the call expr in
-// question. bigCaller indicates that caller is a big function. log
+// can be inlined, plus the score computed for the call expr in question,
+// and whether the callee is hot according to PGO.
+// bigCaller indicates that caller is a big function. log
 // indicates that the 'cannot inline' reason should be logged.
 //
 // Preconditions: CanInline(callee) has already been called.
-func canInlineCallExpr(callerfn *ir.Func, n *ir.CallExpr, callee *ir.Func, bigCaller bool, log bool) (bool, int32) {
+func canInlineCallExpr(callerfn *ir.Func, n *ir.CallExpr, callee *ir.Func, bigCaller bool, log bool) (bool, int32, bool) {
 	if callee.Inl == nil {
 		// callee is never inlinable.
 		if log && logopt.Enabled() {
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(callerfn),
 				fmt.Sprintf("%s cannot be inlined", ir.PkgFuncName(callee)))
 		}
-		return false, 0
+		return false, 0, false
 	}
 
-	ok, maxCost, callSiteScore := inlineCostOK(n, callerfn, callee, bigCaller)
+	ok, maxCost, callSiteScore, hot := inlineCostOK(n, callerfn, callee, bigCaller)
 	if !ok {
 		// callee cost too high for this call site.
 		if log && logopt.Enabled() {
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(callerfn),
 				fmt.Sprintf("cost %d of %s exceeds max caller cost %d", callee.Inl.Cost, ir.PkgFuncName(callee), maxCost))
 		}
-		return false, 0
+		return false, 0, false
 	}
 
 	if callee == callerfn {
@@ -963,7 +982,7 @@
 		if log && logopt.Enabled() {
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", fmt.Sprintf("recursive call to %s", ir.FuncName(callerfn)))
 		}
-		return false, 0
+		return false, 0, false
 	}
 
 	if base.Flag.Cfg.Instrumenting && types.IsNoInstrumentPkg(callee.Sym().Pkg) {
@@ -977,7 +996,7 @@
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(callerfn),
 				fmt.Sprintf("call to runtime function %s in instrumented build", ir.PkgFuncName(callee)))
 		}
-		return false, 0
+		return false, 0, false
 	}
 
 	if base.Flag.Race && types.IsNoRacePkg(callee.Sym().Pkg) {
@@ -985,7 +1004,7 @@
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(callerfn),
 				fmt.Sprintf(`call to into "no-race" package function %s in race build`, ir.PkgFuncName(callee)))
 		}
-		return false, 0
+		return false, 0, false
 	}
 
 	// Check if we've already inlined this function at this particular
@@ -1008,11 +1027,11 @@
 						fmt.Sprintf("repeated recursive cycle to %s", ir.PkgFuncName(callee)))
 				}
 			}
-			return false, 0
+			return false, 0, false
 		}
 	}
 
-	return true, callSiteScore
+	return true, callSiteScore, hot
 }
 
 // mkinlcall returns an OINLCALL node that can replace OCALLFUNC n, or
@@ -1023,10 +1042,13 @@
 //
 //	n.Left = mkinlcall(n.Left, fn, isddd)
 func mkinlcall(callerfn *ir.Func, n *ir.CallExpr, fn *ir.Func, bigCaller bool) *ir.InlinedCallExpr {
-	ok, score := canInlineCallExpr(callerfn, n, fn, bigCaller, true)
+	ok, score, hot := canInlineCallExpr(callerfn, n, fn, bigCaller, true)
 	if !ok {
 		return nil
 	}
+	if hot {
+		hasHotCall[callerfn] = struct{}{}
+	}
 	typecheck.AssertFixedCall(n)
 
 	parent := base.Ctxt.PosTable.Pos(n.Pos()).Base().InliningIndex()
diff --git a/src/cmd/compile/internal/ssa/block.go b/src/cmd/compile/internal/ssa/block.go
index 26af10b..02733ea 100644
--- a/src/cmd/compile/internal/ssa/block.go
+++ b/src/cmd/compile/internal/ssa/block.go
@@ -31,6 +31,9 @@
 	// After flagalloc, records whether flags are live at the end of the block.
 	FlagsLiveAtEnd bool
 
+	// A block that would be good to align (according to the optimizer's guesses)
+	Hotness Hotness
+
 	// Subsequent blocks, if any. The number and order depend on the block kind.
 	Succs []Edge
 
@@ -112,7 +115,7 @@
 }
 
 // BlockKind is the kind of SSA block.
-type BlockKind int16
+type BlockKind uint8
 
 // short form print
 func (b *Block) String() string {
@@ -426,3 +429,17 @@
 	BranchUnknown  = BranchPrediction(0)
 	BranchLikely   = BranchPrediction(+1)
 )
+
+type Hotness int8 // Could use negative numbers for specifically non-hot blocks, but don't, yet.
+const (
+	// These values are arranged in what seems to be order of increasing alignment importance.
+	// Currently only a few are relevant.  Implicitly, they are all in a loop.
+	HotNotFlowIn Hotness = 1 << iota // This block is only reached by branches
+	HotInitial                       // In the block order, the first one for a given loop.  Not necessarily topological header.
+	HotPgo                           // By PGO-based heuristics, this block occurs in a hot loop
+
+	HotNot                 = 0
+	HotInitialNotFlowIn    = HotInitial | HotNotFlowIn          // typically the first block of a rotated loop; the loop is entered with a branch (not to this block). No PGO.
+	HotPgoInitial          = HotPgo | HotInitial                // special case; a single-block loop whose initial block is the header block and has a flow-in entry, but PGO says it is hot.
+	HotPgoInitialNotFlowIn = HotPgo | HotInitial | HotNotFlowIn // PGO says it is hot, and the loop is rotated so flow enters the loop with a branch.
+)
diff --git a/src/cmd/compile/internal/ssa/func.go b/src/cmd/compile/internal/ssa/func.go
index 38b459a..2bb34a4 100644
--- a/src/cmd/compile/internal/ssa/func.go
+++ b/src/cmd/compile/internal/ssa/func.go
@@ -45,6 +45,7 @@
 	laidout     bool  // Blocks are ordered
 	NoSplit     bool  // true if function is marked as nosplit.  Used by schedule check pass.
 	dumpFileSeq uint8 // the sequence numbers of dump file. (%s_%02d__%s.dump", funcname, dumpFileSeq, phaseName)
+	IsPgoHot    bool
 
 	// when register allocation is done, maps value ids to locations
 	RegAlloc []Location
diff --git a/src/cmd/compile/internal/ssa/looprotate.go b/src/cmd/compile/internal/ssa/looprotate.go
index 844a8f7..f321255 100644
--- a/src/cmd/compile/internal/ssa/looprotate.go
+++ b/src/cmd/compile/internal/ssa/looprotate.go
@@ -56,9 +56,20 @@
 			}
 			p = e.b
 		}
-		if p == nil || p == b {
+		if p == nil {
 			continue
 		}
+		p.Hotness |= HotInitial
+		if f.IsPgoHot {
+			p.Hotness |= HotPgo
+		}
+		// blocks will be arranged so that p is ordered first, if it isn't already.
+		if p == b { // p is header, already first (and also, only block in the loop)
+			continue
+		}
+		p.Hotness |= HotNotFlowIn
+
+		// the loop header b follows p
 		after[p.ID] = []*Block{b}
 		for {
 			nextIdx := idToIdx[b.ID] + 1
diff --git a/src/cmd/compile/internal/ssagen/pgen.go b/src/cmd/compile/internal/ssagen/pgen.go
index 5b57c8a..e666c22 100644
--- a/src/cmd/compile/internal/ssagen/pgen.go
+++ b/src/cmd/compile/internal/ssagen/pgen.go
@@ -12,9 +12,11 @@
 	"sync"
 
 	"cmd/compile/internal/base"
+	"cmd/compile/internal/inline"
 	"cmd/compile/internal/ir"
 	"cmd/compile/internal/liveness"
 	"cmd/compile/internal/objw"
+	"cmd/compile/internal/pgoir"
 	"cmd/compile/internal/ssa"
 	"cmd/compile/internal/types"
 	"cmd/internal/obj"
@@ -296,8 +298,8 @@
 // uses it to generate a plist,
 // and flushes that plist to machine code.
 // worker indicates which of the backend workers is doing the processing.
-func Compile(fn *ir.Func, worker int) {
-	f := buildssa(fn, worker)
+func Compile(fn *ir.Func, worker int, profile *pgoir.Profile) {
+	f := buildssa(fn, worker, inline.IsPgoHotFunc(fn, profile) || inline.HasPgoHotInline(fn))
 	// Note: check arg size to fix issue 25507.
 	if f.Frontend().(*ssafn).stksize >= maxStackSize || f.OwnAux.ArgWidth() >= maxStackSize {
 		largeStackFramesMu.Lock()
diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go
index 9e384fe..9b23935 100644
--- a/src/cmd/compile/internal/ssagen/ssa.go
+++ b/src/cmd/compile/internal/ssagen/ssa.go
@@ -291,7 +291,7 @@
 
 // buildssa builds an SSA function for fn.
 // worker indicates which of the backend workers is doing the processing.
-func buildssa(fn *ir.Func, worker int) *ssa.Func {
+func buildssa(fn *ir.Func, worker int, isPgoHot bool) *ssa.Func {
 	name := ir.FuncName(fn)
 
 	abiSelf := abiForFunc(fn, ssaConfig.ABI0, ssaConfig.ABI1)
@@ -373,6 +373,7 @@
 	// Allocate starting block
 	s.f.Entry = s.f.NewBlock(ssa.BlockPlain)
 	s.f.Entry.Pos = fn.Pos()
+	s.f.IsPgoHot = isPgoHot
 
 	if printssa {
 		ssaDF := ssaDumpFile
@@ -7302,12 +7303,47 @@
 
 	var argLiveIdx int = -1 // argument liveness info index
 
+	// These control cache line alignment; if the required portion of
+	// a cache line is not available, then pad to obtain cache line
+	// alignment.  Not implemented on all architectures, and may not be
+	// useful on all architectures.
+	var hotAlign, hotRequire int64
+
+	if base.Debug.AlignHot > 0 {
+		switch base.Ctxt.Arch.Name {
+		// enable this on a case-by-case basis, with benchmarking.
+		// currently shown:
+		//   good for amd64
+		//   not helpful for Apple Silicon
+		//
+		case "amd64", "386":
+			// Align to 64 if 31 or fewer bytes remain in a cache line;
+			// this benchmarks a little better than always aligning, and
+			// also adds slightly less to the (PGO-compiled) binary size.
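+			// For example, at pc&63 == 40 (24 bytes left in the cache line)
+			// this pads with 24 bytes; at pc&63 == 16 (48 bytes left) it
+			// emits no padding.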
+			hotAlign = 64
+			hotRequire = 31
+		}
+	}
+
 	// Emit basic blocks
 	for i, b := range f.Blocks {
-		s.bstart[b.ID] = s.pp.Next
+
 		s.lineRunStart = nil
 		s.SetPos(s.pp.Pos.WithNotStmt()) // It needs a non-empty Pos, but cannot be a statement boundary (yet).
 
+		if hotAlign > 0 && b.Hotness&ssa.HotPgoInitial == ssa.HotPgoInitial {
+			// So far this has only been shown profitable for PGO-hot loop headers.
+			// The Hotness value allows distinctions between initial blocks that are "hot" or not, and "flow-in" or not.
+			// Currently only the initial blocks of loops are tagged in this way;
+			// there are no blocks tagged "pgo-hot" that are not also tagged "initial".
+			// TODO more heuristics, more architectures.
+			p := s.pp.Prog(obj.APCALIGNMAX)
+			p.From.SetConst(hotAlign)
+			p.To.SetConst(hotRequire)
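+			// With the amd64/386 values above, this asks to align to 64 bytes
+			// using at most 31 bytes of padding.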
+		}
+
+		s.bstart[b.ID] = s.pp.Next
+
 		if idx, ok := argLiveBlockMap[b.ID]; ok && idx != argLiveIdx {
 			argLiveIdx = idx
 			p := s.pp.Prog(obj.APCDATA)
@@ -7466,7 +7502,8 @@
 		// going to emit anyway, and use those instructions instead of the
 		// inline marks.
 		for p := s.pp.Text; p != nil; p = p.Link {
-			if p.As == obj.ANOP || p.As == obj.AFUNCDATA || p.As == obj.APCDATA || p.As == obj.ATEXT || p.As == obj.APCALIGN || Arch.LinkArch.Family == sys.Wasm {
+			if p.As == obj.ANOP || p.As == obj.AFUNCDATA || p.As == obj.APCDATA || p.As == obj.ATEXT ||
+				p.As == obj.APCALIGN || p.As == obj.APCALIGNMAX || Arch.LinkArch.Family == sys.Wasm {
 				// Don't use 0-sized instructions as inline marks, because we need
 				// to identify inline mark instructions by pc offset.
 				// (Some of these instructions are sometimes zero-sized, sometimes not.
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go
index 03f0fb0..c6601cb 100644
--- a/src/cmd/internal/obj/arm64/asm7.go
+++ b/src/cmd/internal/obj/arm64/asm7.go
@@ -889,9 +889,10 @@
 	{obj.ANOP, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0}, // nop variants, see #40689
 	{obj.ANOP, C_ZREG, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0},
 	{obj.ANOP, C_VREG, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0},
-	{obj.ADUFFZERO, C_NONE, C_NONE, C_NONE, C_SBRA, C_NONE, 5, 4, 0, 0, 0}, // same as AB/ABL
-	{obj.ADUFFCOPY, C_NONE, C_NONE, C_NONE, C_SBRA, C_NONE, 5, 4, 0, 0, 0}, // same as AB/ABL
-	{obj.APCALIGN, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0},  // align code
+	{obj.ADUFFZERO, C_NONE, C_NONE, C_NONE, C_SBRA, C_NONE, 5, 4, 0, 0, 0},   // same as AB/ABL
+	{obj.ADUFFCOPY, C_NONE, C_NONE, C_NONE, C_SBRA, C_NONE, 5, 4, 0, 0, 0},   // same as AB/ABL
+	{obj.APCALIGN, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0},    // align code
+	{obj.APCALIGNMAX, C_LCON, C_NONE, C_NONE, C_LCON, C_NONE, 0, 0, 0, 0, 0}, // align code, conditional
 }
 
 // Valid pstate field values, and value to use in instruction.
@@ -1109,13 +1110,8 @@
 		m = o.size(c.ctxt, p)
 		if m == 0 {
 			switch p.As {
-			case obj.APCALIGN:
-				alignedValue := p.From.Offset
-				m = pcAlignPadLength(ctxt, pc, alignedValue)
-				// Update the current text symbol alignment value.
-				if int32(alignedValue) > cursym.Func().Align {
-					cursym.Func().Align = int32(alignedValue)
-				}
+			case obj.APCALIGN, obj.APCALIGNMAX:
+				m = obj.AlignmentPadding(int32(pc), p, ctxt, cursym)
 				break
 			case obj.ANOP, obj.AFUNCDATA, obj.APCDATA:
 				continue
@@ -1181,9 +1177,8 @@
 
 			if m == 0 {
 				switch p.As {
-				case obj.APCALIGN:
-					alignedValue := p.From.Offset
-					m = pcAlignPadLength(ctxt, pc, alignedValue)
+				case obj.APCALIGN, obj.APCALIGNMAX:
+					m = obj.AlignmentPaddingLength(int32(pc), p, ctxt)
 					break
 				case obj.ANOP, obj.AFUNCDATA, obj.APCDATA:
 					continue
@@ -1214,9 +1209,8 @@
 		if sz > 4*len(out) {
 			log.Fatalf("out array in span7 is too small, need at least %d for %v", sz/4, p)
 		}
-		if p.As == obj.APCALIGN {
-			alignedValue := p.From.Offset
-			v := pcAlignPadLength(c.ctxt, p.Pc, alignedValue)
+		if p.As == obj.APCALIGN || p.As == obj.APCALIGNMAX {
+			v := obj.AlignmentPaddingLength(int32(p.Pc), p, c.ctxt)
 			for i = 0; i < int(v/4); i++ {
 				// emit ANOOP instruction by the padding size
 				c.ctxt.Arch.ByteOrder.PutUint32(bp, OP_NOOP)
@@ -3316,6 +3310,7 @@
 			obj.AUNDEF,
 			obj.AFUNCDATA,
 			obj.APCALIGN,
+			obj.APCALIGNMAX,
 			obj.APCDATA,
 			obj.ADUFFZERO,
 			obj.ADUFFCOPY:
diff --git a/src/cmd/internal/obj/link.go b/src/cmd/internal/obj/link.go
index 3ebaa2a..38869f0 100644
--- a/src/cmd/internal/obj/link.go
+++ b/src/cmd/internal/obj/link.go
@@ -416,6 +416,7 @@
 	AJMP
 	ANOP
 	APCALIGN
+	APCALIGNMAX // currently x86, amd64 and arm64
 	APCDATA
 	ARET
 	AGETCALLERPC
diff --git a/src/cmd/internal/obj/util.go b/src/cmd/internal/obj/util.go
index 3a071c2..0512d78 100644
--- a/src/cmd/internal/obj/util.go
+++ b/src/cmd/internal/obj/util.go
@@ -6,6 +6,7 @@
 
 import (
 	"bytes"
+	"cmd/internal/objabi"
 	"fmt"
 	"internal/abi"
 	"internal/buildcfg"
@@ -642,6 +643,7 @@
 	"JMP",
 	"NOP",
 	"PCALIGN",
+	"PCALIGNMAX",
 	"PCDATA",
 	"RET",
 	"GETCALLERPC",
@@ -667,3 +669,62 @@
 	}
 	return fmt.Sprintf("<%s>", a.Sym.ABI())
 }
+
+// AlignmentPadding returns the number of bytes of padding needed to align
+// code as requested, and updates the minimum alignment for the function.
+// Alignment is restricted to powers of 2 between 8 and 2048 inclusive.
+//
+// pc: current offset in function, in bytes
+// p:  a PCALIGN or PCALIGNMAX prog
+// ctxt: the context, for current function
+// cursym: current function being assembled
+func AlignmentPadding(pc int32, p *Prog, ctxt *Link, cursym *LSym) int {
+	v := AlignmentPaddingLength(pc, p, ctxt)
+	requireAlignment(p.From.Offset, ctxt, cursym)
+	return v
+}
+
+// AlignmentPaddingLength returns the number of bytes of padding needed to
+// align code as requested.
+// Alignment is restricted to powers of 2 between 8 and 2048 inclusive.
+// This only computes the length; it does not update the current function's
+// own required alignment (cursym is not a parameter here).
+//
+// pc: current offset in function, in bytes
+// p:  a PCALIGN or PCALIGNMAX prog
+// ctxt: the context, for current function
+func AlignmentPaddingLength(pc int32, p *Prog, ctxt *Link) int {
+	a := p.From.Offset
+	if !((a&(a-1) == 0) && 8 <= a && a <= 2048) {
+		ctxt.Diag("alignment value of an instruction must be a power of two and in the range [8, 2048], got %d\n", a)
+		return 0
+	}
+	pc64 := int64(pc)
+	lob := pc64 & (a - 1) // Low Order Bits -- if not zero, then not aligned
+	if p.As == APCALIGN {
+		if lob != 0 {
+			return int(a - lob)
+		}
+		return 0
+	}
+	// emit as many as s bytes of padding to obtain alignment
+	s := p.To.Offset
+	if s < 0 || s >= a {
+		ctxt.Diag("PCALIGNMAX 'amount' %d must be non-negative and smaller than the alignment %d\n", s, a)
+		return 0
+	}
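+	// Pad only if alignment can be reached with at most s bytes of
+	// padding, i.e., if lob+s >= a; otherwise emit nothing.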
+	if s >= a-lob {
+		return int(a - lob)
+	}
+	return 0
+}
+
+// requireAlignment ensures that the function is aligned enough to support
+// the required code alignment.
+func requireAlignment(a int64, ctxt *Link, cursym *LSym) {
+	// TODO remove explicit knowledge about AIX.
+	if ctxt.Headtype != objabi.Haix && cursym.Func().Align < int32(a) {
+		cursym.Func().Align = int32(a)
+	}
+}
diff --git a/src/cmd/internal/obj/x86/asm6.go b/src/cmd/internal/obj/x86/asm6.go
index bdd75b4..dc38069 100644
--- a/src/cmd/internal/obj/x86/asm6.go
+++ b/src/cmd/internal/obj/x86/asm6.go
@@ -2036,29 +2036,21 @@
 	n int32     // Size of the pad
 }
 
-// Padding bytes to add to align code as requested.
-// Alignment is restricted to powers of 2 between 8 and 2048 inclusive.
+// requireAlignment ensures that the function alignment is at
+// least as high as a, which should be a power of two
+// and between 8 and 2048, inclusive.
 //
-// pc: current offset in function, in bytes
-// a: requested alignment, in bytes
-// cursym: current function being assembled
-// returns number of bytes of padding needed
-func addpad(pc, a int64, ctxt *obj.Link, cursym *obj.LSym) int {
+// The boolean result indicates whether the alignment meets those constraints.
+func requireAlignment(a int64, ctxt *obj.Link, cursym *obj.LSym) bool {
 	if !((a&(a-1) == 0) && 8 <= a && a <= 2048) {
 		ctxt.Diag("alignment value of an instruction must be a power of two and in the range [8, 2048], got %d\n", a)
-		return 0
+		return false
 	}
-
 	// By default function alignment is 32 bytes for amd64
 	if cursym.Func().Align < int32(a) {
 		cursym.Func().Align = int32(a)
 	}
-
-	if pc&(a-1) != 0 {
-		return int(a - (pc & (a - 1)))
-	}
-
-	return 0
+	return true
 }
 
 func span6(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
@@ -2144,17 +2136,17 @@
 			c0 := c
 			c = pjc.padJump(ctxt, s, p, c)
 
-			if p.As == obj.APCALIGN {
-				aln := p.From.Offset
-				v := addpad(int64(c), aln, ctxt, s)
+			if p.As == obj.APCALIGN || p.As == obj.APCALIGNMAX {
+				v := obj.AlignmentPadding(c, p, ctxt, s)
 				if v > 0 {
 					s.Grow(int64(c) + int64(v))
 					fillnop(s.P[c:], int(v))
 				}
-
+				p.Pc = int64(c)
 				c += int32(v)
 				pPrev = p
 				continue
+
 			}
 
 			if maxLoopPad > 0 && p.Back&branchLoopHead != 0 && c&(loopAlign-1) != 0 {