runtime: use QPC for nanotime and time.now on windows/arm

The previous implementation of nanotime and time.now used a time source
that was updated on the system clock tick, which has a maximum
resolution of about 1 ms. On 386 and amd64, this time source maps to
the system performance counter, so it has a much higher resolution.
On ARM, use QueryPerformanceCounter() to get a high-resolution timestamp.

Updates #26148

Change-Id: I1abc99baf927a95b472ac05020a7788626c71d08
Reviewed-on: https://go-review.googlesource.com/c/154758
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go
index 9b34589..20fe01c 100644
--- a/src/runtime/os_windows.go
+++ b/src/runtime/os_windows.go
@@ -198,9 +198,12 @@
 	}
 	_NtWaitForSingleObject = windowsFindfunc(n32, []byte("NtWaitForSingleObject\000"))
 
-	if windowsFindfunc(n32, []byte("wine_get_version\000")) != nil {
-		// running on Wine
-		initWine(k32)
+	underWine := windowsFindfunc(n32, []byte("wine_get_version\000")) != nil
+	if underWine || GOARCH == "arm" {
+		initQPC(k32)
+	}
+	if underWine {
+		initWine()
 	}
 }
 
@@ -357,7 +360,7 @@
 	return
 }
 
-func initWine(k32 uintptr) {
+func initQPC(k32 uintptr) {
 	_GetSystemTimeAsFileTime = windowsFindfunc(k32, []byte("GetSystemTimeAsFileTime\000"))
 	if _GetSystemTimeAsFileTime == nil {
 		throw("could not find GetSystemTimeAsFileTime() syscall")
@@ -394,7 +397,9 @@
 	// We have to do it this way (or similar), since multiplying QPC counter by 100 millions overflows
 	// int64 and resulted time will always be invalid.
 	qpcMultiplier = int64(timediv(1000000000, qpcFrequency, nil))
+}
 
+func initWine() {
 	useQPCTime = 1
 }
 
diff --git a/src/runtime/sys_windows_arm.s b/src/runtime/sys_windows_arm.s
index 60be74b..514dc52 100644
--- a/src/runtime/sys_windows_arm.s
+++ b/src/runtime/sys_windows_arm.s
@@ -487,115 +487,12 @@
 	MOVW	R0, (R0)
 	RET
 
-// See http://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
-// Must read hi1, then lo, then hi2. The snapshot is valid if hi1 == hi2.
-#define _INTERRUPT_TIME 0x7ffe0008
-#define _SYSTEM_TIME 0x7ffe0014
-#define time_lo 0
-#define time_hi1 4
-#define time_hi2 8
-
-TEXT runtime·nanotime(SB),NOSPLIT,$0-8
-	MOVW	$0, R0
-	MOVB	runtime·useQPCTime(SB), R0
-	CMP	$0, R0
-	BNE	useQPC
-	MOVW	$_INTERRUPT_TIME, R3
-loop:
-	MOVW	time_hi1(R3), R1
-	MOVW	time_lo(R3), R0
-	MOVW	time_hi2(R3), R2
-	CMP	R1, R2
-	BNE	loop
-
-	// wintime = R1:R0, multiply by 100
-	MOVW	$100, R2
-	MULLU	R0, R2, (R4, R3)    // R4:R3 = R1:R0 * R2
-	MULA	R1, R2, R4, R4
-
-	// wintime*100 = R4:R3
-	MOVW	R3, ret_lo+0(FP)
-	MOVW	R4, ret_hi+4(FP)
-	RET
-useQPC:
+TEXT runtime·nanotime(SB),NOSPLIT|NOFRAME,$0-8
 	B	runtime·nanotimeQPC(SB)		// tail call
 	RET
 
-TEXT time·now(SB),NOSPLIT,$0-20
-	MOVW    $0, R0
-	MOVB    runtime·useQPCTime(SB), R0
-	CMP	$0, R0
-	BNE	useQPC
-	MOVW	$_INTERRUPT_TIME, R3
-loop:
-	MOVW	time_hi1(R3), R1
-	MOVW	time_lo(R3), R0
-	MOVW	time_hi2(R3), R2
-	CMP	R1, R2
-	BNE	loop
-
-	// wintime = R1:R0, multiply by 100
-	MOVW	$100, R2
-	MULLU	R0, R2, (R4, R3)    // R4:R3 = R1:R0 * R2
-	MULA	R1, R2, R4, R4
-
-	// wintime*100 = R4:R3
-	MOVW	R3, mono+12(FP)
-	MOVW	R4, mono+16(FP)
-
-	MOVW	$_SYSTEM_TIME, R3
-wall:
-	MOVW	time_hi1(R3), R1
-	MOVW	time_lo(R3), R0
-	MOVW	time_hi2(R3), R2
-	CMP	R1, R2
-	BNE	wall
-
-	// w = R1:R0 in 100ns untis
-	// convert to Unix epoch (but still 100ns units)
-	#define delta 116444736000000000
-	SUB.S   $(delta & 0xFFFFFFFF), R0
-	SBC     $(delta >> 32), R1
-
-	// Convert to nSec
-	MOVW    $100, R2
-	MULLU   R0, R2, (R4, R3)    // R4:R3 = R1:R0 * R2
-	MULA    R1, R2, R4, R4
-	// w = R2:R1 in nSec
-	MOVW    R3, R1	      // R4:R3 -> R2:R1
-	MOVW    R4, R2
-
-	// multiply nanoseconds by reciprocal of 10**9 (scaled by 2**61)
-	// to get seconds (96 bit scaled result)
-	MOVW	$0x89705f41, R3		// 2**61 * 10**-9
-	MULLU	R1,R3,(R6,R5)		// R7:R6:R5 = R2:R1 * R3
-	MOVW	$0,R7
-	MULALU	R2,R3,(R7,R6)
-
-	// unscale by discarding low 32 bits, shifting the rest by 29
-	MOVW	R6>>29,R6		// R7:R6 = (R7:R6:R5 >> 61)
-	ORR	R7<<3,R6
-	MOVW	R7>>29,R7
-
-	// subtract (10**9 * sec) from nsec to get nanosecond remainder
-	MOVW	$1000000000, R5	// 10**9
-	MULLU	R6,R5,(R9,R8)   // R9:R8 = R7:R6 * R5
-	MULA	R7,R5,R9,R9
-	SUB.S	R8,R1		// R2:R1 -= R9:R8
-	SBC	R9,R2
-
-	// because reciprocal was a truncated repeating fraction, quotient
-	// may be slightly too small -- adjust to make remainder < 10**9
-	CMP	R5,R1	// if remainder > 10**9
-	SUB.HS	R5,R1   //    remainder -= 10**9
-	ADD.HS	$1,R6	//    sec += 1
-
-	MOVW	R6,sec_lo+0(FP)
-	MOVW	R7,sec_hi+4(FP)
-	MOVW	R1,nsec+8(FP)
-	RET
-useQPC:
-	B	runtime·nanotimeQPC(SB)		// tail call
+TEXT time·now(SB),NOSPLIT|NOFRAME,$0-20
+	B	runtime·nowQPC(SB)		// tail call
 	RET
 
 // save_g saves the g register (R10) into thread local memory