draw: add fast paths for RGBA64Image

Go 1.17 added the image.RGBA64Image and draw.RGBA64Image interfaces,
which use color.RGBA64 instead of color.Color in order to avoid heap
allocations on pixel operations. Fast paths were added to image/draw
for drawing with these images, but not to x/image/draw. This CL
adds them for an image.RGBA64Image source drawn onto either an
*image.RGBA or an RGBA64Image destination.

Fixes golang/go#62423

goos: windows
goarch: amd64
pkg: golang.org/x/image/draw
cpu: AMD Ryzen 9 7900X 12-Core Processor
                        │  old.bench   │              new.bench              │
                        │    sec/op    │   sec/op     vs base                │
SimpleScaleCopy-24         7.425µ ± 0%   7.581µ ± 0%   +2.10% (p=0.000 n=20)
SimpleTransformCopy-24     7.473µ ± 1%   7.678µ ± 1%   +2.74% (p=0.000 n=20)
SimpleTransformScale-24    440.7µ ± 0%   442.6µ ± 0%   +0.44% (p=0.012 n=20)
ScaleNNLargeDown-24        155.3µ ± 4%   156.9µ ± 2%        ~ (p=0.495 n=20)
ScaleABLargeDown-24        843.4µ ± 0%   850.7µ ± 1%   +0.87% (p=0.000 n=20)
ScaleBLLargeDown-24        102.7m ± 0%   102.7m ± 1%        ~ (p=0.779 n=20)
ScaleCRLargeDown-24        165.6m ± 0%   168.7m ± 1%   +1.84% (p=0.000 n=20)
ScaleNNDown-24             42.53µ ± 1%   43.00µ ± 1%   +1.10% (p=0.000 n=20)
ScaleABDown-24             149.3µ ± 0%   151.5µ ± 1%   +1.52% (p=0.000 n=20)
ScaleBLDown-24             1.347m ± 1%   1.367m ± 1%   +1.43% (p=0.000 n=20)
ScaleCRDown-24             2.635m ± 1%   2.663m ± 1%   +1.09% (p=0.007 n=20)
ScaleNNUp-24               2.108m ± 0%   2.128m ± 2%   +0.93% (p=0.023 n=20)
ScaleABUp-24               7.493m ± 0%   7.594m ± 1%   +1.35% (p=0.000 n=20)
ScaleBLUp-24               6.105m ± 3%   6.410m ± 3%   +5.01% (p=0.001 n=20)
ScaleCRUp-24              10.297m ± 9%   9.640m ± 3%        ~ (p=0.121 n=20)
ScaleNNSrcRGBA-24          111.3µ ± 1%   108.6µ ± 2%   -2.37% (p=0.000 n=20)
ScaleNNSrcUnif-24          1.410µ ± 1%   1.397µ ± 1%        ~ (p=0.103 n=20)
ScaleNNOverRGBA-24         174.0µ ± 1%   170.5µ ± 2%   -1.98% (p=0.004 n=20)
ScaleNNOverUnif-24         90.17µ ± 1%   90.12µ ± 1%        ~ (p=0.490 n=20)
TformNNSrcRGBA-24          103.4µ ± 1%   105.8µ ± 1%   +2.25% (p=0.000 n=20)
TformNNSrcUnif-24          38.61µ ± 1%   40.80µ ± 1%   +5.69% (p=0.000 n=20)
TformNNOverRGBA-24         150.3µ ± 0%   155.5µ ± 2%   +3.49% (p=0.000 n=20)
TformNNOverUnif-24         35.16µ ± 2%   34.75µ ± 2%   -1.18% (p=0.015 n=20)
ScaleABSrcGray-24          154.5µ ± 2%   151.8µ ± 1%   -1.69% (p=0.006 n=20)
ScaleABSrcNRGBA-24         482.0µ ± 1%   475.9µ ± 1%   -1.26% (p=0.002 n=20)
ScaleABSrcRGBA-24          418.2µ ± 0%   416.0µ ± 1%        ~ (p=0.108 n=20)
ScaleABSrcYCbCr-24         849.4µ ± 0%   845.6µ ± 1%   -0.45% (p=0.015 n=20)
ScaleABSrcRGBA64-24       1616.6µ ± 2%   467.8µ ± 2%  -71.06% (p=0.000 n=20)
ScaleABOverGray-24         152.0µ ± 1%   148.2µ ± 1%   -2.51% (p=0.000 n=20)
ScaleABOverNRGBA-24        511.8µ ± 1%   510.2µ ± 1%        ~ (p=0.640 n=20)
ScaleABOverRGBA-24         480.1µ ± 1%   475.5µ ± 0%   -0.94% (p=0.000 n=20)
ScaleABOverYCbCr-24        861.1µ ± 1%   843.2µ ± 0%   -2.08% (p=0.000 n=20)
ScaleABOverRGBA64-24      1723.2µ ± 2%   538.0µ ± 0%  -68.78% (p=0.000 n=20)
TformABSrcGray-24          148.6µ ± 1%   142.6µ ± 0%   -4.01% (p=0.000 n=20)
TformABSrcNRGBA-24         363.3µ ± 2%   356.2µ ± 0%   -1.95% (p=0.000 n=20)
TformABSrcRGBA-24          301.0µ ± 0%   296.5µ ± 0%   -1.49% (p=0.000 n=20)
TformABSrcYCbCr-24         415.1µ ± 0%   409.1µ ± 0%   -1.45% (p=0.000 n=20)
TformABSrcRGBA64-24       1068.9µ ± 2%   337.7µ ± 1%  -68.41% (p=0.000 n=20)
TformABOverGray-24         146.0µ ± 1%   143.2µ ± 0%   -1.88% (p=0.000 n=20)
TformABOverNRGBA-24        394.6µ ± 1%   389.0µ ± 0%   -1.43% (p=0.000 n=20)
TformABOverRGBA-24         341.0µ ± 1%   338.1µ ± 0%   -0.83% (p=0.001 n=20)
TformABOverYCbCr-24        414.4µ ± 1%   410.6µ ± 1%   -0.91% (p=0.049 n=20)
TformABOverRGBA64-24      1108.7µ ± 2%   389.4µ ± 1%  -64.88% (p=0.000 n=20)
ScaleCRSrcGray-24          4.065m ± 2%   3.979m ± 0%   -2.12% (p=0.000 n=20)
ScaleCRSrcNRGBA-24         13.66m ± 1%   13.41m ± 1%   -1.79% (p=0.007 n=20)
ScaleCRSrcRGBA-24          9.258m ± 3%   9.091m ± 1%   -1.80% (p=0.002 n=20)
ScaleCRSrcYCbCr-24         23.53m ± 0%   22.85m ± 1%   -2.89% (p=0.000 n=20)
ScaleCRSrcRGBA64-24        42.76m ± 2%   13.13m ± 1%  -69.31% (p=0.000 n=20)
ScaleCROverGray-24         4.110m ± 1%   3.973m ± 1%   -3.33% (p=0.000 n=20)
ScaleCROverNRGBA-24        14.74m ± 1%   13.50m ± 0%   -8.41% (p=0.000 n=20)
ScaleCROverRGBA-24         9.504m ± 1%   9.301m ± 1%   -2.14% (p=0.000 n=20)
ScaleCROverYCbCr-24        23.42m ± 1%   22.86m ± 0%   -2.38% (p=0.000 n=20)
ScaleCROverRGBA64-24       43.47m ± 1%   13.07m ± 1%  -69.93% (p=0.000 n=20)
TformCRSrcGray-24          1.253m ± 1%   1.225m ± 0%   -2.24% (p=0.000 n=20)
TformCRSrcNRGBA-24         2.141m ± 2%   2.050m ± 1%   -4.24% (p=0.000 n=20)
TformCRSrcRGBA-24          1.810m ± 1%   1.771m ± 0%   -2.15% (p=0.002 n=20)
TformCRSrcYCbCr-24         2.404m ± 1%   2.403m ± 1%        ~ (p=0.698 n=20)
TformCRSrcRGBA64-24        5.150m ± 1%   2.197m ± 1%  -57.34% (p=0.000 n=20)
TformCROverGray-24         1.251m ± 0%   1.223m ± 0%   -2.23% (p=0.000 n=20)
TformCROverNRGBA-24        2.081m ± 1%   2.037m ± 0%   -2.13% (p=0.000 n=20)
TformCROverRGBA-24         1.809m ± 1%   1.793m ± 2%        ~ (p=0.149 n=20)
TformCROverYCbCr-24        2.444m ± 0%   2.400m ± 1%   -1.82% (p=0.000 n=20)
TformCROverRGBA64-24       5.303m ± 2%   2.221m ± 1%  -58.12% (p=0.000 n=20)
geomean                    930.7µ        804.7µ       -13.54%

                     │     old.bench      │                 new.bench                  │
                     │        B/op        │     B/op       vs base                     │
ScaleNNLargeDown-24         0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleABLargeDown-24         0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleBLLargeDown-24       1.407Mi ±  0%     1.407Mi ±  0%         ~ (p=0.283 n=20)
ScaleCRLargeDown-24       2.010Mi ±  0%     2.345Mi ± 14%   +16.67% (p=0.000 n=20)
ScaleNNDown-24              0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleABDown-24              0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleBLDown-24            1.704Ki ±  1%     1.721Ki ±  3%         ~ (p=0.109 n=20)
ScaleCRDown-24            3.309Ki ±  2%     3.342Ki ±  2%         ~ (p=0.568 n=20)
ScaleNNUp-24                0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleABUp-24                0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleBLUp-24              50.96Ki ±  2%     52.73Ki ±  4%    +3.47% (p=0.002 n=20)
ScaleCRUp-24              86.54Ki ± 14%     79.09Ki ±  2%         ~ (p=0.061 n=20)
ScaleNNSrcRGBA-24           0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleNNSrcUnif-24           0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleNNOverRGBA-24          0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleNNOverUnif-24          0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
TformNNSrcRGBA-24           0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
TformNNSrcUnif-24           0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
TformNNOverRGBA-24          0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
TformNNOverUnif-24          0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleABSrcGray-24           0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleABSrcNRGBA-24          0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleABSrcRGBA-24           0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleABSrcYCbCr-24          0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleABSrcRGBA64-24       937.5Ki ±  0%       0.0Ki ±  0%  -100.00% (p=0.000 n=20)
ScaleABOverGray-24          0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleABOverNRGBA-24         0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleABOverRGBA-24          0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleABOverYCbCr-24         0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
ScaleABOverRGBA64-24      937.5Ki ±  0%       0.0Ki ±  0%  -100.00% (p=0.000 n=20)
TformABSrcGray-24           0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
TformABSrcNRGBA-24          0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
TformABSrcRGBA-24           0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
TformABSrcYCbCr-24          0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
TformABSrcRGBA64-24       600.8Ki ±  0%       0.0Ki ±  0%  -100.00% (p=0.000 n=20)
TformABOverGray-24          0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
TformABOverNRGBA-24         0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
TformABOverRGBA-24          0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
TformABOverYCbCr-24         0.000 ±  0%       0.000 ±  0%         ~ (p=1.000 n=20) ¹
TformABOverRGBA64-24      600.8Ki ±  0%       0.0Ki ±  0%  -100.00% (p=0.000 n=20)
ScaleCRSrcGray-24         16.51Ki ±  1%     15.97Ki ±  1%    -3.32% (p=0.000 n=20)
ScaleCRSrcNRGBA-24        59.33Ki ±  5%     57.21Ki ±  4%         ~ (p=0.142 n=20)
ScaleCRSrcRGBA-24         37.55Ki ±  2%     36.41Ki ±  2%    -3.03% (p=0.001 n=20)
ScaleCRSrcYCbCr-24        98.08Ki ±  2%     98.08Ki ±  2%         ~ (p=0.952 n=20)
ScaleCRSrcRGBA64-24    24624.95Ki ±  0%     55.21Ki ±  4%   -99.78% (p=0.000 n=20)
ScaleCROverGray-24        16.46Ki ±  1%     15.97Ki ±  1%    -2.99% (p=0.000 n=20)
ScaleCROverNRGBA-24       61.62Ki ±  3%     59.70Ki ±  5%    -3.10% (p=0.015 n=20)
ScaleCROverRGBA-24        38.76Ki ±  2%     37.55Ki ±  1%    -3.12% (p=0.000 n=20)
ScaleCROverYCbCr-24       98.08Ki ±  2%     98.08Ki ±  2%         ~ (p=0.232 n=20)
ScaleCROverRGBA64-24   24624.95Ki ±  0%     55.21Ki ±  2%   -99.78% (p=0.000 n=20)
TformCRSrcGray-24           96.00 ±  0%       96.00 ±  0%         ~ (p=1.000 n=20) ¹
TformCRSrcNRGBA-24          96.00 ±  0%       96.00 ±  0%         ~ (p=1.000 n=20) ¹
TformCRSrcRGBA-24           96.00 ±  0%       96.00 ±  0%         ~ (p=1.000 n=20) ¹
TformCRSrcYCbCr-24          96.00 ±  0%       96.00 ±  0%         ~ (p=1.000 n=20) ¹
TformCRSrcRGBA64-24    2396479.00 ±  0%       96.00 ±  0%  -100.00% (p=0.000 n=20)
TformCROverGray-24          96.00 ±  0%       96.00 ±  0%         ~ (p=1.000 n=20) ¹
TformCROverNRGBA-24         96.00 ±  0%       96.00 ±  0%         ~ (p=1.000 n=20) ¹
TformCROverRGBA-24          96.00 ±  0%       96.00 ±  0%         ~ (p=1.000 n=20) ¹
TformCROverYCbCr-24         96.00 ±  0%       96.00 ±  0%         ~ (p=1.000 n=20) ¹
TformCROverRGBA64-24   2396479.00 ±  0%       96.00 ±  0%  -100.00% (p=0.000 n=20)
geomean                                 ²                  ?                       ² ³
¹ all samples are equal
² summaries must be >0 to compute geomean
³ ratios must be >0 to compute geomean

                     │     old.bench     │               new.bench                │
                     │     allocs/op     │  allocs/op   vs base                   │
ScaleNNLargeDown-24         0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleABLargeDown-24         0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleBLLargeDown-24         0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleCRLargeDown-24         0.000 ± 0%      1.000 ±  ?         ? (p=0.000 n=20)
ScaleNNDown-24              0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleABDown-24              0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleBLDown-24              0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleCRDown-24              0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleNNUp-24                0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleABUp-24                0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleBLUp-24                0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleCRUp-24                0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleNNSrcRGBA-24           0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleNNSrcUnif-24           0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleNNOverRGBA-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleNNOverUnif-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
TformNNSrcRGBA-24           0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
TformNNSrcUnif-24           0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
TformNNOverRGBA-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
TformNNOverUnif-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleABSrcGray-24           0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleABSrcNRGBA-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleABSrcRGBA-24           0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleABSrcYCbCr-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleABSrcRGBA64-24        120.0k ± 0%       0.0k ± 0%  -100.00% (p=0.000 n=20)
ScaleABOverGray-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleABOverNRGBA-24         0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleABOverRGBA-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleABOverYCbCr-24         0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleABOverRGBA64-24       120.0k ± 0%       0.0k ± 0%  -100.00% (p=0.000 n=20)
TformABSrcGray-24           0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
TformABSrcNRGBA-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
TformABSrcRGBA-24           0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
TformABSrcYCbCr-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
TformABSrcRGBA64-24        76.90k ± 0%      0.00k ± 0%  -100.00% (p=0.000 n=20)
TformABOverGray-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
TformABOverNRGBA-24         0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
TformABOverRGBA-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
TformABOverYCbCr-24         0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
TformABOverRGBA64-24       76.90k ± 0%      0.00k ± 0%  -100.00% (p=0.000 n=20)
ScaleCRSrcGray-24           0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleCRSrcNRGBA-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleCRSrcRGBA-24           0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleCRSrcYCbCr-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleCRSrcRGBA64-24        3.129M ± 0%     0.000M ± 0%  -100.00% (p=0.000 n=20)
ScaleCROverGray-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleCROverNRGBA-24         0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleCROverRGBA-24          0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleCROverYCbCr-24         0.000 ± 0%      0.000 ± 0%         ~ (p=1.000 n=20) ¹
ScaleCROverRGBA64-24       3.129M ± 0%     0.000M ± 0%  -100.00% (p=0.000 n=20)
TformCRSrcGray-24           2.000 ± 0%      2.000 ± 0%         ~ (p=1.000 n=20) ¹
TformCRSrcNRGBA-24          2.000 ± 0%      2.000 ± 0%         ~ (p=1.000 n=20) ¹
TformCRSrcRGBA-24           2.000 ± 0%      2.000 ± 0%         ~ (p=1.000 n=20) ¹
TformCRSrcYCbCr-24          2.000 ± 0%      2.000 ± 0%         ~ (p=1.000 n=20) ¹
TformCRSrcRGBA64-24    299549.000 ± 0%      2.000 ± 0%  -100.00% (p=0.000 n=20)
TformCROverGray-24          2.000 ± 0%      2.000 ± 0%         ~ (p=1.000 n=20) ¹
TformCROverNRGBA-24         2.000 ± 0%      2.000 ± 0%         ~ (p=1.000 n=20) ¹
TformCROverRGBA-24          2.000 ± 0%      2.000 ± 0%         ~ (p=1.000 n=20) ¹
TformCROverYCbCr-24         2.000 ± 0%      2.000 ± 0%         ~ (p=1.000 n=20) ¹
TformCROverRGBA64-24   299549.000 ± 0%      2.000 ± 0%  -100.00% (p=0.000 n=20)
geomean                                ²                ?                       ²
¹ all samples are equal
² summaries must be >0 to compute geomean

Change-Id: I37778e925cce13c4fec65c9e6d57e205440e2a06
Reviewed-on: https://go-review.googlesource.com/c/image/+/525255
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Auto-Submit: Dmitri Shuralyov <dmitshur@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>
Reviewed-by: Nigel Tao (INACTIVE; USE @golang.org INSTEAD) <nigeltao@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
diff --git a/draw/gen.go b/draw/gen.go
index 33678ad..1646932 100644
--- a/draw/gen.go
+++ b/draw/gen.go
@@ -12,7 +12,6 @@
 	"flag"
 	"fmt"
 	"go/format"
-	"io/ioutil"
 	"log"
 	"os"
 	"strings"
@@ -45,7 +44,7 @@
 	if err != nil {
 		log.Fatal(err)
 	}
-	if err := ioutil.WriteFile("impl.go", out, 0660); err != nil {
+	if err := os.WriteFile("impl.go", out, 0660); err != nil {
 		log.Fatal(err)
 	}
 }
@@ -62,7 +61,9 @@
 		{"*image.RGBA", "*image.NRGBA"},
 		{"*image.RGBA", "*image.RGBA"},
 		{"*image.RGBA", "*image.YCbCr"},
+		{"*image.RGBA", "image.RGBA64Image"},
 		{"*image.RGBA", "image.Image"},
+		{"RGBA64Image", "image.RGBA64Image"},
 		{"Image", "image.Image"},
 	}
 	dTypes, sTypes  []string
@@ -234,13 +235,21 @@
 			return ";"
 		case "Image":
 			s := ""
-			if d.sType == "image.Image" {
+			if d.sType == "image.Image" || d.sType == "image.RGBA64Image" {
 				s = "srcMask, smp := opts.SrcMask, opts.SrcMaskP\n"
 			}
 			return s +
 				"dstMask, dmp := opts.DstMask, opts.DstMaskP\n" +
 				"dstColorRGBA64 := &color.RGBA64{}\n" +
 				"dstColor := color.Color(dstColorRGBA64)"
+		case "RGBA64Image":
+			s := ""
+			if d.sType == "image.Image" || d.sType == "image.RGBA64Image" {
+				s = "srcMask, smp := opts.SrcMask, opts.SrcMaskP\n"
+			}
+			return s +
+				"dstMask, dmp := opts.DstMask, opts.DstMaskP\n" +
+				"dstColorRGBA64 := color.RGBA64{}\n"
 		}
 
 	case "preInner":
@@ -255,7 +264,7 @@
 		switch d.sType {
 		default:
 			return ";"
-		case "image.Image":
+		case "image.Image", "image.RGBA64Image":
 			return "srcMask, smp := opts.SrcMask, opts.SrcMaskP"
 		}
 
@@ -334,6 +343,10 @@
 				"$0g := uint32($1g)\n"+
 				"$0b := uint32($1b)",
 			)
+		case "image.RGBA64Image":
+			return argf(args, ""+
+				"$0 := color.RGBA64{uint16($1r), uint16($1g), uint16($1b), uint16($1a)}",
+			)
 		}
 
 	case "outputu":
@@ -364,14 +377,62 @@
 					"dstColorRGBA64.A = uint16(qa*$2a1/0xffff + $2a)\n"+
 					"dst.Set($0, $1, dstColor)",
 				)
+			case "RGBA64Image":
+				switch d.sType {
+				default:
+					return argf(args, ""+
+						"q := dst.RGBA64At($0, $1)\n"+
+						"if dstMask != nil {\n"+
+						"	_, _, _, ma := dstMask.At(dmp.X + $0, dmp.Y + $1).RGBA()\n"+
+						"	$2r = $2r * ma / 0xffff\n"+
+						"	$2g = $2g * ma / 0xffff\n"+
+						"	$2b = $2b * ma / 0xffff\n"+
+						"	$2a = $2a * ma / 0xffff\n"+
+						"}\n"+
+						"$2a1 := 0xffff - $2a\n"+
+						"dstColorRGBA64.R = uint16(uint32(q.R)*$2a1/0xffff + $2r)\n"+
+						"dstColorRGBA64.G = uint16(uint32(q.G)*$2a1/0xffff + $2g)\n"+
+						"dstColorRGBA64.B = uint16(uint32(q.B)*$2a1/0xffff + $2b)\n"+
+						"dstColorRGBA64.A = uint16(uint32(q.A)*$2a1/0xffff + $2a)\n"+
+						"dst.Set($0, $1, dstColorRGBA64)",
+					)
+				case "image.RGBA64Image":
+					return argf(args, ""+
+						"q := dst.RGBA64At($0, $1)\n"+
+						"if dstMask != nil {\n"+
+						"	_, _, _, ma := dstMask.At(dmp.X + $0, dmp.Y + $1).RGBA()\n"+
+						"	$2.R = uint16(uint32($2.R) * ma / 0xffff)\n"+
+						"	$2.G = uint16(uint32($2.G) * ma / 0xffff)\n"+
+						"	$2.B = uint16(uint32($2.B) * ma / 0xffff)\n"+
+						"	$2.A = uint16(uint32($2.A) * ma / 0xffff)\n"+
+						"}\n"+
+						"$2a1 := 0xffff - uint32($2.A)\n"+
+						"dstColorRGBA64.R = uint16(uint32(q.R)*$2a1/0xffff + uint32($2.R))\n"+
+						"dstColorRGBA64.G = uint16(uint32(q.G)*$2a1/0xffff + uint32($2.G))\n"+
+						"dstColorRGBA64.B = uint16(uint32(q.B)*$2a1/0xffff + uint32($2.B))\n"+
+						"dstColorRGBA64.A = uint16(uint32(q.A)*$2a1/0xffff + uint32($2.A))\n"+
+						"dst.Set($0, $1, dstColorRGBA64)",
+					)
+				}
 			case "*image.RGBA":
-				return argf(args, ""+
-					"$2a1 := (0xffff - $2a) * 0x101\n"+
-					"dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*$2a1/0xffff + $2r) >> 8)\n"+
-					"dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*$2a1/0xffff + $2g) >> 8)\n"+
-					"dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*$2a1/0xffff + $2b) >> 8)\n"+
-					"dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*$2a1/0xffff + $2a) >> 8)",
-				)
+				switch d.sType {
+				default:
+					return argf(args, ""+
+						"$2a1 := (0xffff - $2a) * 0x101\n"+
+						"dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*$2a1/0xffff + $2r) >> 8)\n"+
+						"dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*$2a1/0xffff + $2g) >> 8)\n"+
+						"dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*$2a1/0xffff + $2b) >> 8)\n"+
+						"dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*$2a1/0xffff + $2a) >> 8)",
+					)
+				case "image.RGBA64Image":
+					return argf(args, ""+
+						"$2a1 := (0xffff - uint32($2.A)) * 0x101\n"+
+						"dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*$2a1/0xffff + uint32($2.R)) >> 8)\n"+
+						"dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*$2a1/0xffff + uint32($2.G)) >> 8)\n"+
+						"dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*$2a1/0xffff + uint32($2.B)) >> 8)\n"+
+						"dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*$2a1/0xffff + uint32($2.A)) >> 8)",
+					)
+				}
 			}
 
 		case "Src":
@@ -401,6 +462,51 @@
 					"	dst.Set($0, $1, dstColor)\n"+
 					"}",
 				)
+			case "RGBA64Image":
+				switch d.sType {
+				default:
+					return argf(args, ""+
+						"if dstMask != nil {\n"+
+						"	q := dst.RGBA64At($0, $1)\n"+
+						"	_, _, _, ma := dstMask.At(dmp.X + $0, dmp.Y + $1).RGBA()\n"+
+						"	pr = pr * ma / 0xffff\n"+
+						"	pg = pg * ma / 0xffff\n"+
+						"	pb = pb * ma / 0xffff\n"+
+						"	pa = pa * ma / 0xffff\n"+
+						"	$2a1 := 0xffff - ma\n"+ // Note that this is ma, not $2a.
+						"	dstColorRGBA64.R = uint16(uint32(q.R)*$2a1/0xffff + $2r)\n"+
+						"	dstColorRGBA64.G = uint16(uint32(q.G)*$2a1/0xffff + $2g)\n"+
+						"	dstColorRGBA64.B = uint16(uint32(q.B)*$2a1/0xffff + $2b)\n"+
+						"	dstColorRGBA64.A = uint16(uint32(q.A)*$2a1/0xffff + $2a)\n"+
+						"	dst.Set($0, $1, dstColorRGBA64)\n"+
+						"} else {\n"+
+						"	dstColorRGBA64.R = uint16($2r)\n"+
+						"	dstColorRGBA64.G = uint16($2g)\n"+
+						"	dstColorRGBA64.B = uint16($2b)\n"+
+						"	dstColorRGBA64.A = uint16($2a)\n"+
+						"	dst.Set($0, $1, dstColorRGBA64)\n"+
+						"}",
+					)
+				case "image.RGBA64Image":
+					return argf(args, ""+
+						"if dstMask != nil {\n"+
+						"	q := dst.RGBA64At($0, $1)\n"+
+						"	_, _, _, ma := dstMask.At(dmp.X + $0, dmp.Y + $1).RGBA()\n"+
+						"	p.R = uint16(uint32(p.R) * ma / 0xffff)\n"+
+						"	p.G = uint16(uint32(p.G) * ma / 0xffff)\n"+
+						"	p.B = uint16(uint32(p.B) * ma / 0xffff)\n"+
+						"	p.A = uint16(uint32(p.A) * ma / 0xffff)\n"+
+						"	$2a1 := 0xffff - ma\n"+ // Note that this is ma, not $2a.
+						"	dstColorRGBA64.R = uint16(uint32(q.R)*$2a1/0xffff + uint32($2.R))\n"+
+						"	dstColorRGBA64.G = uint16(uint32(q.G)*$2a1/0xffff + uint32($2.G))\n"+
+						"	dstColorRGBA64.B = uint16(uint32(q.B)*$2a1/0xffff + uint32($2.B))\n"+
+						"	dstColorRGBA64.A = uint16(uint32(q.A)*$2a1/0xffff + uint32($2.A))\n"+
+						"	dst.Set($0, $1, dstColorRGBA64)\n"+
+						"} else {\n"+
+						"	dst.Set($0, $1, $2)\n"+
+						"}",
+					)
+				}
 			case "*image.RGBA":
 				switch d.sType {
 				default:
@@ -425,6 +531,13 @@
 						"dst.Pix[d+2] = uint8($2b >> 8)\n"+
 						"dst.Pix[d+3] = 0xff",
 					)
+				case "image.RGBA64Image":
+					return argf(args, ""+
+						"dst.Pix[d+0] = uint8($2.R >> 8)\n"+
+						"dst.Pix[d+1] = uint8($2.G >> 8)\n"+
+						"dst.Pix[d+2] = uint8($2.B >> 8)\n"+
+						"dst.Pix[d+3] = uint8($2.A >> 8)",
+					)
 				}
 			}
 		}
@@ -462,6 +575,27 @@
 					"dstColorRGBA64.A = uint16(qa*$3a1/0xffff + $3a0)\n"+
 					"dst.Set($0, $1, dstColor)",
 				)
+			case "RGBA64Image":
+				ret = argf(args, ""+
+					"q := dst.RGBA64At($0, $1)\n"+
+					"$3r0 := uint32($2($3r * $4))\n"+
+					"$3g0 := uint32($2($3g * $4))\n"+
+					"$3b0 := uint32($2($3b * $4))\n"+
+					"$3a0 := uint32($2($3a * $4))\n"+
+					"if dstMask != nil {\n"+
+					"	_, _, _, ma := dstMask.At(dmp.X + $0, dmp.Y + $1).RGBA()\n"+
+					"	$3r0 = $3r0 * ma / 0xffff\n"+
+					"	$3g0 = $3g0 * ma / 0xffff\n"+
+					"	$3b0 = $3b0 * ma / 0xffff\n"+
+					"	$3a0 = $3a0 * ma / 0xffff\n"+
+					"}\n"+
+					"$3a1 := 0xffff - $3a0\n"+
+					"dstColorRGBA64.R = uint16(uint32(q.R)*$3a1/0xffff + $3r0)\n"+
+					"dstColorRGBA64.G = uint16(uint32(q.G)*$3a1/0xffff + $3g0)\n"+
+					"dstColorRGBA64.B = uint16(uint32(q.B)*$3a1/0xffff + $3b0)\n"+
+					"dstColorRGBA64.A = uint16(uint32(q.A)*$3a1/0xffff + $3a0)\n"+
+					"dst.SetRGBA64($0, $1, dstColorRGBA64)",
+				)
 			case "*image.RGBA":
 				ret = argf(args, ""+
 					"$3r0 := uint32($2($3r * $4))\n"+
@@ -503,6 +637,29 @@
 					"	dst.Set($0, $1, dstColor)\n"+
 					"}",
 				)
+			case "RGBA64Image":
+				ret = argf(args, ""+
+					"if dstMask != nil {\n"+
+					"	q := dst.RGBA64At($0, $1)\n"+
+					"	_, _, _, ma := dstMask.At(dmp.X + $0, dmp.Y + $1).RGBA()\n"+
+					"	pr := uint32($2($3r * $4)) * ma / 0xffff\n"+
+					"	pg := uint32($2($3g * $4)) * ma / 0xffff\n"+
+					"	pb := uint32($2($3b * $4)) * ma / 0xffff\n"+
+					"	pa := uint32($2($3a * $4)) * ma / 0xffff\n"+
+					"	pa1 := 0xffff - ma\n"+ // Note that this is ma, not pa.
+					"	dstColorRGBA64.R = uint16(uint32(q.R)*pa1/0xffff + pr)\n"+
+					"	dstColorRGBA64.G = uint16(uint32(q.G)*pa1/0xffff + pg)\n"+
+					"	dstColorRGBA64.B = uint16(uint32(q.B)*pa1/0xffff + pb)\n"+
+					"	dstColorRGBA64.A = uint16(uint32(q.A)*pa1/0xffff + pa)\n"+
+					"	dst.SetRGBA64($0, $1, dstColorRGBA64)\n"+
+					"} else {\n"+
+					"	dstColorRGBA64.R = $2($3r * $4)\n"+
+					"	dstColorRGBA64.G = $2($3g * $4)\n"+
+					"	dstColorRGBA64.B = $2($3b * $4)\n"+
+					"	dstColorRGBA64.A = $2($3a * $4)\n"+
+					"	dst.SetRGBA64($0, $1, dstColorRGBA64)\n"+
+					"}",
+				)
 			case "*image.RGBA":
 				switch d.sType {
 				default:
@@ -560,7 +717,7 @@
 				"%sr%s, %sg%s, %sb%s, %sa%s := src.At(%s, %s).RGBA()\n",
 				lhs, tmp, lhs, tmp, lhs, tmp, lhs, tmp, args[0], args[1],
 			)
-			if d.dType == "" || d.dType == "Image" {
+			if d.dType == "" || d.dType == "Image" || d.dType == "RGBA64Image" {
 				fmt.Fprintf(buf, ""+
 					"if srcMask != nil {\n"+
 					"	_, _, _, ma := srcMask.At(smp.X+%s, smp.Y+%s).RGBA()\n"+
@@ -576,6 +733,24 @@
 					lhs, tmp, lhs, tmp,
 				)
 			}
+		case "image.RGBA64Image":
+			fmt.Fprintf(buf, ""+
+				"%s%s := src.RGBA64At(%s, %s)\n",
+				lhs, tmp, args[0], args[1],
+			)
+			if d.dType == "" || d.dType == "Image" || d.dType == "RGBA64Image" {
+				fmt.Fprintf(buf, ""+
+					"if srcMask != nil {\n"+
+					"	_, _, _, ma := srcMask.At(smp.X+%[1]s, smp.Y+%[2]s).RGBA()\n"+
+					"	%[3]s%[4]s.R = uint16(uint32(%[3]s%[4]s.R) * ma / 0xffff)\n"+
+					"	%[3]s%[4]s.G = uint16(uint32(%[3]s%[4]s.G) * ma / 0xffff)\n"+
+					"	%[3]s%[4]s.B = uint16(uint32(%[3]s%[4]s.B) * ma / 0xffff)\n"+
+					"	%[3]s%[4]s.A = uint16(uint32(%[3]s%[4]s.A) * ma / 0xffff)\n"+
+					"}\n",
+					args[0], args[1],
+					lhs, tmp,
+				)
+			}
 		case "*image.Gray":
 			fmt.Fprintf(buf, ""+
 				"%si := %s\n"+
@@ -647,6 +822,14 @@
 					lhs, eqOp, lhs, extra,
 					lhs, eqOp, lhs, extra,
 				)
+			case "image.RGBA64Image":
+				fmt.Fprintf(buf, ""+
+					"%[1]sr %[2]s float64(%[3]su.R)%[4]s\n"+
+					"%[1]sg %[2]s float64(%[3]su.G)%[4]s\n"+
+					"%[1]sb %[2]s float64(%[3]su.B)%[4]s\n"+
+					"%[1]sa %[2]s float64(%[3]su.A)%[4]s\n",
+					lhs, eqOp, lhs, extra,
+				)
 			}
 		}
 
diff --git a/draw/impl.go b/draw/impl.go
index 75498ad..94ee826 100644
--- a/draw/impl.go
+++ b/draw/impl.go
@@ -59,9 +59,16 @@
 					z.scale_RGBA_NRGBA_Over(dst, dr, adr, src, sr, &o)
 				case *image.RGBA:
 					z.scale_RGBA_RGBA_Over(dst, dr, adr, src, sr, &o)
+				case image.RGBA64Image:
+					z.scale_RGBA_RGBA64Image_Over(dst, dr, adr, src, sr, &o)
 				default:
 					z.scale_RGBA_Image_Over(dst, dr, adr, src, sr, &o)
 				}
+			case RGBA64Image:
+				switch src := src.(type) {
+				case image.RGBA64Image:
+					z.scale_RGBA64Image_RGBA64Image_Over(dst, dr, adr, src, sr, &o)
+				}
 			default:
 				switch src := src.(type) {
 				default:
@@ -91,9 +98,16 @@
 					case image.YCbCrSubsampleRatio440:
 						z.scale_RGBA_YCbCr440_Src(dst, dr, adr, src, sr, &o)
 					}
+				case image.RGBA64Image:
+					z.scale_RGBA_RGBA64Image_Src(dst, dr, adr, src, sr, &o)
 				default:
 					z.scale_RGBA_Image_Src(dst, dr, adr, src, sr, &o)
 				}
+			case RGBA64Image:
+				switch src := src.(type) {
+				case image.RGBA64Image:
+					z.scale_RGBA64Image_RGBA64Image_Src(dst, dr, adr, src, sr, &o)
+				}
 			default:
 				switch src := src.(type) {
 				default:
@@ -170,9 +184,16 @@
 					z.transform_RGBA_NRGBA_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
 				case *image.RGBA:
 					z.transform_RGBA_RGBA_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
+				case image.RGBA64Image:
+					z.transform_RGBA_RGBA64Image_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
 				default:
 					z.transform_RGBA_Image_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
 				}
+			case RGBA64Image:
+				switch src := src.(type) {
+				case image.RGBA64Image:
+					z.transform_RGBA64Image_RGBA64Image_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
+				}
 			default:
 				switch src := src.(type) {
 				default:
@@ -202,9 +223,16 @@
 					case image.YCbCrSubsampleRatio440:
 						z.transform_RGBA_YCbCr440_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
 					}
+				case image.RGBA64Image:
+					z.transform_RGBA_RGBA64Image_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
 				default:
 					z.transform_RGBA_Image_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
 				}
+			case RGBA64Image:
+				switch src := src.(type) {
+				case image.RGBA64Image:
+					z.transform_RGBA64Image_RGBA64Image_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+				}
 			default:
 				switch src := src.(type) {
 				default:
@@ -502,6 +530,45 @@
 	}
 }
 
+func (nnInterpolator) scale_RGBA_RGBA64Image_Over(dst *image.RGBA, dr, adr image.Rectangle, src image.RGBA64Image, sr image.Rectangle, opts *Options) {
+	dw2 := uint64(dr.Dx()) * 2
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (2*uint64(dx) + 1) * sw / dw2
+			p := src.RGBA64At(sr.Min.X+int(sx), sr.Min.Y+int(sy))
+			pa1 := (0xffff - uint32(p.A)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + uint32(p.R)) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + uint32(p.G)) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + uint32(p.B)) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + uint32(p.A)) >> 8)
+		}
+	}
+}
+
+func (nnInterpolator) scale_RGBA_RGBA64Image_Src(dst *image.RGBA, dr, adr image.Rectangle, src image.RGBA64Image, sr image.Rectangle, opts *Options) {
+	dw2 := uint64(dr.Dx()) * 2
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (2*uint64(dx) + 1) * sw / dw2
+			p := src.RGBA64At(sr.Min.X+int(sx), sr.Min.Y+int(sy))
+			dst.Pix[d+0] = uint8(p.R >> 8)
+			dst.Pix[d+1] = uint8(p.G >> 8)
+			dst.Pix[d+2] = uint8(p.B >> 8)
+			dst.Pix[d+3] = uint8(p.A >> 8)
+		}
+	}
+}
+
 func (nnInterpolator) scale_RGBA_Image_Over(dst *image.RGBA, dr, adr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) {
 	dw2 := uint64(dr.Dx()) * 2
 	dh2 := uint64(dr.Dy()) * 2
@@ -541,6 +608,86 @@
 	}
 }
 
+// scale_RGBA64Image_RGBA64Image_Over nearest-neighbor scales sr of src onto dr of dst with Over, applying opts.SrcMask and opts.DstMask when set.
+func (nnInterpolator) scale_RGBA64Image_RGBA64Image_Over(dst RGBA64Image, dr, adr image.Rectangle, src image.RGBA64Image, sr image.Rectangle, opts *Options) {
+	dw2 := uint64(dr.Dx()) * 2
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := color.RGBA64{}
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			sx := (2*uint64(dx) + 1) * sw / dw2
+			p := src.RGBA64At(sr.Min.X+int(sx), sr.Min.Y+int(sy))
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx), smp.Y+sr.Min.Y+int(sy)).RGBA() // only the mask's alpha is used
+				p.R = uint16(uint32(p.R) * ma / 0xffff)
+				p.G = uint16(uint32(p.G) * ma / 0xffff)
+				p.B = uint16(uint32(p.B) * ma / 0xffff)
+				p.A = uint16(uint32(p.A) * ma / 0xffff)
+			}
+			q := dst.RGBA64At(dr.Min.X+int(dx), dr.Min.Y+int(dy))
+			if dstMask != nil {
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				p.R = uint16(uint32(p.R) * ma / 0xffff)
+				p.G = uint16(uint32(p.G) * ma / 0xffff)
+				p.B = uint16(uint32(p.B) * ma / 0xffff)
+				p.A = uint16(uint32(p.A) * ma / 0xffff)
+			}
+			pa1 := 0xffff - uint32(p.A) // inverse of the (masked) source alpha
+			dstColorRGBA64.R = uint16(uint32(q.R)*pa1/0xffff + uint32(p.R))
+			dstColorRGBA64.G = uint16(uint32(q.G)*pa1/0xffff + uint32(p.G))
+			dstColorRGBA64.B = uint16(uint32(q.B)*pa1/0xffff + uint32(p.B))
+			dstColorRGBA64.A = uint16(uint32(q.A)*pa1/0xffff + uint32(p.A))
+			dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColorRGBA64) // typed setter: no color.Color boxing per pixel
+		}
+	}
+}
+
+// scale_RGBA64Image_RGBA64Image_Src nearest-neighbor scales sr of src onto dr of dst with Src, applying opts.SrcMask and opts.DstMask when set.
+func (nnInterpolator) scale_RGBA64Image_RGBA64Image_Src(dst RGBA64Image, dr, adr image.Rectangle, src image.RGBA64Image, sr image.Rectangle, opts *Options) {
+	dw2 := uint64(dr.Dx()) * 2
+	dh2 := uint64(dr.Dy()) * 2
+	sw := uint64(sr.Dx())
+	sh := uint64(sr.Dy())
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := color.RGBA64{}
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (2*uint64(dy) + 1) * sh / dh2
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			sx := (2*uint64(dx) + 1) * sw / dw2
+			p := src.RGBA64At(sr.Min.X+int(sx), sr.Min.Y+int(sy))
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx), smp.Y+sr.Min.Y+int(sy)).RGBA() // only the mask's alpha is used
+				p.R = uint16(uint32(p.R) * ma / 0xffff)
+				p.G = uint16(uint32(p.G) * ma / 0xffff)
+				p.B = uint16(uint32(p.B) * ma / 0xffff)
+				p.A = uint16(uint32(p.A) * ma / 0xffff)
+			}
+			if dstMask != nil {
+				q := dst.RGBA64At(dr.Min.X+int(dx), dr.Min.Y+int(dy))
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				p.R = uint16(uint32(p.R) * ma / 0xffff)
+				p.G = uint16(uint32(p.G) * ma / 0xffff)
+				p.B = uint16(uint32(p.B) * ma / 0xffff)
+				p.A = uint16(uint32(p.A) * ma / 0xffff)
+				pa1 := 0xffff - ma // with a dst mask, the existing pixel keeps the share the mask does not cover
+				dstColorRGBA64.R = uint16(uint32(q.R)*pa1/0xffff + uint32(p.R))
+				dstColorRGBA64.G = uint16(uint32(q.G)*pa1/0xffff + uint32(p.G))
+				dstColorRGBA64.B = uint16(uint32(q.B)*pa1/0xffff + uint32(p.B))
+				dstColorRGBA64.A = uint16(uint32(q.A)*pa1/0xffff + uint32(p.A))
+				dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColorRGBA64) // typed setter: no color.Color boxing per pixel
+			} else {
+				dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(dy), p)
+			}
+		}
+	}
+}
+
 func (nnInterpolator) scale_Image_Image_Over(dst Image, dr, adr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) {
 	dw2 := uint64(dr.Dx()) * 2
 	dh2 := uint64(dr.Dy()) * 2
@@ -921,6 +1068,47 @@
 	}
 }
 
+// transform_RGBA_RGBA64Image_Over maps each dst pixel center through d2s, samples the nearest src pixel and composites it with Over. opts is not read.
+func (nnInterpolator) transform_RGBA_RGBA64Image_Over(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.RGBA64Image, sr image.Rectangle, bias image.Point, opts *Options) {
+	for y := int32(adr.Min.Y); y < int32(adr.Max.Y); y++ {
+		fy := float64(dr.Min.Y+int(y)) + 0.5
+		i := (dr.Min.Y+int(y)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for x := int32(adr.Min.X); x < int32(adr.Max.X); x, i = x+1, i+4 {
+			fx := float64(dr.Min.X+int(x)) + 0.5
+			sp := image.Point{X: int(d2s[0]*fx+d2s[1]*fy+d2s[2]) + bias.X, Y: int(d2s[3]*fx+d2s[4]*fy+d2s[5]) + bias.Y}
+			if !sp.In(sr) {
+				continue // sample falls outside sr: leave this dst pixel untouched
+			}
+			c := src.RGBA64At(sp.X, sp.Y)
+			inv := (0xffff - uint32(c.A)) * 0x101 // inverse source alpha; 0x101 widens the 8-bit dst channels to 16 bits
+			dst.Pix[i+0] = uint8((uint32(dst.Pix[i+0])*inv/0xffff + uint32(c.R)) >> 8)
+			dst.Pix[i+1] = uint8((uint32(dst.Pix[i+1])*inv/0xffff + uint32(c.G)) >> 8)
+			dst.Pix[i+2] = uint8((uint32(dst.Pix[i+2])*inv/0xffff + uint32(c.B)) >> 8)
+			dst.Pix[i+3] = uint8((uint32(dst.Pix[i+3])*inv/0xffff + uint32(c.A)) >> 8)
+		}
+	}
+}
+
+// transform_RGBA_RGBA64Image_Src maps each dst pixel center through d2s, samples the nearest src pixel and stores it (Src). opts is not read.
+func (nnInterpolator) transform_RGBA_RGBA64Image_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.RGBA64Image, sr image.Rectangle, bias image.Point, opts *Options) {
+	for y := int32(adr.Min.Y); y < int32(adr.Max.Y); y++ {
+		fy := float64(dr.Min.Y+int(y)) + 0.5
+		i := (dr.Min.Y+int(y)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for x := int32(adr.Min.X); x < int32(adr.Max.X); x, i = x+1, i+4 {
+			fx := float64(dr.Min.X+int(x)) + 0.5
+			sp := image.Point{X: int(d2s[0]*fx+d2s[1]*fy+d2s[2]) + bias.X, Y: int(d2s[3]*fx+d2s[4]*fy+d2s[5]) + bias.Y}
+			if !sp.In(sr) {
+				continue // sample falls outside sr: leave this dst pixel untouched
+			}
+			c := src.RGBA64At(sp.X, sp.Y)
+			dst.Pix[i+0] = uint8(c.R >> 8) // keep the high byte of each 16-bit channel
+			dst.Pix[i+1] = uint8(c.G >> 8)
+			dst.Pix[i+2] = uint8(c.B >> 8)
+			dst.Pix[i+3] = uint8(c.A >> 8)
+		}
+	}
+}
+
 func (nnInterpolator) transform_RGBA_Image_Over(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, opts *Options) {
 	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 		dyf := float64(dr.Min.Y+int(dy)) + 0.5
@@ -962,6 +1150,88 @@
 	}
 }
 
+// transform_RGBA64Image_RGBA64Image_Over nearest-neighbor samples src through d2s and composites onto dst with Over, applying opts.SrcMask/DstMask when set.
+func (nnInterpolator) transform_RGBA64Image_RGBA64Image_Over(dst RGBA64Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.RGBA64Image, sr image.Rectangle, bias image.Point, opts *Options) {
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := color.RGBA64{}
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X
+			sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue // sample falls outside sr: leave this dst pixel untouched
+			}
+			p := src.RGBA64At(sx0, sy0)
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx0, smp.Y+sy0).RGBA() // only the mask's alpha is used
+				p.R = uint16(uint32(p.R) * ma / 0xffff)
+				p.G = uint16(uint32(p.G) * ma / 0xffff)
+				p.B = uint16(uint32(p.B) * ma / 0xffff)
+				p.A = uint16(uint32(p.A) * ma / 0xffff)
+			}
+			q := dst.RGBA64At(dr.Min.X+int(dx), dr.Min.Y+int(dy))
+			if dstMask != nil {
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				p.R = uint16(uint32(p.R) * ma / 0xffff)
+				p.G = uint16(uint32(p.G) * ma / 0xffff)
+				p.B = uint16(uint32(p.B) * ma / 0xffff)
+				p.A = uint16(uint32(p.A) * ma / 0xffff)
+			}
+			pa1 := 0xffff - uint32(p.A) // inverse of the (masked) source alpha
+			dstColorRGBA64.R = uint16(uint32(q.R)*pa1/0xffff + uint32(p.R))
+			dstColorRGBA64.G = uint16(uint32(q.G)*pa1/0xffff + uint32(p.G))
+			dstColorRGBA64.B = uint16(uint32(q.B)*pa1/0xffff + uint32(p.B))
+			dstColorRGBA64.A = uint16(uint32(q.A)*pa1/0xffff + uint32(p.A))
+			dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColorRGBA64) // typed setter: no color.Color boxing per pixel
+		}
+	}
+}
+
+// transform_RGBA64Image_RGBA64Image_Src nearest-neighbor samples src through d2s and stores into dst with Src, applying opts.SrcMask/DstMask when set.
+func (nnInterpolator) transform_RGBA64Image_RGBA64Image_Src(dst RGBA64Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.RGBA64Image, sr image.Rectangle, bias image.Point, opts *Options) {
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := color.RGBA64{}
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx0 := int(d2s[0]*dxf+d2s[1]*dyf+d2s[2]) + bias.X
+			sy0 := int(d2s[3]*dxf+d2s[4]*dyf+d2s[5]) + bias.Y
+			if !(image.Point{sx0, sy0}).In(sr) {
+				continue // sample falls outside sr: leave this dst pixel untouched
+			}
+			p := src.RGBA64At(sx0, sy0)
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx0, smp.Y+sy0).RGBA() // only the mask's alpha is used
+				p.R = uint16(uint32(p.R) * ma / 0xffff)
+				p.G = uint16(uint32(p.G) * ma / 0xffff)
+				p.B = uint16(uint32(p.B) * ma / 0xffff)
+				p.A = uint16(uint32(p.A) * ma / 0xffff)
+			}
+			if dstMask != nil {
+				q := dst.RGBA64At(dr.Min.X+int(dx), dr.Min.Y+int(dy))
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				p.R = uint16(uint32(p.R) * ma / 0xffff)
+				p.G = uint16(uint32(p.G) * ma / 0xffff)
+				p.B = uint16(uint32(p.B) * ma / 0xffff)
+				p.A = uint16(uint32(p.A) * ma / 0xffff)
+				pa1 := 0xffff - ma // with a dst mask, the existing pixel keeps the share the mask does not cover
+				dstColorRGBA64.R = uint16(uint32(q.R)*pa1/0xffff + uint32(p.R))
+				dstColorRGBA64.G = uint16(uint32(q.G)*pa1/0xffff + uint32(p.G))
+				dstColorRGBA64.B = uint16(uint32(q.B)*pa1/0xffff + uint32(p.B))
+				dstColorRGBA64.A = uint16(uint32(q.A)*pa1/0xffff + uint32(p.A))
+				dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColorRGBA64) // typed setter: no color.Color boxing per pixel
+			} else {
+				dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(dy), p)
+			}
+		}
+	}
+}
+
 func (nnInterpolator) transform_Image_Image_Over(dst Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, opts *Options) {
 	srcMask, smp := opts.SrcMask, opts.SrcMaskP
 	dstMask, dmp := opts.DstMask, opts.DstMaskP
@@ -1097,9 +1367,16 @@
 					z.scale_RGBA_NRGBA_Over(dst, dr, adr, src, sr, &o)
 				case *image.RGBA:
 					z.scale_RGBA_RGBA_Over(dst, dr, adr, src, sr, &o)
+				case image.RGBA64Image:
+					z.scale_RGBA_RGBA64Image_Over(dst, dr, adr, src, sr, &o)
 				default:
 					z.scale_RGBA_Image_Over(dst, dr, adr, src, sr, &o)
 				}
+			case RGBA64Image:
+				switch src := src.(type) {
+				case image.RGBA64Image:
+					z.scale_RGBA64Image_RGBA64Image_Over(dst, dr, adr, src, sr, &o)
+				}
 			default:
 				switch src := src.(type) {
 				default:
@@ -1129,9 +1406,16 @@
 					case image.YCbCrSubsampleRatio440:
 						z.scale_RGBA_YCbCr440_Src(dst, dr, adr, src, sr, &o)
 					}
+				case image.RGBA64Image:
+					z.scale_RGBA_RGBA64Image_Src(dst, dr, adr, src, sr, &o)
 				default:
 					z.scale_RGBA_Image_Src(dst, dr, adr, src, sr, &o)
 				}
+			case RGBA64Image:
+				switch src := src.(type) {
+				case image.RGBA64Image:
+					z.scale_RGBA64Image_RGBA64Image_Src(dst, dr, adr, src, sr, &o)
+				}
 			default:
 				switch src := src.(type) {
 				default:
@@ -1208,9 +1492,16 @@
 					z.transform_RGBA_NRGBA_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
 				case *image.RGBA:
 					z.transform_RGBA_RGBA_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
+				case image.RGBA64Image:
+					z.transform_RGBA_RGBA64Image_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
 				default:
 					z.transform_RGBA_Image_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
 				}
+			case RGBA64Image:
+				switch src := src.(type) {
+				case image.RGBA64Image:
+					z.transform_RGBA64Image_RGBA64Image_Over(dst, dr, adr, &d2s, src, sr, bias, &o)
+				}
 			default:
 				switch src := src.(type) {
 				default:
@@ -1240,9 +1531,16 @@
 					case image.YCbCrSubsampleRatio440:
 						z.transform_RGBA_YCbCr440_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
 					}
+				case image.RGBA64Image:
+					z.transform_RGBA_RGBA64Image_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
 				default:
 					z.transform_RGBA_Image_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
 				}
+			case RGBA64Image:
+				switch src := src.(type) {
+				case image.RGBA64Image:
+					z.transform_RGBA64Image_RGBA64Image_Src(dst, dr, adr, &d2s, src, sr, bias, &o)
+				}
 			default:
 				switch src := src.(type) {
 				default:
@@ -2415,6 +2713,167 @@
 	}
 }
 
+// scale_RGBA_RGBA64Image_Over scales sr of src onto dr of dst by bilinearly blending the four nearest source pixels, compositing with Over. opts is not read.
+func (ablInterpolator) scale_RGBA_RGBA64Image_Over(dst *image.RGBA, dr, adr image.Rectangle, src image.RGBA64Image, sr image.Rectangle, opts *Options) {
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy())
+	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0)
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 {
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 { // past the last source row: use it for both samples
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (float64(dx)+0.5)*xscale - 0.5
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 {
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 { // past the last source column: use it for both samples
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// Fetch the 2x2 neighborhood; blend each row along x, then the two rows along y.
+			s00u := src.RGBA64At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0))
+			s00r := float64(s00u.R)
+			s00g := float64(s00u.G)
+			s00b := float64(s00u.B)
+			s00a := float64(s00u.A)
+			s10u := src.RGBA64At(sr.Min.X+int(sx1), sr.Min.Y+int(sy0))
+			s10r := float64(s10u.R)
+			s10g := float64(s10u.G)
+			s10b := float64(s10u.B)
+			s10a := float64(s10u.A)
+			s10r = xFrac1*s00r + xFrac0*s10r // s10* now holds the x-blend of the sy0 row
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01u := src.RGBA64At(sr.Min.X+int(sx0), sr.Min.Y+int(sy1))
+			s01r := float64(s01u.R)
+			s01g := float64(s01u.G)
+			s01b := float64(s01u.B)
+			s01a := float64(s01u.A)
+			s11u := src.RGBA64At(sr.Min.X+int(sx1), sr.Min.Y+int(sy1))
+			s11r := float64(s11u.R)
+			s11g := float64(s11u.G)
+			s11b := float64(s11u.B)
+			s11a := float64(s11u.A)
+			s11r = xFrac1*s01r + xFrac0*s11r // s11* now holds the x-blend of the sy1 row
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // final y-blend of the two rows
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			p := color.RGBA64{uint16(s11r), uint16(s11g), uint16(s11b), uint16(s11a)}
+			pa1 := (0xffff - uint32(p.A)) * 0x101 // inverse source alpha; 0x101 widens the 8-bit dst channels to 16 bits
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + uint32(p.R)) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + uint32(p.G)) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + uint32(p.B)) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + uint32(p.A)) >> 8)
+		}
+	}
+}
+
+// scale_RGBA_RGBA64Image_Src scales sr of src onto dr of dst by bilinearly blending the four nearest source pixels, overwriting dst (Src). opts is not read.
+func (ablInterpolator) scale_RGBA_RGBA64Image_Src(dst *image.RGBA, dr, adr image.Rectangle, src image.RGBA64Image, sr image.Rectangle, opts *Options) {
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy())
+	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0)
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 {
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 { // past the last source row: use it for both samples
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			sx := (float64(dx)+0.5)*xscale - 0.5
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 {
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 { // past the last source column: use it for both samples
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// Fetch the 2x2 neighborhood; blend each row along x, then the two rows along y.
+			s00u := src.RGBA64At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0))
+			s00r := float64(s00u.R)
+			s00g := float64(s00u.G)
+			s00b := float64(s00u.B)
+			s00a := float64(s00u.A)
+			s10u := src.RGBA64At(sr.Min.X+int(sx1), sr.Min.Y+int(sy0))
+			s10r := float64(s10u.R)
+			s10g := float64(s10u.G)
+			s10b := float64(s10u.B)
+			s10a := float64(s10u.A)
+			s10r = xFrac1*s00r + xFrac0*s10r // s10* now holds the x-blend of the sy0 row
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01u := src.RGBA64At(sr.Min.X+int(sx0), sr.Min.Y+int(sy1))
+			s01r := float64(s01u.R)
+			s01g := float64(s01u.G)
+			s01b := float64(s01u.B)
+			s01a := float64(s01u.A)
+			s11u := src.RGBA64At(sr.Min.X+int(sx1), sr.Min.Y+int(sy1))
+			s11r := float64(s11u.R)
+			s11g := float64(s11u.G)
+			s11b := float64(s11u.B)
+			s11a := float64(s11u.A)
+			s11r = xFrac1*s01r + xFrac0*s11r // s11* now holds the x-blend of the sy1 row
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // final y-blend of the two rows
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			p := color.RGBA64{uint16(s11r), uint16(s11g), uint16(s11b), uint16(s11a)}
+			dst.Pix[d+0] = uint8(p.R >> 8) // keep the high byte of each 16-bit channel
+			dst.Pix[d+1] = uint8(p.G >> 8)
+			dst.Pix[d+2] = uint8(p.B >> 8)
+			dst.Pix[d+3] = uint8(p.A >> 8)
+		}
+	}
+}
+
 func (ablInterpolator) scale_RGBA_Image_Over(dst *image.RGBA, dr, adr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) {
 	sw := int32(sr.Dx())
 	sh := int32(sr.Dy())
@@ -2582,6 +3041,248 @@
 	}
 }
 
+// scale_RGBA64Image_RGBA64Image_Over scales sr of src onto dr of dst by bilinearly blending four source pixels, compositing with Over and applying opts.SrcMask/DstMask when set.
+func (ablInterpolator) scale_RGBA64Image_RGBA64Image_Over(dst RGBA64Image, dr, adr image.Rectangle, src image.RGBA64Image, sr image.Rectangle, opts *Options) {
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy())
+	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := color.RGBA64{}
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0)
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 {
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 { // past the last source row: use it for both samples
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			sx := (float64(dx)+0.5)*xscale - 0.5
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 {
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 { // past the last source column: use it for both samples
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// Fetch the 2x2 neighborhood (each sample scaled by the source mask's alpha); blend along x, then y.
+			s00u := src.RGBA64At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0))
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx0), smp.Y+sr.Min.Y+int(sy0)).RGBA()
+				s00u.R = uint16(uint32(s00u.R) * ma / 0xffff)
+				s00u.G = uint16(uint32(s00u.G) * ma / 0xffff)
+				s00u.B = uint16(uint32(s00u.B) * ma / 0xffff)
+				s00u.A = uint16(uint32(s00u.A) * ma / 0xffff)
+			}
+			s00r := float64(s00u.R)
+			s00g := float64(s00u.G)
+			s00b := float64(s00u.B)
+			s00a := float64(s00u.A)
+			s10u := src.RGBA64At(sr.Min.X+int(sx1), sr.Min.Y+int(sy0))
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx1), smp.Y+sr.Min.Y+int(sy0)).RGBA()
+				s10u.R = uint16(uint32(s10u.R) * ma / 0xffff)
+				s10u.G = uint16(uint32(s10u.G) * ma / 0xffff)
+				s10u.B = uint16(uint32(s10u.B) * ma / 0xffff)
+				s10u.A = uint16(uint32(s10u.A) * ma / 0xffff)
+			}
+			s10r := float64(s10u.R)
+			s10g := float64(s10u.G)
+			s10b := float64(s10u.B)
+			s10a := float64(s10u.A)
+			s10r = xFrac1*s00r + xFrac0*s10r // s10* now holds the x-blend of the sy0 row
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01u := src.RGBA64At(sr.Min.X+int(sx0), sr.Min.Y+int(sy1))
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx0), smp.Y+sr.Min.Y+int(sy1)).RGBA()
+				s01u.R = uint16(uint32(s01u.R) * ma / 0xffff)
+				s01u.G = uint16(uint32(s01u.G) * ma / 0xffff)
+				s01u.B = uint16(uint32(s01u.B) * ma / 0xffff)
+				s01u.A = uint16(uint32(s01u.A) * ma / 0xffff)
+			}
+			s01r := float64(s01u.R)
+			s01g := float64(s01u.G)
+			s01b := float64(s01u.B)
+			s01a := float64(s01u.A)
+			s11u := src.RGBA64At(sr.Min.X+int(sx1), sr.Min.Y+int(sy1))
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx1), smp.Y+sr.Min.Y+int(sy1)).RGBA()
+				s11u.R = uint16(uint32(s11u.R) * ma / 0xffff)
+				s11u.G = uint16(uint32(s11u.G) * ma / 0xffff)
+				s11u.B = uint16(uint32(s11u.B) * ma / 0xffff)
+				s11u.A = uint16(uint32(s11u.A) * ma / 0xffff)
+			}
+			s11r := float64(s11u.R)
+			s11g := float64(s11u.G)
+			s11b := float64(s11u.B)
+			s11a := float64(s11u.A)
+			s11r = xFrac1*s01r + xFrac0*s11r // s11* now holds the x-blend of the sy1 row
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // final y-blend of the two rows
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			p := color.RGBA64{uint16(s11r), uint16(s11g), uint16(s11b), uint16(s11a)}
+			q := dst.RGBA64At(dr.Min.X+int(dx), dr.Min.Y+int(dy))
+			if dstMask != nil {
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				p.R = uint16(uint32(p.R) * ma / 0xffff)
+				p.G = uint16(uint32(p.G) * ma / 0xffff)
+				p.B = uint16(uint32(p.B) * ma / 0xffff)
+				p.A = uint16(uint32(p.A) * ma / 0xffff)
+			}
+			pa1 := 0xffff - uint32(p.A) // inverse of the (masked) source alpha
+			dstColorRGBA64.R = uint16(uint32(q.R)*pa1/0xffff + uint32(p.R))
+			dstColorRGBA64.G = uint16(uint32(q.G)*pa1/0xffff + uint32(p.G))
+			dstColorRGBA64.B = uint16(uint32(q.B)*pa1/0xffff + uint32(p.B))
+			dstColorRGBA64.A = uint16(uint32(q.A)*pa1/0xffff + uint32(p.A))
+			dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColorRGBA64) // typed setter: no color.Color boxing per pixel
+		}
+	}
+}
+
+// scale_RGBA64Image_RGBA64Image_Src scales sr of src onto dr of dst by bilinearly blending four source pixels, storing with Src and applying opts.SrcMask/DstMask when set.
+func (ablInterpolator) scale_RGBA64Image_RGBA64Image_Src(dst RGBA64Image, dr, adr image.Rectangle, src image.RGBA64Image, sr image.Rectangle, opts *Options) {
+	sw := int32(sr.Dx())
+	sh := int32(sr.Dy())
+	yscale := float64(sh) / float64(dr.Dy())
+	xscale := float64(sw) / float64(dr.Dx())
+	swMinus1, shMinus1 := sw-1, sh-1
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := color.RGBA64{}
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		sy := (float64(dy)+0.5)*yscale - 0.5
+		// If sy < 0, we will clamp sy0 to 0 anyway, so it doesn't matter if
+		// we say int32(sy) instead of int32(math.Floor(sy)). Similarly for
+		// sx, below.
+		sy0 := int32(sy)
+		yFrac0 := sy - float64(sy0)
+		yFrac1 := 1 - yFrac0
+		sy1 := sy0 + 1
+		if sy < 0 {
+			sy0, sy1 = 0, 0
+			yFrac0, yFrac1 = 0, 1
+		} else if sy1 > shMinus1 { // past the last source row: use it for both samples
+			sy0, sy1 = shMinus1, shMinus1
+			yFrac0, yFrac1 = 1, 0
+		}
+
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			sx := (float64(dx)+0.5)*xscale - 0.5
+			sx0 := int32(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx1 := sx0 + 1
+			if sx < 0 {
+				sx0, sx1 = 0, 0
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 > swMinus1 { // past the last source column: use it for both samples
+				sx0, sx1 = swMinus1, swMinus1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// Fetch the 2x2 neighborhood (each sample scaled by the source mask's alpha); blend along x, then y.
+			s00u := src.RGBA64At(sr.Min.X+int(sx0), sr.Min.Y+int(sy0))
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx0), smp.Y+sr.Min.Y+int(sy0)).RGBA()
+				s00u.R = uint16(uint32(s00u.R) * ma / 0xffff)
+				s00u.G = uint16(uint32(s00u.G) * ma / 0xffff)
+				s00u.B = uint16(uint32(s00u.B) * ma / 0xffff)
+				s00u.A = uint16(uint32(s00u.A) * ma / 0xffff)
+			}
+			s00r := float64(s00u.R)
+			s00g := float64(s00u.G)
+			s00b := float64(s00u.B)
+			s00a := float64(s00u.A)
+			s10u := src.RGBA64At(sr.Min.X+int(sx1), sr.Min.Y+int(sy0))
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx1), smp.Y+sr.Min.Y+int(sy0)).RGBA()
+				s10u.R = uint16(uint32(s10u.R) * ma / 0xffff)
+				s10u.G = uint16(uint32(s10u.G) * ma / 0xffff)
+				s10u.B = uint16(uint32(s10u.B) * ma / 0xffff)
+				s10u.A = uint16(uint32(s10u.A) * ma / 0xffff)
+			}
+			s10r := float64(s10u.R)
+			s10g := float64(s10u.G)
+			s10b := float64(s10u.B)
+			s10a := float64(s10u.A)
+			s10r = xFrac1*s00r + xFrac0*s10r // s10* now holds the x-blend of the sy0 row
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01u := src.RGBA64At(sr.Min.X+int(sx0), sr.Min.Y+int(sy1))
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx0), smp.Y+sr.Min.Y+int(sy1)).RGBA()
+				s01u.R = uint16(uint32(s01u.R) * ma / 0xffff)
+				s01u.G = uint16(uint32(s01u.G) * ma / 0xffff)
+				s01u.B = uint16(uint32(s01u.B) * ma / 0xffff)
+				s01u.A = uint16(uint32(s01u.A) * ma / 0xffff)
+			}
+			s01r := float64(s01u.R)
+			s01g := float64(s01u.G)
+			s01b := float64(s01u.B)
+			s01a := float64(s01u.A)
+			s11u := src.RGBA64At(sr.Min.X+int(sx1), sr.Min.Y+int(sy1))
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(sx1), smp.Y+sr.Min.Y+int(sy1)).RGBA()
+				s11u.R = uint16(uint32(s11u.R) * ma / 0xffff)
+				s11u.G = uint16(uint32(s11u.G) * ma / 0xffff)
+				s11u.B = uint16(uint32(s11u.B) * ma / 0xffff)
+				s11u.A = uint16(uint32(s11u.A) * ma / 0xffff)
+			}
+			s11r := float64(s11u.R)
+			s11g := float64(s11u.G)
+			s11b := float64(s11u.B)
+			s11a := float64(s11u.A)
+			s11r = xFrac1*s01r + xFrac0*s11r // s11* now holds the x-blend of the sy1 row
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // final y-blend of the two rows
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			p := color.RGBA64{uint16(s11r), uint16(s11g), uint16(s11b), uint16(s11a)}
+			if dstMask != nil {
+				q := dst.RGBA64At(dr.Min.X+int(dx), dr.Min.Y+int(dy))
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				p.R = uint16(uint32(p.R) * ma / 0xffff)
+				p.G = uint16(uint32(p.G) * ma / 0xffff)
+				p.B = uint16(uint32(p.B) * ma / 0xffff)
+				p.A = uint16(uint32(p.A) * ma / 0xffff)
+				pa1 := 0xffff - ma // with a dst mask, the existing pixel keeps the share the mask does not cover
+				dstColorRGBA64.R = uint16(uint32(q.R)*pa1/0xffff + uint32(p.R))
+				dstColorRGBA64.G = uint16(uint32(q.G)*pa1/0xffff + uint32(p.G))
+				dstColorRGBA64.B = uint16(uint32(q.B)*pa1/0xffff + uint32(p.B))
+				dstColorRGBA64.A = uint16(uint32(q.A)*pa1/0xffff + uint32(p.A))
+				dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColorRGBA64) // typed setter: no color.Color boxing per pixel
+			} else {
+				dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(dy), p)
+			}
+		}
+	}
+}
+
 func (ablInterpolator) scale_Image_Image_Over(dst Image, dr, adr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) {
 	sw := int32(sr.Dx())
 	sh := int32(sr.Dy())
@@ -4007,6 +4708,169 @@
 	}
 }
 
+// transform_RGBA_RGBA64Image_Over maps each dst pixel center through d2s, bilinearly blends the four surrounding src pixels and composites with Over. opts is not read.
+func (ablInterpolator) transform_RGBA_RGBA64Image_Over(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.RGBA64Image, sr image.Rectangle, bias image.Point, opts *Options) {
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // mapped point falls outside sr: leave this dst pixel untouched
+			}
+			sx -= 0.5
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X { // clamp both x samples to the left edge of sr
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X { // clamp both x samples to the right edge of sr
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0)
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y { // clamp both y samples to the top edge of sr
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y { // clamp both y samples to the bottom edge of sr
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+			// Fetch the 2x2 neighborhood; blend each row along x, then the two rows along y.
+			s00u := src.RGBA64At(sx0, sy0)
+			s00r := float64(s00u.R)
+			s00g := float64(s00u.G)
+			s00b := float64(s00u.B)
+			s00a := float64(s00u.A)
+			s10u := src.RGBA64At(sx1, sy0)
+			s10r := float64(s10u.R)
+			s10g := float64(s10u.G)
+			s10b := float64(s10u.B)
+			s10a := float64(s10u.A)
+			s10r = xFrac1*s00r + xFrac0*s10r // s10* now holds the x-blend of the sy0 row
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01u := src.RGBA64At(sx0, sy1)
+			s01r := float64(s01u.R)
+			s01g := float64(s01u.G)
+			s01b := float64(s01u.B)
+			s01a := float64(s01u.A)
+			s11u := src.RGBA64At(sx1, sy1)
+			s11r := float64(s11u.R)
+			s11g := float64(s11u.G)
+			s11b := float64(s11u.B)
+			s11a := float64(s11u.A)
+			s11r = xFrac1*s01r + xFrac0*s11r // s11* now holds the x-blend of the sy1 row
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // final y-blend of the two rows
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			p := color.RGBA64{uint16(s11r), uint16(s11g), uint16(s11b), uint16(s11a)}
+			pa1 := (0xffff - uint32(p.A)) * 0x101 // inverse source alpha; 0x101 widens the 8-bit dst channels to 16 bits
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + uint32(p.R)) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + uint32(p.G)) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + uint32(p.B)) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + uint32(p.A)) >> 8)
+		}
+	}
+}
+
+// transform_RGBA_RGBA64Image_Src maps each dst pixel center through d2s, bilinearly blends the four surrounding src pixels and stores the result (Src). opts is not read.
+func (ablInterpolator) transform_RGBA_RGBA64Image_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.RGBA64Image, sr image.Rectangle, bias image.Point, opts *Options) {
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // mapped point falls outside sr: leave this dst pixel untouched
+			}
+			sx -= 0.5
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X { // clamp both x samples to the left edge of sr
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X { // clamp both x samples to the right edge of sr
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+
+			sy -= 0.5
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0)
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y { // clamp both y samples to the top edge of sr
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y { // clamp both y samples to the bottom edge of sr
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+			// Fetch the 2x2 neighborhood; blend each row along x, then the two rows along y.
+			s00u := src.RGBA64At(sx0, sy0)
+			s00r := float64(s00u.R)
+			s00g := float64(s00u.G)
+			s00b := float64(s00u.B)
+			s00a := float64(s00u.A)
+			s10u := src.RGBA64At(sx1, sy0)
+			s10r := float64(s10u.R)
+			s10g := float64(s10u.G)
+			s10b := float64(s10u.B)
+			s10a := float64(s10u.A)
+			s10r = xFrac1*s00r + xFrac0*s10r // s10* now holds the x-blend of the sy0 row
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01u := src.RGBA64At(sx0, sy1)
+			s01r := float64(s01u.R)
+			s01g := float64(s01u.G)
+			s01b := float64(s01u.B)
+			s01a := float64(s01u.A)
+			s11u := src.RGBA64At(sx1, sy1)
+			s11r := float64(s11u.R)
+			s11g := float64(s11u.G)
+			s11b := float64(s11u.B)
+			s11a := float64(s11u.A)
+			s11r = xFrac1*s01r + xFrac0*s11r // s11* now holds the x-blend of the sy1 row
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r // final y-blend of the two rows
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			p := color.RGBA64{uint16(s11r), uint16(s11g), uint16(s11b), uint16(s11a)}
+			dst.Pix[d+0] = uint8(p.R >> 8) // keep the high byte of each 16-bit channel
+			dst.Pix[d+1] = uint8(p.G >> 8)
+			dst.Pix[d+2] = uint8(p.B >> 8)
+			dst.Pix[d+3] = uint8(p.A >> 8)
+		}
+	}
+}
+
 func (ablInterpolator) transform_RGBA_Image_Over(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, opts *Options) {
 	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
 		dyf := float64(dr.Min.Y+int(dy)) + 0.5
@@ -4176,6 +5040,252 @@
 	}
 }
 
+func (ablInterpolator) transform_RGBA64Image_RGBA64Image_Over(dst RGBA64Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.RGBA64Image, sr image.Rectangle, bias image.Point, opts *Options) {
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := color.RGBA64{}
+	// For every pixel of adr (relative to dr.Min): map its center through d2s, bilinearly sample src, and composite the sample Over dst.
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // source point lies outside sr: leave this dst pixel untouched
+			}
+			// Columns sx0/sx1 straddle sx (xFrac1 weights sx0, xFrac0 weights sx1); past either edge of sr both collapse onto the border column.
+			sx -= 0.5
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X {
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X {
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// Rows sy0/sy1 straddle sy in the same way (yFrac1 weights sy0, yFrac0 weights sy1).
+			sy -= 0.5
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0)
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y {
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y {
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+			// Fetch the four neighbours (each scaled by the srcMask alpha when a mask is set), blend across x into s10*/s11*, then across y into s11*.
+			s00u := src.RGBA64At(sx0, sy0)
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx0, smp.Y+sy0).RGBA()
+				s00u.R = uint16(uint32(s00u.R) * ma / 0xffff)
+				s00u.G = uint16(uint32(s00u.G) * ma / 0xffff)
+				s00u.B = uint16(uint32(s00u.B) * ma / 0xffff)
+				s00u.A = uint16(uint32(s00u.A) * ma / 0xffff)
+			}
+			s00r := float64(s00u.R)
+			s00g := float64(s00u.G)
+			s00b := float64(s00u.B)
+			s00a := float64(s00u.A)
+			s10u := src.RGBA64At(sx1, sy0)
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx1, smp.Y+sy0).RGBA()
+				s10u.R = uint16(uint32(s10u.R) * ma / 0xffff)
+				s10u.G = uint16(uint32(s10u.G) * ma / 0xffff)
+				s10u.B = uint16(uint32(s10u.B) * ma / 0xffff)
+				s10u.A = uint16(uint32(s10u.A) * ma / 0xffff)
+			}
+			s10r := float64(s10u.R)
+			s10g := float64(s10u.G)
+			s10b := float64(s10u.B)
+			s10a := float64(s10u.A)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01u := src.RGBA64At(sx0, sy1)
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx0, smp.Y+sy1).RGBA()
+				s01u.R = uint16(uint32(s01u.R) * ma / 0xffff)
+				s01u.G = uint16(uint32(s01u.G) * ma / 0xffff)
+				s01u.B = uint16(uint32(s01u.B) * ma / 0xffff)
+				s01u.A = uint16(uint32(s01u.A) * ma / 0xffff)
+			}
+			s01r := float64(s01u.R)
+			s01g := float64(s01u.G)
+			s01b := float64(s01u.B)
+			s01a := float64(s01u.A)
+			s11u := src.RGBA64At(sx1, sy1)
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx1, smp.Y+sy1).RGBA()
+				s11u.R = uint16(uint32(s11u.R) * ma / 0xffff)
+				s11u.G = uint16(uint32(s11u.G) * ma / 0xffff)
+				s11u.B = uint16(uint32(s11u.B) * ma / 0xffff)
+				s11u.A = uint16(uint32(s11u.A) * ma / 0xffff)
+			}
+			s11r := float64(s11u.R)
+			s11g := float64(s11u.G)
+			s11b := float64(s11u.B)
+			s11a := float64(s11u.A)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			p := color.RGBA64{uint16(s11r), uint16(s11g), uint16(s11b), uint16(s11a)}
+			q := dst.RGBA64At(dr.Min.X+int(dx), dr.Min.Y+int(dy))
+			if dstMask != nil {
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				p.R = uint16(uint32(p.R) * ma / 0xffff)
+				p.G = uint16(uint32(p.G) * ma / 0xffff)
+				p.B = uint16(uint32(p.B) * ma / 0xffff)
+				p.A = uint16(uint32(p.A) * ma / 0xffff)
+			}
+			pa1 := 0xffff - uint32(p.A) // share of the existing dst pixel q kept under p
+			dstColorRGBA64.R = uint16(uint32(q.R)*pa1/0xffff + uint32(p.R))
+			dstColorRGBA64.G = uint16(uint32(q.G)*pa1/0xffff + uint32(p.G))
+			dstColorRGBA64.B = uint16(uint32(q.B)*pa1/0xffff + uint32(p.B))
+			dstColorRGBA64.A = uint16(uint32(q.A)*pa1/0xffff + uint32(p.A))
+			dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColorRGBA64) // not Set: boxing into color.Color would allocate per pixel
+		}
+	}
+}
+
+func (ablInterpolator) transform_RGBA64Image_RGBA64Image_Src(dst RGBA64Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.RGBA64Image, sr image.Rectangle, bias image.Point, opts *Options) {
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := color.RGBA64{}
+	// For every pixel of adr (relative to dr.Min): map its center through d2s, bilinearly sample src, and store the sample in dst (Src op).
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // source point lies outside sr: leave this dst pixel untouched
+			}
+			// Columns sx0/sx1 straddle sx (xFrac1 weights sx0, xFrac0 weights sx1); past either edge of sr both collapse onto the border column.
+			sx -= 0.5
+			sx0 := int(sx)
+			xFrac0 := sx - float64(sx0)
+			xFrac1 := 1 - xFrac0
+			sx0 += bias.X
+			sx1 := sx0 + 1
+			if sx0 < sr.Min.X {
+				sx0, sx1 = sr.Min.X, sr.Min.X
+				xFrac0, xFrac1 = 0, 1
+			} else if sx1 >= sr.Max.X {
+				sx0, sx1 = sr.Max.X-1, sr.Max.X-1
+				xFrac0, xFrac1 = 1, 0
+			}
+			// Rows sy0/sy1 straddle sy in the same way (yFrac1 weights sy0, yFrac0 weights sy1).
+			sy -= 0.5
+			sy0 := int(sy)
+			yFrac0 := sy - float64(sy0)
+			yFrac1 := 1 - yFrac0
+			sy0 += bias.Y
+			sy1 := sy0 + 1
+			if sy0 < sr.Min.Y {
+				sy0, sy1 = sr.Min.Y, sr.Min.Y
+				yFrac0, yFrac1 = 0, 1
+			} else if sy1 >= sr.Max.Y {
+				sy0, sy1 = sr.Max.Y-1, sr.Max.Y-1
+				yFrac0, yFrac1 = 1, 0
+			}
+			// Fetch the four neighbours (each scaled by the srcMask alpha when a mask is set), blend across x into s10*/s11*, then across y into s11*.
+			s00u := src.RGBA64At(sx0, sy0)
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx0, smp.Y+sy0).RGBA()
+				s00u.R = uint16(uint32(s00u.R) * ma / 0xffff)
+				s00u.G = uint16(uint32(s00u.G) * ma / 0xffff)
+				s00u.B = uint16(uint32(s00u.B) * ma / 0xffff)
+				s00u.A = uint16(uint32(s00u.A) * ma / 0xffff)
+			}
+			s00r := float64(s00u.R)
+			s00g := float64(s00u.G)
+			s00b := float64(s00u.B)
+			s00a := float64(s00u.A)
+			s10u := src.RGBA64At(sx1, sy0)
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx1, smp.Y+sy0).RGBA()
+				s10u.R = uint16(uint32(s10u.R) * ma / 0xffff)
+				s10u.G = uint16(uint32(s10u.G) * ma / 0xffff)
+				s10u.B = uint16(uint32(s10u.B) * ma / 0xffff)
+				s10u.A = uint16(uint32(s10u.A) * ma / 0xffff)
+			}
+			s10r := float64(s10u.R)
+			s10g := float64(s10u.G)
+			s10b := float64(s10u.B)
+			s10a := float64(s10u.A)
+			s10r = xFrac1*s00r + xFrac0*s10r
+			s10g = xFrac1*s00g + xFrac0*s10g
+			s10b = xFrac1*s00b + xFrac0*s10b
+			s10a = xFrac1*s00a + xFrac0*s10a
+			s01u := src.RGBA64At(sx0, sy1)
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx0, smp.Y+sy1).RGBA()
+				s01u.R = uint16(uint32(s01u.R) * ma / 0xffff)
+				s01u.G = uint16(uint32(s01u.G) * ma / 0xffff)
+				s01u.B = uint16(uint32(s01u.B) * ma / 0xffff)
+				s01u.A = uint16(uint32(s01u.A) * ma / 0xffff)
+			}
+			s01r := float64(s01u.R)
+			s01g := float64(s01u.G)
+			s01b := float64(s01u.B)
+			s01a := float64(s01u.A)
+			s11u := src.RGBA64At(sx1, sy1)
+			if srcMask != nil {
+				_, _, _, ma := srcMask.At(smp.X+sx1, smp.Y+sy1).RGBA()
+				s11u.R = uint16(uint32(s11u.R) * ma / 0xffff)
+				s11u.G = uint16(uint32(s11u.G) * ma / 0xffff)
+				s11u.B = uint16(uint32(s11u.B) * ma / 0xffff)
+				s11u.A = uint16(uint32(s11u.A) * ma / 0xffff)
+			}
+			s11r := float64(s11u.R)
+			s11g := float64(s11u.G)
+			s11b := float64(s11u.B)
+			s11a := float64(s11u.A)
+			s11r = xFrac1*s01r + xFrac0*s11r
+			s11g = xFrac1*s01g + xFrac0*s11g
+			s11b = xFrac1*s01b + xFrac0*s11b
+			s11a = xFrac1*s01a + xFrac0*s11a
+			s11r = yFrac1*s10r + yFrac0*s11r
+			s11g = yFrac1*s10g + yFrac0*s11g
+			s11b = yFrac1*s10b + yFrac0*s11b
+			s11a = yFrac1*s10a + yFrac0*s11a
+			p := color.RGBA64{uint16(s11r), uint16(s11g), uint16(s11b), uint16(s11a)}
+			if dstMask != nil {
+				q := dst.RGBA64At(dr.Min.X+int(dx), dr.Min.Y+int(dy))
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				p.R = uint16(uint32(p.R) * ma / 0xffff)
+				p.G = uint16(uint32(p.G) * ma / 0xffff)
+				p.B = uint16(uint32(p.B) * ma / 0xffff)
+				p.A = uint16(uint32(p.A) * ma / 0xffff)
+				pa1 := 0xffff - ma // weight of the existing dst pixel q: the part the mask does not cover
+				dstColorRGBA64.R = uint16(uint32(q.R)*pa1/0xffff + uint32(p.R))
+				dstColorRGBA64.G = uint16(uint32(q.G)*pa1/0xffff + uint32(p.G))
+				dstColorRGBA64.B = uint16(uint32(q.B)*pa1/0xffff + uint32(p.B))
+				dstColorRGBA64.A = uint16(uint32(q.A)*pa1/0xffff + uint32(p.A))
+				dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColorRGBA64) // not Set: boxing into color.Color would allocate per pixel
+			} else {
+				dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(dy), p) // no mask: store the sample as is, again without boxing
+			}
+		}
+	}
+}
+
 func (ablInterpolator) transform_Image_Image_Over(dst Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, opts *Options) {
 	srcMask, smp := opts.SrcMask, opts.SrcMaskP
 	dstMask, dmp := opts.DstMask, opts.DstMaskP
@@ -4500,6 +5610,8 @@
 			case image.YCbCrSubsampleRatio440:
 				z.scaleX_YCbCr440(tmp, src, sr, &o)
 			}
+		case image.RGBA64Image:
+			z.scaleX_RGBA64Image(tmp, src, sr, &o)
 		default:
 			z.scaleX_Image(tmp, src, sr, &o)
 		}
@@ -4518,6 +5630,8 @@
 			switch dst := dst.(type) {
 			case *image.RGBA:
 				z.scaleY_RGBA_Over(dst, dr, adr, tmp, &o)
+			case RGBA64Image:
+				z.scaleY_RGBA64Image_Over(dst, dr, adr, tmp, &o)
 			default:
 				z.scaleY_Image_Over(dst, dr, adr, tmp, &o)
 			}
@@ -4525,6 +5639,8 @@
 			switch dst := dst.(type) {
 			case *image.RGBA:
 				z.scaleY_RGBA_Src(dst, dr, adr, tmp, &o)
+			case RGBA64Image:
+				z.scaleY_RGBA64Image_Src(dst, dr, adr, tmp, &o)
 			default:
 				z.scaleY_Image_Src(dst, dr, adr, tmp, &o)
 			}
@@ -4600,9 +5716,16 @@
 					q.transform_RGBA_NRGBA_Over(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
 				case *image.RGBA:
 					q.transform_RGBA_RGBA_Over(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+				case image.RGBA64Image:
+					q.transform_RGBA_RGBA64Image_Over(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
 				default:
 					q.transform_RGBA_Image_Over(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
 				}
+			case RGBA64Image:
+				switch src := src.(type) {
+				case image.RGBA64Image:
+					q.transform_RGBA64Image_RGBA64Image_Over(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+				}
 			default:
 				switch src := src.(type) {
 				default:
@@ -4632,9 +5755,16 @@
 					case image.YCbCrSubsampleRatio440:
 						q.transform_RGBA_YCbCr440_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
 					}
+				case image.RGBA64Image:
+					q.transform_RGBA_RGBA64Image_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
 				default:
 					q.transform_RGBA_Image_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
 				}
+			case RGBA64Image:
+				switch src := src.(type) {
+				case image.RGBA64Image:
+					q.transform_RGBA64Image_RGBA64Image_Src(dst, dr, adr, &d2s, src, sr, bias, xscale, yscale, &o)
+				}
 			default:
 				switch src := src.(type) {
 				default:
@@ -4909,6 +6039,37 @@
 	}
 }
 
+func (z *kernelScaler) scaleX_RGBA64Image(tmp [][4]float64, src image.RGBA64Image, sr image.Rectangle, opts *Options) {
+	t := 0 // write cursor into tmp: one entry per (source row, horizontal source), filled row by row
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	for y := int32(0); y < z.sh; y++ { // horizontal pass over each of the z.sh source rows
+		for _, s := range z.horizontal.sources {
+			var pr, pg, pb, pa float64
+			for _, c := range z.horizontal.contribs[s.i:s.j] {
+				pu := src.RGBA64At(sr.Min.X+int(c.coord), sr.Min.Y+int(y)) // c.coord and y are relative to sr.Min
+				if srcMask != nil {
+					_, _, _, ma := srcMask.At(smp.X+sr.Min.X+int(c.coord), smp.Y+sr.Min.Y+int(y)).RGBA() // only the mask's alpha is used
+					pu.R = uint16(uint32(pu.R) * ma / 0xffff)
+					pu.G = uint16(uint32(pu.G) * ma / 0xffff)
+					pu.B = uint16(uint32(pu.B) * ma / 0xffff)
+					pu.A = uint16(uint32(pu.A) * ma / 0xffff)
+				}
+				pr += float64(pu.R) * c.weight
+				pg += float64(pu.G) * c.weight
+				pb += float64(pu.B) * c.weight
+				pa += float64(pu.A) * c.weight
+			}
+			tmp[t] = [4]float64{ // weighted R, G, B, A sums, each scaled by s.invTotalWeightFFFF
+				pr * s.invTotalWeightFFFF,
+				pg * s.invTotalWeightFFFF,
+				pb * s.invTotalWeightFFFF,
+				pa * s.invTotalWeightFFFF,
+			}
+			t++
+		}
+	}
+}
+
 func (z *kernelScaler) scaleX_Image(tmp [][4]float64, src image.Image, sr image.Rectangle, opts *Options) {
 	t := 0
 	srcMask, smp := opts.SrcMask, opts.SrcMaskP
@@ -5009,6 +6170,102 @@
 	}
 }
 
+func (z *kernelScaler) scaleY_RGBA64Image_Over(dst RGBA64Image, dr, adr image.Rectangle, tmp [][4]float64, opts *Options) {
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := color.RGBA64{}
+	// Vertical pass: for each column dx, combine tmp's rows with the vertical weights and composite the result Over dst.
+	for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+		for dy, s := range z.vertical.sources[adr.Min.Y:adr.Max.Y] { // dy counts from 0, so the dst row is adr.Min.Y+dy
+			var pr, pg, pb, pa float64
+			for _, c := range z.vertical.contribs[s.i:s.j] {
+				p := &tmp[c.coord*z.dw+dx] // tmp is indexed as row c.coord, column dx, with row stride z.dw
+				pr += p[0] * c.weight
+				pg += p[1] * c.weight
+				pb += p[2] * c.weight
+				pa += p[3] * c.weight
+			}
+			// Clamp each color channel to at most alpha (color.RGBA64 is alpha-premultiplied).
+			if pr > pa {
+				pr = pa
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+			// Read the current dst pixel, convert the sums with ftou, optionally scale by the dstMask alpha, then blend.
+			q := dst.RGBA64At(dr.Min.X+int(dx), dr.Min.Y+int(adr.Min.Y+dy))
+			pr0 := uint32(ftou(pr * s.invTotalWeight))
+			pg0 := uint32(ftou(pg * s.invTotalWeight))
+			pb0 := uint32(ftou(pb * s.invTotalWeight))
+			pa0 := uint32(ftou(pa * s.invTotalWeight))
+			if dstMask != nil {
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(adr.Min.Y+dy)).RGBA()
+				pr0 = pr0 * ma / 0xffff
+				pg0 = pg0 * ma / 0xffff
+				pb0 = pb0 * ma / 0xffff
+				pa0 = pa0 * ma / 0xffff
+			}
+			pa1 := 0xffff - pa0 // share of the existing dst pixel q kept under the new color
+			dstColorRGBA64.R = uint16(uint32(q.R)*pa1/0xffff + pr0)
+			dstColorRGBA64.G = uint16(uint32(q.G)*pa1/0xffff + pg0)
+			dstColorRGBA64.B = uint16(uint32(q.B)*pa1/0xffff + pb0)
+			dstColorRGBA64.A = uint16(uint32(q.A)*pa1/0xffff + pa0)
+			dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(adr.Min.Y+dy), dstColorRGBA64)
+		}
+	}
+}
+
+func (z *kernelScaler) scaleY_RGBA64Image_Src(dst RGBA64Image, dr, adr image.Rectangle, tmp [][4]float64, opts *Options) {
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := color.RGBA64{}
+	// Vertical pass: for each column dx, combine tmp's rows with the vertical weights and store the result in dst (Src op).
+	for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+		for dy, s := range z.vertical.sources[adr.Min.Y:adr.Max.Y] { // dy counts from 0, so the dst row is adr.Min.Y+dy
+			var pr, pg, pb, pa float64
+			for _, c := range z.vertical.contribs[s.i:s.j] {
+				p := &tmp[c.coord*z.dw+dx] // tmp is indexed as row c.coord, column dx, with row stride z.dw
+				pr += p[0] * c.weight
+				pg += p[1] * c.weight
+				pb += p[2] * c.weight
+				pa += p[3] * c.weight
+			}
+			// Clamp each color channel to at most alpha (color.RGBA64 is alpha-premultiplied).
+			if pr > pa {
+				pr = pa
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+			// With a dstMask, mix the new color and the existing dst pixel by the mask alpha; otherwise overwrite dst.
+			if dstMask != nil {
+				q := dst.RGBA64At(dr.Min.X+int(dx), dr.Min.Y+int(adr.Min.Y+dy))
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(adr.Min.Y+dy)).RGBA()
+				pr := uint32(ftou(pr*s.invTotalWeight)) * ma / 0xffff // pr, pg, pb, pa here shadow the float64 sums with masked uint32 values
+				pg := uint32(ftou(pg*s.invTotalWeight)) * ma / 0xffff
+				pb := uint32(ftou(pb*s.invTotalWeight)) * ma / 0xffff
+				pa := uint32(ftou(pa*s.invTotalWeight)) * ma / 0xffff
+				pa1 := 0xffff - ma // weight of the existing dst pixel q: the part the mask does not cover
+				dstColorRGBA64.R = uint16(uint32(q.R)*pa1/0xffff + pr)
+				dstColorRGBA64.G = uint16(uint32(q.G)*pa1/0xffff + pg)
+				dstColorRGBA64.B = uint16(uint32(q.B)*pa1/0xffff + pb)
+				dstColorRGBA64.A = uint16(uint32(q.A)*pa1/0xffff + pa)
+				dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(adr.Min.Y+dy), dstColorRGBA64)
+			} else {
+				dstColorRGBA64.R = ftou(pr * s.invTotalWeight)
+				dstColorRGBA64.G = ftou(pg * s.invTotalWeight)
+				dstColorRGBA64.B = ftou(pb * s.invTotalWeight)
+				dstColorRGBA64.A = ftou(pa * s.invTotalWeight)
+				dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(adr.Min.Y+dy), dstColorRGBA64)
+			}
+		}
+	}
+}
+
 func (z *kernelScaler) scaleY_Image_Over(dst Image, dr, adr image.Rectangle, tmp [][4]float64, opts *Options) {
 	dstMask, dmp := opts.DstMask, opts.DstMaskP
 	dstColorRGBA64 := &color.RGBA64{}
@@ -6170,6 +7427,233 @@
 	}
 }
 
+func (q *Kernel) transform_RGBA_RGBA64Image_Over(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.RGBA64Image, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// Resamples src through d2s with kernel q and composites the result Over dst; opts is not read here.
+	// When shrinking, broaden the kernel support so that we still visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+	// Scratch weight buffers, sized 1+2*ceil(halfWidth) and reused for every destination pixel.
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // offset in dst.Pix of this row's first pixel; advanced by 4 per dx
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // source point lies outside sr: leave this dst pixel untouched
+			}
+
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+			// Kernel weight of every column in [ix, jx), then normalized by their sum. NOTE(review): no guard for a zero sum.
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+			// Same for rows: window [iy, jy) clamped to sr, normalized weights in yWeights.
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+			// Accumulate the weighted sum of source pixels, skipping taps whose combined weight is zero.
+			var pr, pg, pb, pa float64
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pu := src.RGBA64At(kx, ky)
+							pr += float64(pu.R) * w
+							pg += float64(pu.G) * w
+							pb += float64(pu.B) * w
+							pa += float64(pu.A) * w
+						}
+					}
+				}
+			}
+			// Clamp each color channel to at most alpha (color.RGBA64 is alpha-premultiplied).
+			if pr > pa {
+				pr = pa
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+			// Convert with fffftou and blend Over the 8-bit dst pixel; pa1 carries a 0x101 factor so dst bytes are combined at 16-bit scale before >> 8.
+			pr0 := uint32(fffftou(pr))
+			pg0 := uint32(fffftou(pg))
+			pb0 := uint32(fffftou(pb))
+			pa0 := uint32(fffftou(pa))
+			pa1 := (0xffff - uint32(pa0)) * 0x101
+			dst.Pix[d+0] = uint8((uint32(dst.Pix[d+0])*pa1/0xffff + pr0) >> 8)
+			dst.Pix[d+1] = uint8((uint32(dst.Pix[d+1])*pa1/0xffff + pg0) >> 8)
+			dst.Pix[d+2] = uint8((uint32(dst.Pix[d+2])*pa1/0xffff + pb0) >> 8)
+			dst.Pix[d+3] = uint8((uint32(dst.Pix[d+3])*pa1/0xffff + pa0) >> 8)
+		}
+	}
+}
+
+func (q *Kernel) transform_RGBA_RGBA64Image_Src(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.RGBA64Image, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// Resamples src through d2s with kernel q and stores the result in dst (Src op); opts is not read here.
+	// When shrinking, broaden the kernel support so that we still visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+	// Scratch weight buffers, sized 1+2*ceil(halfWidth) and reused for every destination pixel.
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		d := (dr.Min.Y+int(dy)-dst.Rect.Min.Y)*dst.Stride + (dr.Min.X+adr.Min.X-dst.Rect.Min.X)*4 // offset in dst.Pix of this row's first pixel; advanced by 4 per dx
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx, d = dx+1, d+4 {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // source point lies outside sr: leave this dst pixel untouched
+			}
+
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+			// Kernel weight of every column in [ix, jx), then normalized by their sum. NOTE(review): no guard for a zero sum.
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+			// Same for rows: window [iy, jy) clamped to sr, normalized weights in yWeights.
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+			// Accumulate the weighted sum of source pixels, skipping taps whose combined weight is zero.
+			var pr, pg, pb, pa float64
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pu := src.RGBA64At(kx, ky)
+							pr += float64(pu.R) * w
+							pg += float64(pu.G) * w
+							pb += float64(pu.B) * w
+							pa += float64(pu.A) * w
+						}
+					}
+				}
+			}
+			// Clamp each color channel to at most alpha (color.RGBA64 is alpha-premultiplied).
+			if pr > pa {
+				pr = pa
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+			// Convert with fffftou and keep the high byte of each channel for the 8-bit dst.
+			dst.Pix[d+0] = uint8(fffftou(pr) >> 8)
+			dst.Pix[d+1] = uint8(fffftou(pg) >> 8)
+			dst.Pix[d+2] = uint8(fffftou(pb) >> 8)
+			dst.Pix[d+3] = uint8(fffftou(pa) >> 8)
+		}
+	}
+}
+
 func (q *Kernel) transform_RGBA_Image_Over(dst *image.RGBA, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
 	// When shrinking, broaden the effective kernel support so that we still
 	// visit every source pixel.
@@ -6397,6 +7881,278 @@
 	}
 }
 
+func (q *Kernel) transform_RGBA64Image_RGBA64Image_Over(dst RGBA64Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.RGBA64Image, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// Resamples src through d2s with kernel q and composites the result Over dst, honoring opts.SrcMask and opts.DstMask.
+	// When shrinking, broaden the kernel support so that we still visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+	// Scratch weight buffers, sized 1+2*ceil(halfWidth) and reused for every destination pixel.
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := color.RGBA64{}
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // source point lies outside sr: leave this dst pixel untouched
+			}
+
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+			// Kernel weight of every column in [ix, jx), then normalized by their sum. NOTE(review): no guard for a zero sum.
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+			// Same for rows: window [iy, jy) clamped to sr, normalized weights in yWeights.
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+			// Accumulate the weighted sum of source pixels (scaled by the srcMask alpha when set), skipping zero-weight taps.
+			var pr, pg, pb, pa float64
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pu := src.RGBA64At(kx, ky)
+							if srcMask != nil {
+								_, _, _, ma := srcMask.At(smp.X+kx, smp.Y+ky).RGBA()
+								pu.R = uint16(uint32(pu.R) * ma / 0xffff)
+								pu.G = uint16(uint32(pu.G) * ma / 0xffff)
+								pu.B = uint16(uint32(pu.B) * ma / 0xffff)
+								pu.A = uint16(uint32(pu.A) * ma / 0xffff)
+							}
+							pr += float64(pu.R) * w
+							pg += float64(pu.G) * w
+							pb += float64(pu.B) * w
+							pa += float64(pu.A) * w
+						}
+					}
+				}
+			}
+			// Clamp each color channel to at most alpha (color.RGBA64 is alpha-premultiplied).
+			if pr > pa {
+				pr = pa
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+			// Read the current dst pixel, convert the sums with fffftou, optionally scale by the dstMask alpha, then blend.
+			q := dst.RGBA64At(dr.Min.X+int(dx), dr.Min.Y+int(dy)) // NOTE: this q shadows the receiver for the rest of the iteration
+			pr0 := uint32(fffftou(pr))
+			pg0 := uint32(fffftou(pg))
+			pb0 := uint32(fffftou(pb))
+			pa0 := uint32(fffftou(pa))
+			if dstMask != nil {
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				pr0 = pr0 * ma / 0xffff
+				pg0 = pg0 * ma / 0xffff
+				pb0 = pb0 * ma / 0xffff
+				pa0 = pa0 * ma / 0xffff
+			}
+			pa1 := 0xffff - pa0 // share of the existing dst pixel kept under the new color
+			dstColorRGBA64.R = uint16(uint32(q.R)*pa1/0xffff + pr0)
+			dstColorRGBA64.G = uint16(uint32(q.G)*pa1/0xffff + pg0)
+			dstColorRGBA64.B = uint16(uint32(q.B)*pa1/0xffff + pb0)
+			dstColorRGBA64.A = uint16(uint32(q.A)*pa1/0xffff + pa0)
+			dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColorRGBA64)
+		}
+	}
+}
+
+func (q *Kernel) transform_RGBA64Image_RGBA64Image_Src(dst RGBA64Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.RGBA64Image, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
+	// Resamples src through d2s with kernel q and stores the result in dst (Src op), honoring opts.SrcMask and opts.DstMask.
+	// When shrinking, broaden the kernel support so that we still visit every source pixel.
+	xHalfWidth, xKernelArgScale := q.Support, 1.0
+	if xscale > 1 {
+		xHalfWidth *= xscale
+		xKernelArgScale = 1 / xscale
+	}
+	yHalfWidth, yKernelArgScale := q.Support, 1.0
+	if yscale > 1 {
+		yHalfWidth *= yscale
+		yKernelArgScale = 1 / yscale
+	}
+	// Scratch weight buffers, sized 1+2*ceil(halfWidth) and reused for every destination pixel.
+	xWeights := make([]float64, 1+2*int(math.Ceil(xHalfWidth)))
+	yWeights := make([]float64, 1+2*int(math.Ceil(yHalfWidth)))
+
+	srcMask, smp := opts.SrcMask, opts.SrcMaskP
+	dstMask, dmp := opts.DstMask, opts.DstMaskP
+	dstColorRGBA64 := color.RGBA64{}
+
+	for dy := int32(adr.Min.Y); dy < int32(adr.Max.Y); dy++ {
+		dyf := float64(dr.Min.Y+int(dy)) + 0.5
+		for dx := int32(adr.Min.X); dx < int32(adr.Max.X); dx++ {
+			dxf := float64(dr.Min.X+int(dx)) + 0.5
+			sx := d2s[0]*dxf + d2s[1]*dyf + d2s[2]
+			sy := d2s[3]*dxf + d2s[4]*dyf + d2s[5]
+			if !(image.Point{int(sx) + bias.X, int(sy) + bias.Y}).In(sr) {
+				continue // source point lies outside sr: leave this dst pixel untouched
+			}
+
+			// TODO: adjust the bias so that we can use int(f) instead
+			// of math.Floor(f) and math.Ceil(f).
+			sx += float64(bias.X)
+			sx -= 0.5
+			ix := int(math.Floor(sx - xHalfWidth))
+			if ix < sr.Min.X {
+				ix = sr.Min.X
+			}
+			jx := int(math.Ceil(sx + xHalfWidth))
+			if jx > sr.Max.X {
+				jx = sr.Max.X
+			}
+			// Kernel weight of every column in [ix, jx), then normalized by their sum. NOTE(review): no guard for a zero sum.
+			totalXWeight := 0.0
+			for kx := ix; kx < jx; kx++ {
+				xWeight := 0.0
+				if t := abs((sx - float64(kx)) * xKernelArgScale); t < q.Support {
+					xWeight = q.At(t)
+				}
+				xWeights[kx-ix] = xWeight
+				totalXWeight += xWeight
+			}
+			for x := range xWeights[:jx-ix] {
+				xWeights[x] /= totalXWeight
+			}
+			// Same for rows: window [iy, jy) clamped to sr, normalized weights in yWeights.
+			sy += float64(bias.Y)
+			sy -= 0.5
+			iy := int(math.Floor(sy - yHalfWidth))
+			if iy < sr.Min.Y {
+				iy = sr.Min.Y
+			}
+			jy := int(math.Ceil(sy + yHalfWidth))
+			if jy > sr.Max.Y {
+				jy = sr.Max.Y
+			}
+
+			totalYWeight := 0.0
+			for ky := iy; ky < jy; ky++ {
+				yWeight := 0.0
+				if t := abs((sy - float64(ky)) * yKernelArgScale); t < q.Support {
+					yWeight = q.At(t)
+				}
+				yWeights[ky-iy] = yWeight
+				totalYWeight += yWeight
+			}
+			for y := range yWeights[:jy-iy] {
+				yWeights[y] /= totalYWeight
+			}
+			// Accumulate the weighted sum of source pixels (scaled by the srcMask alpha when set), skipping zero-weight taps.
+			var pr, pg, pb, pa float64
+			for ky := iy; ky < jy; ky++ {
+				if yWeight := yWeights[ky-iy]; yWeight != 0 {
+					for kx := ix; kx < jx; kx++ {
+						if w := xWeights[kx-ix] * yWeight; w != 0 {
+							pu := src.RGBA64At(kx, ky)
+							if srcMask != nil {
+								_, _, _, ma := srcMask.At(smp.X+kx, smp.Y+ky).RGBA()
+								pu.R = uint16(uint32(pu.R) * ma / 0xffff)
+								pu.G = uint16(uint32(pu.G) * ma / 0xffff)
+								pu.B = uint16(uint32(pu.B) * ma / 0xffff)
+								pu.A = uint16(uint32(pu.A) * ma / 0xffff)
+							}
+							pr += float64(pu.R) * w
+							pg += float64(pu.G) * w
+							pb += float64(pu.B) * w
+							pa += float64(pu.A) * w
+						}
+					}
+				}
+			}
+			// Clamp each color channel to at most alpha (color.RGBA64 is alpha-premultiplied).
+			if pr > pa {
+				pr = pa
+			}
+			if pg > pa {
+				pg = pa
+			}
+			if pb > pa {
+				pb = pa
+			}
+			// With a dstMask, mix the new color and the existing dst pixel by the mask alpha; otherwise overwrite dst.
+			if dstMask != nil {
+				q := dst.RGBA64At(dr.Min.X+int(dx), dr.Min.Y+int(dy)) // NOTE: this q shadows the receiver inside this branch
+				_, _, _, ma := dstMask.At(dmp.X+dr.Min.X+int(dx), dmp.Y+dr.Min.Y+int(dy)).RGBA()
+				pr := uint32(fffftou(pr)) * ma / 0xffff // pr, pg, pb, pa here shadow the float64 sums with masked uint32 values
+				pg := uint32(fffftou(pg)) * ma / 0xffff
+				pb := uint32(fffftou(pb)) * ma / 0xffff
+				pa := uint32(fffftou(pa)) * ma / 0xffff
+				pa1 := 0xffff - ma // weight of the existing dst pixel: the part the mask does not cover
+				dstColorRGBA64.R = uint16(uint32(q.R)*pa1/0xffff + pr)
+				dstColorRGBA64.G = uint16(uint32(q.G)*pa1/0xffff + pg)
+				dstColorRGBA64.B = uint16(uint32(q.B)*pa1/0xffff + pb)
+				dstColorRGBA64.A = uint16(uint32(q.A)*pa1/0xffff + pa)
+				dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColorRGBA64)
+			} else {
+				dstColorRGBA64.R = fffftou(pr)
+				dstColorRGBA64.G = fffftou(pg)
+				dstColorRGBA64.B = fffftou(pb)
+				dstColorRGBA64.A = fffftou(pa)
+				dst.SetRGBA64(dr.Min.X+int(dx), dr.Min.Y+int(dy), dstColorRGBA64)
+			}
+		}
+	}
+}
+
 func (q *Kernel) transform_Image_Image_Over(dst Image, dr, adr image.Rectangle, d2s *f64.Aff3, src image.Image, sr image.Rectangle, bias image.Point, xscale, yscale float64, opts *Options) {
 	// When shrinking, broaden the effective kernel support so that we still
 	// visit every source pixel.
diff --git a/draw/scale_test.go b/draw/scale_test.go
index 042a82d..bd6ff36 100644
--- a/draw/scale_test.go
+++ b/draw/scale_test.go
@@ -555,11 +555,68 @@
 	dst := image.NewRGBA(bounds)
 	mask := image.NewRGBA(bounds)
 
-	Copy(dst, image.ZP, src, bounds, Src, &Options{
+	Copy(dst, image.Point{}, src, bounds, Src, &Options{
 		DstMask: mask,
 	})
 }
 
+func TestScaleRGBA64ImageAllocations(t *testing.T) {
+	// The goal of RGBA64Image is to prevent heap allocation of the color
+	// argument by using a non-interface type. Assert that we meet that goal.
+	// This assumes *image.RGBA64 has no fast path that would be used instead.
+	src := image.NewRGBA64(image.Rect(0, 0, 16, 32))
+	dst := image.NewRGBA64(image.Rect(0, 0, 32, 16))
+	fillPix(rand.New(rand.NewSource(1)), src.Pix, dst.Pix)
+	t.Run("Over", func(t *testing.T) {
+		allocs := testing.AllocsPerRun(10, func() {
+			CatmullRom.Scale(dst, dst.Bounds(), src, src.Bounds(), Over, nil)
+		})
+		// Scale (and Transform, tested below) allocate on their own, so the
+		// allocation count will never be zero. The expectation we want to
+		// check is that it does not scale linearly with the number of pixels
+		// in the image. We could test that directly, but it's sufficient to
+		// test that we have far fewer allocations than the pixel count, 512.
+		if allocs > 8 {
+			t.Errorf("too many allocations: %v", allocs)
+		}
+	})
+	t.Run("Src", func(t *testing.T) {
+		allocs := testing.AllocsPerRun(10, func() {
+			CatmullRom.Scale(dst, dst.Bounds(), src, src.Bounds(), Src, nil)
+		})
+		if allocs > 8 {
+			t.Errorf("too many allocations: %v", allocs)
+		}
+	})
+}
+
+func TestTransformRGBA64ImageAllocations(t *testing.T) {
+	// This assumes there is no fast path for *image.RGBA64.
+	src := image.NewRGBA64(image.Rect(0, 0, 16, 32))
+	dst := image.NewRGBA64(image.Rect(0, 0, 32, 16))
+	fillPix(rand.New(rand.NewSource(1)), src.Pix, dst.Pix)
+	mat := f64.Aff3{
+		2, 0, 0,
+		0, 0.5, 0,
+	}
+	t.Run("Over", func(t *testing.T) {
+		allocs := testing.AllocsPerRun(10, func() {
+			CatmullRom.Transform(dst, mat, src, src.Bounds(), Over, nil)
+		})
+		if allocs > 8 {
+			t.Errorf("too many allocations: %v", allocs)
+		}
+	})
+	t.Run("Src", func(t *testing.T) {
+		allocs := testing.AllocsPerRun(10, func() {
+			CatmullRom.Transform(dst, mat, src, src.Bounds(), Src, nil)
+		})
+		if allocs > 8 {
+			t.Errorf("too many allocations: %v", allocs)
+		}
+	})
+}
+
 // The fooWrapper types wrap the dst or src image to avoid triggering the
 // type-specific fast path implementations.
 type (
@@ -602,6 +659,12 @@
 	return m, nil
 }
 
+func srcRGBA64(boundsHint image.Rectangle) (image.Image, error) {
+	m := image.NewRGBA64(boundsHint)
+	fillPix(rand.New(rand.NewSource(4)), m.Pix)
+	return m, nil
+}
+
 func srcLarge(boundsHint image.Rectangle) (image.Image, error) {
 	// 3072 x 2304 is over 7 million pixels at 4:3, comparable to a
 	// 2015 smart-phone camera's output.
@@ -686,42 +749,54 @@
 func BenchmarkTformNNOverRGBA(b *testing.B) { benchTform(b, 200, 150, Over, srcRGBA, NearestNeighbor) }
 func BenchmarkTformNNOverUnif(b *testing.B) { benchTform(b, 200, 150, Over, srcUnif, NearestNeighbor) }
 
-func BenchmarkScaleABSrcGray(b *testing.B)  { benchScale(b, 200, 150, Src, srcGray, ApproxBiLinear) }
-func BenchmarkScaleABSrcNRGBA(b *testing.B) { benchScale(b, 200, 150, Src, srcNRGBA, ApproxBiLinear) }
-func BenchmarkScaleABSrcRGBA(b *testing.B)  { benchScale(b, 200, 150, Src, srcRGBA, ApproxBiLinear) }
-func BenchmarkScaleABSrcYCbCr(b *testing.B) { benchScale(b, 200, 150, Src, srcYCbCr, ApproxBiLinear) }
+func BenchmarkScaleABSrcGray(b *testing.B)   { benchScale(b, 200, 150, Src, srcGray, ApproxBiLinear) }
+func BenchmarkScaleABSrcNRGBA(b *testing.B)  { benchScale(b, 200, 150, Src, srcNRGBA, ApproxBiLinear) }
+func BenchmarkScaleABSrcRGBA(b *testing.B)   { benchScale(b, 200, 150, Src, srcRGBA, ApproxBiLinear) }
+func BenchmarkScaleABSrcYCbCr(b *testing.B)  { benchScale(b, 200, 150, Src, srcYCbCr, ApproxBiLinear) }
+func BenchmarkScaleABSrcRGBA64(b *testing.B) { benchScale(b, 200, 150, Src, srcRGBA64, ApproxBiLinear) }
 
 func BenchmarkScaleABOverGray(b *testing.B)  { benchScale(b, 200, 150, Over, srcGray, ApproxBiLinear) }
 func BenchmarkScaleABOverNRGBA(b *testing.B) { benchScale(b, 200, 150, Over, srcNRGBA, ApproxBiLinear) }
 func BenchmarkScaleABOverRGBA(b *testing.B)  { benchScale(b, 200, 150, Over, srcRGBA, ApproxBiLinear) }
 func BenchmarkScaleABOverYCbCr(b *testing.B) { benchScale(b, 200, 150, Over, srcYCbCr, ApproxBiLinear) }
+func BenchmarkScaleABOverRGBA64(b *testing.B) {
+	benchScale(b, 200, 150, Over, srcRGBA64, ApproxBiLinear)
+}
 
-func BenchmarkTformABSrcGray(b *testing.B)  { benchTform(b, 200, 150, Src, srcGray, ApproxBiLinear) }
-func BenchmarkTformABSrcNRGBA(b *testing.B) { benchTform(b, 200, 150, Src, srcNRGBA, ApproxBiLinear) }
-func BenchmarkTformABSrcRGBA(b *testing.B)  { benchTform(b, 200, 150, Src, srcRGBA, ApproxBiLinear) }
-func BenchmarkTformABSrcYCbCr(b *testing.B) { benchTform(b, 200, 150, Src, srcYCbCr, ApproxBiLinear) }
+func BenchmarkTformABSrcGray(b *testing.B)   { benchTform(b, 200, 150, Src, srcGray, ApproxBiLinear) }
+func BenchmarkTformABSrcNRGBA(b *testing.B)  { benchTform(b, 200, 150, Src, srcNRGBA, ApproxBiLinear) }
+func BenchmarkTformABSrcRGBA(b *testing.B)   { benchTform(b, 200, 150, Src, srcRGBA, ApproxBiLinear) }
+func BenchmarkTformABSrcYCbCr(b *testing.B)  { benchTform(b, 200, 150, Src, srcYCbCr, ApproxBiLinear) }
+func BenchmarkTformABSrcRGBA64(b *testing.B) { benchTform(b, 200, 150, Src, srcRGBA64, ApproxBiLinear) }
 
 func BenchmarkTformABOverGray(b *testing.B)  { benchTform(b, 200, 150, Over, srcGray, ApproxBiLinear) }
 func BenchmarkTformABOverNRGBA(b *testing.B) { benchTform(b, 200, 150, Over, srcNRGBA, ApproxBiLinear) }
 func BenchmarkTformABOverRGBA(b *testing.B)  { benchTform(b, 200, 150, Over, srcRGBA, ApproxBiLinear) }
 func BenchmarkTformABOverYCbCr(b *testing.B) { benchTform(b, 200, 150, Over, srcYCbCr, ApproxBiLinear) }
+func BenchmarkTformABOverRGBA64(b *testing.B) {
+	benchTform(b, 200, 150, Over, srcRGBA64, ApproxBiLinear)
+}
 
-func BenchmarkScaleCRSrcGray(b *testing.B)  { benchScale(b, 200, 150, Src, srcGray, CatmullRom) }
-func BenchmarkScaleCRSrcNRGBA(b *testing.B) { benchScale(b, 200, 150, Src, srcNRGBA, CatmullRom) }
-func BenchmarkScaleCRSrcRGBA(b *testing.B)  { benchScale(b, 200, 150, Src, srcRGBA, CatmullRom) }
-func BenchmarkScaleCRSrcYCbCr(b *testing.B) { benchScale(b, 200, 150, Src, srcYCbCr, CatmullRom) }
+func BenchmarkScaleCRSrcGray(b *testing.B)   { benchScale(b, 200, 150, Src, srcGray, CatmullRom) }
+func BenchmarkScaleCRSrcNRGBA(b *testing.B)  { benchScale(b, 200, 150, Src, srcNRGBA, CatmullRom) }
+func BenchmarkScaleCRSrcRGBA(b *testing.B)   { benchScale(b, 200, 150, Src, srcRGBA, CatmullRom) }
+func BenchmarkScaleCRSrcYCbCr(b *testing.B)  { benchScale(b, 200, 150, Src, srcYCbCr, CatmullRom) }
+func BenchmarkScaleCRSrcRGBA64(b *testing.B) { benchScale(b, 200, 150, Src, srcRGBA64, CatmullRom) }
 
-func BenchmarkScaleCROverGray(b *testing.B)  { benchScale(b, 200, 150, Over, srcGray, CatmullRom) }
-func BenchmarkScaleCROverNRGBA(b *testing.B) { benchScale(b, 200, 150, Over, srcNRGBA, CatmullRom) }
-func BenchmarkScaleCROverRGBA(b *testing.B)  { benchScale(b, 200, 150, Over, srcRGBA, CatmullRom) }
-func BenchmarkScaleCROverYCbCr(b *testing.B) { benchScale(b, 200, 150, Over, srcYCbCr, CatmullRom) }
+func BenchmarkScaleCROverGray(b *testing.B)   { benchScale(b, 200, 150, Over, srcGray, CatmullRom) }
+func BenchmarkScaleCROverNRGBA(b *testing.B)  { benchScale(b, 200, 150, Over, srcNRGBA, CatmullRom) }
+func BenchmarkScaleCROverRGBA(b *testing.B)   { benchScale(b, 200, 150, Over, srcRGBA, CatmullRom) }
+func BenchmarkScaleCROverYCbCr(b *testing.B)  { benchScale(b, 200, 150, Over, srcYCbCr, CatmullRom) }
+func BenchmarkScaleCROverRGBA64(b *testing.B) { benchScale(b, 200, 150, Over, srcRGBA64, CatmullRom) }
 
-func BenchmarkTformCRSrcGray(b *testing.B)  { benchTform(b, 200, 150, Src, srcGray, CatmullRom) }
-func BenchmarkTformCRSrcNRGBA(b *testing.B) { benchTform(b, 200, 150, Src, srcNRGBA, CatmullRom) }
-func BenchmarkTformCRSrcRGBA(b *testing.B)  { benchTform(b, 200, 150, Src, srcRGBA, CatmullRom) }
-func BenchmarkTformCRSrcYCbCr(b *testing.B) { benchTform(b, 200, 150, Src, srcYCbCr, CatmullRom) }
+func BenchmarkTformCRSrcGray(b *testing.B)   { benchTform(b, 200, 150, Src, srcGray, CatmullRom) }
+func BenchmarkTformCRSrcNRGBA(b *testing.B)  { benchTform(b, 200, 150, Src, srcNRGBA, CatmullRom) }
+func BenchmarkTformCRSrcRGBA(b *testing.B)   { benchTform(b, 200, 150, Src, srcRGBA, CatmullRom) }
+func BenchmarkTformCRSrcYCbCr(b *testing.B)  { benchTform(b, 200, 150, Src, srcYCbCr, CatmullRom) }
+func BenchmarkTformCRSrcRGBA64(b *testing.B) { benchTform(b, 200, 150, Src, srcRGBA64, CatmullRom) }
 
-func BenchmarkTformCROverGray(b *testing.B)  { benchTform(b, 200, 150, Over, srcGray, CatmullRom) }
-func BenchmarkTformCROverNRGBA(b *testing.B) { benchTform(b, 200, 150, Over, srcNRGBA, CatmullRom) }
-func BenchmarkTformCROverRGBA(b *testing.B)  { benchTform(b, 200, 150, Over, srcRGBA, CatmullRom) }
-func BenchmarkTformCROverYCbCr(b *testing.B) { benchTform(b, 200, 150, Over, srcYCbCr, CatmullRom) }
+func BenchmarkTformCROverGray(b *testing.B)   { benchTform(b, 200, 150, Over, srcGray, CatmullRom) }
+func BenchmarkTformCROverNRGBA(b *testing.B)  { benchTform(b, 200, 150, Over, srcNRGBA, CatmullRom) }
+func BenchmarkTformCROverRGBA(b *testing.B)   { benchTform(b, 200, 150, Over, srcRGBA, CatmullRom) }
+func BenchmarkTformCROverYCbCr(b *testing.B)  { benchTform(b, 200, 150, Over, srcYCbCr, CatmullRom) }
+func BenchmarkTformCROverRGBA64(b *testing.B) { benchTform(b, 200, 150, Over, srcRGBA64, CatmullRom) }