runtime: copy garbage collector from Go 1.8 runtime

This giant patch replaces the old Go 1.4 memory allocator and garbage
collector with the new Go 1.8 code.  The memory allocator is fairly
similar to the old one, though now written in Go rather than C.  The
garbage collector is completely different.  It now uses ptrmask and
gcprog information, which requires changes in the compiler and the
reflect package as well as the runtime.  And, of course, the garbage
collector now runs concurrently with program execution.
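
As a rough illustration of the two encodings: a ptrmask is a bitmap
with one bit per pointer-sized word of a type, telling the collector
which words may hold pointers, while a gcprog is a compact program
used for types too large or repetitive for a plain bitmap.  The
sketch below is illustrative only (buildPtrmask and wordHasPointer
are made-up names, and a three-word struct layout on a 64-bit target
is assumed); it builds the same shape of bitmap that the runtime
reads from a type's gcdata field when the kindGCProg flag is not set.

    package main

    import "fmt"

    // wordHasPointer reports whether word i of a hypothetical type
    // layout holds a pointer.  Here we hard-code a struct laid out
    // as { ptr, int64, ptr }.
    func wordHasPointer(i uintptr) bool {
        return i == 0 || i == 2
    }

    // buildPtrmask packs one bit per pointer-sized word into a byte
    // slice: a set bit means "this word may contain a pointer".
    func buildPtrmask(sizeInWords uintptr) []byte {
        mask := make([]byte, (sizeInWords+7)/8)
        for i := uintptr(0); i < sizeInWords; i++ {
            if wordHasPointer(i) {
                mask[i/8] |= 1 << (i % 8)
            }
        }
        return mask
    }

    func main() {
        fmt.Printf("%08b\n", buildPtrmask(3)) // prints [00000101]
    }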

In the gc toolchain the garbage collector is strict and precise at all
levels.  In the gofrontend we do not have stack maps, so stacks and
register values are scanned conservatively.  That means that an old,
no-longer-used pointer on a stack or in a register can cause a memory
object to live longer than it should.  That in turn means that
we must disable some checks for invalid pointers in the garbage
collection code.  Not only can we get an invalid pointer on the stack;
the concurrent nature of the collector means that we can in effect
resurrect a block that was already unmarked but that the collector had
not yet gotten around to freeing, and that block can in turn point to
other blocks that the collector had already managed to free.  So we
must disable pointer checks in general.  In effect we are relying on
the fact that the strict pointer checks in the gc toolchain ensure
that the garbage collector is correct, and we just assume that it is
correct for the gofrontend since we are using the same code.
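
To make the failure mode concrete, here is a minimal sketch of what
conservative scanning amounts to (hypothetical names and made-up heap
bounds throughout; this is not the actual libgo scanner): any stack
word whose value merely falls inside the heap's address range is
treated as a pointer and marked, which is exactly how a stale value
in a dead slot keeps an object, and everything reachable from it,
alive.

    package main

    import "fmt"

    // Made-up heap bounds for the sketch; the real runtime derives
    // these from its span/arena data structures.
    const (
        heapStart uintptr = 0x10000000
        heapEnd   uintptr = 0x20000000
    )

    func markObject(p uintptr) { fmt.Printf("marked %#x\n", p) }

    // scanStackConservatively marks everything that merely looks like
    // a heap pointer.  A precise scanner with stack maps would consult
    // per-frame metadata instead and skip dead or non-pointer slots.
    func scanStackConservatively(stack []uintptr) {
        for _, w := range stack {
            if w >= heapStart && w < heapEnd {
                markObject(w) // may be a stale pointer, or even an integer
            }
        }
    }

    func main() {
        // The second word is an old pointer the program no longer
        // uses; conservative scanning still retains its target.
        scanStackConservatively([]uintptr{42, 0x10001000, 7})
    }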

Change-Id: Ic7581f1a8ae9113d857f26ccdf536366114255bc
Reviewed-on: https://go-review.googlesource.com/41307
Reviewed-by: Than McIntosh <thanm@google.com>
diff --git a/go/runtime.def b/go/runtime.def
index a2f2e3a..635b7fe 100644
--- a/go/runtime.def
+++ b/go/runtime.def
@@ -220,11 +220,11 @@
 
 
 // Register roots (global variables) for the garbage collector.
-DEF_GO_RUNTIME(REGISTER_GC_ROOTS, "__go_register_gc_roots", P1(POINTER), R0())
+DEF_GO_RUNTIME(REGISTER_GC_ROOTS, "runtime.registerGCRoots", P1(POINTER), R0())
 
 
 // Allocate memory.
-DEF_GO_RUNTIME(NEW, "__go_new", P1(TYPE), R1(POINTER))
+DEF_GO_RUNTIME(NEW, "runtime.newobject", P1(TYPE), R1(POINTER))
 
 // Start a new goroutine.
 DEF_GO_RUNTIME(GO, "__go_go", P2(FUNC_PTR, POINTER), R0())
diff --git a/go/types.cc b/go/types.cc
index aa43df3..f2056aa 100644
--- a/go/types.cc
+++ b/go/types.cc
@@ -1400,18 +1400,6 @@
   return type->do_type_descriptor(gogo, name);
 }
 
-// Generate the GC symbol for this TYPE.  VALS is the data so far in this
-// symbol; extra values will be appended in do_gc_symbol.  OFFSET is the
-// offset into the symbol where the GC data is located.  STACK_SIZE is the
-// size of the GC stack when dealing with array types.
-
-void
-Type::gc_symbol(Gogo* gogo, Type* type, Expression_list** vals,
-		Expression** offset, int stack_size)
-{
-  type->do_gc_symbol(gogo, vals, offset, stack_size);
-}
-
 // Make a builtin struct type from a list of fields.  The fields are
 // pairs of a name and a type.
 
@@ -2444,6 +2432,10 @@
   Type* t = this->forwarded();
   while (t->named_type() != NULL && t->named_type()->is_alias())
     t = t->named_type()->real_type()->forwarded();
+
+  if (!t->has_pointer())
+    return gogo->backend()->nil_pointer_expression();
+
   if (t->gc_symbol_var_ == NULL)
     {
       t->make_gc_symbol_var(gogo);
@@ -2492,10 +2484,20 @@
       phash = &ins.first->second;
     }
 
+  int64_t ptrsize;
+  int64_t ptrdata;
+  if (!this->needs_gcprog(gogo, &ptrsize, &ptrdata))
+    {
+      this->gc_symbol_var_ = this->gc_ptrmask_var(gogo, ptrsize, ptrdata);
+      if (phash != NULL)
+	*phash = this->gc_symbol_var_;
+      return;
+    }
+
   std::string sym_name = this->type_descriptor_var_name(gogo, nt) + "$gc";
 
   // Build the contents of the gc symbol.
-  Expression* sym_init = this->gc_symbol_constructor(gogo);
+  Expression* sym_init = this->gcprog_constructor(gogo, ptrsize, ptrdata);
   Btype* sym_btype = sym_init->type()->get_backend(gogo);
 
   // If the type descriptor for this type is defined somewhere else, so is the
@@ -2528,28 +2530,13 @@
       is_common = true;
     }
 
-  // The current garbage collector requires that the GC symbol be
-  // aligned to at least a four byte boundary.  See the use of PRECISE
-  // and LOOP in libgo/runtime/mgc0.c.
-  int64_t align;
-  if (!sym_init->type()->backend_type_align(gogo, &align))
-    go_assert(saw_errors());
-  if (align < 4)
-    align = 4;
-  else
-    {
-      // Use default alignment.
-      align = 0;
-    }
-
   // Since we are building the GC symbol in this package, we must create the
   // variable before converting the initializer to its backend representation
   // because the initializer may refer to the GC symbol for this type.
   std::string asm_name(go_selectively_encode_id(sym_name));
   this->gc_symbol_var_ =
       gogo->backend()->implicit_variable(sym_name, asm_name,
-					 sym_btype, false, true, is_common,
-					 align);
+					 sym_btype, false, true, is_common, 0);
   if (phash != NULL)
     *phash = this->gc_symbol_var_;
 
@@ -2561,68 +2548,6 @@
 					      sym_binit);
 }
 
-// Return an array literal for the Garbage Collection information for this type.
-
-Expression*
-Type::gc_symbol_constructor(Gogo* gogo)
-{
-  Location bloc = Linemap::predeclared_location();
-
-  // The common GC Symbol data starts with the width of the type and ends
-  // with the GC Opcode GC_END.
-  // However, for certain types, the GC symbol may include extra information
-  // before the ending opcode, so we pass the expression list into
-  // Type::gc_symbol to allow it to add extra information as is necessary.
-  Expression_list* vals = new Expression_list;
-
-  Type* uintptr_t = Type::lookup_integer_type("uintptr");
-  // width
-  vals->push_back(Expression::make_type_info(this,
-					     Expression::TYPE_INFO_SIZE));
-
-  Expression* offset = Expression::make_integer_ul(0, uintptr_t, bloc);
-
-  this->do_gc_symbol(gogo, &vals, &offset, 0);
-
-  vals->push_back(Expression::make_integer_ul(GC_END, uintptr_t, bloc));
-
-  Expression* len = Expression::make_integer_ul(vals->size(), NULL,
-						bloc);
-  Array_type* gc_symbol_type = Type::make_array_type(uintptr_t, len);
-  gc_symbol_type->set_is_array_incomparable();
-  return Expression::make_array_composite_literal(gc_symbol_type, vals, bloc);
-}
-
-// A temporary function to convert a GC_symbol_expression, with type
-// *byte, to type uintptr, so that it can be stored in a GC symbol.
-// This will be discarded when we move to the Go 1.8 garbage
-// collector.
-
-static Expression*
-to_uintptr(Expression* expr)
-{
-  go_assert(expr->type()->points_to() != NULL);
-  Location loc = expr->location();
-  expr = Expression::make_cast(Type::make_pointer_type(Type::make_void_type()),
-			       expr, loc);
-  return Expression::make_cast(Type::lookup_integer_type("uintptr"), expr,
-			       loc);
-}
-
-// Advance the OFFSET of the GC symbol by this type's width.
-
-void
-Type::advance_gc_offset(Expression** offset)
-{
-  if (this->is_error_type())
-    return;
-
-  Location bloc = Linemap::predeclared_location();
-  Expression* width =
-    Expression::make_type_info(this, Expression::TYPE_INFO_SIZE);
-  *offset = Expression::make_binary(OPERATOR_PLUS, *offset, width, bloc);
-}
-
 // Return whether this type needs a GC program, and set *PTRDATA to
 // the size of the pointer data in bytes and *PTRSIZE to the size of a
 // pointer.
@@ -3862,10 +3787,6 @@
   { go_assert(saw_errors()); }
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression**, int)
-  { go_assert(saw_errors()); }
-
-  void
   do_mangled_name(Gogo*, std::string* ret) const
   { ret->push_back('E'); }
 };
@@ -3904,10 +3825,6 @@
   { }
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression**, int)
-  { }
-
-  void
   do_mangled_name(Gogo*, std::string* ret) const
   { ret->push_back('v'); }
 };
@@ -3946,9 +3863,6 @@
   { ret->append("bool"); }
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression**, int);
-
-  void
   do_mangled_name(Gogo*, std::string* ret) const
   { ret->push_back('b'); }
 };
@@ -3968,12 +3882,6 @@
     }
 }
 
-// Update the offset of the GC symbol.
-
-void
-Boolean_type::do_gc_symbol(Gogo*, Expression_list**, Expression** offset, int)
-{ this->advance_gc_offset(offset); }
-
 Type*
 Type::make_boolean_type()
 {
@@ -4483,20 +4391,6 @@
   ret->append("string");
 }
 
-// Generate GC symbol for strings.
-
-void
-String_type::do_gc_symbol(Gogo*, Expression_list** vals,
-			  Expression** offset, int)
-{
-  Location bloc = Linemap::predeclared_location();
-  Type* uintptr_type = Type::lookup_integer_type("uintptr");
-  (*vals)->push_back(Expression::make_integer_ul(GC_STRING, uintptr_type,
-						 bloc));
-  (*vals)->push_back(*offset);
-  this->advance_gc_offset(offset);
-}
-
 // Mangled name of a string type.
 
 void
@@ -4568,10 +4462,6 @@
   { go_unreachable(); }
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression**, int)
-  { go_unreachable(); }
-
-  void
   do_mangled_name(Gogo*, std::string*) const
   { go_unreachable(); }
 };
@@ -5154,22 +5044,6 @@
     }
 }
 
-// Generate GC symbol for a function type.
-
-void
-Function_type::do_gc_symbol(Gogo*, Expression_list** vals,
-			    Expression** offset, int)
-{
-  Location bloc = Linemap::predeclared_location();
-  Type* uintptr_type = Type::lookup_integer_type("uintptr");
-
-  // We use GC_APTR here because we do not currently have a way to describe the
-  // the type of the possible function closure.  FIXME.
-  (*vals)->push_back(Expression::make_integer_ul(GC_APTR, uintptr_type, bloc));
-  (*vals)->push_back(*offset);
-  this->advance_gc_offset(offset);
-}
-
 // Mangled name.
 
 void
@@ -5572,26 +5446,6 @@
   this->append_reflection(this->to_type_, gogo, ret);
 }
 
-// Generate GC symbol for pointer types.
-
-void
-Pointer_type::do_gc_symbol(Gogo*, Expression_list** vals,
-			   Expression** offset, int)
-{
-  Location loc = Linemap::predeclared_location();
-  Type* uintptr_type = Type::lookup_integer_type("uintptr");
-
-  unsigned long opval = this->to_type_->has_pointer() ? GC_PTR : GC_APTR;
-  (*vals)->push_back(Expression::make_integer_ul(opval, uintptr_type, loc));
-  (*vals)->push_back(*offset);
-
-  if (this->to_type_->has_pointer())
-    (*vals)->push_back(to_uintptr(Expression::make_gc_symbol(this->to_type_)));
-  this->advance_gc_offset(offset);
-}
-
-// Mangled name.
-
 void
 Pointer_type::do_mangled_name(Gogo* gogo, std::string* ret) const
 {
@@ -5670,10 +5524,6 @@
   { go_unreachable(); }
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression**, int)
-  { go_unreachable(); }
-
-  void
   do_mangled_name(Gogo*, std::string* ret) const
   { ret->push_back('n'); }
 };
@@ -5728,10 +5578,6 @@
   { go_assert(saw_errors()); }
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression**, int)
-  { go_unreachable(); }
-
-  void
   do_mangled_name(Gogo*, std::string*) const
   { go_assert(saw_errors()); }
 
@@ -6744,27 +6590,6 @@
   ret->push_back('}');
 }
 
-// Generate GC symbol for struct types.
-
-void
-Struct_type::do_gc_symbol(Gogo* gogo, Expression_list** vals,
-			  Expression** offset, int stack_size)
-{
-  Location bloc = Linemap::predeclared_location();
-  const Struct_field_list* sfl = this->fields();
-  for (Struct_field_list::const_iterator p = sfl->begin();
-       p != sfl->end();
-       ++p)
-    {
-      Expression* field_offset =
-  	Expression::make_struct_field_offset(this, &*p);
-      Expression* o =
-  	Expression::make_binary(OPERATOR_PLUS, *offset, field_offset, bloc);
-      Type::gc_symbol(gogo, p->type(), vals, &o, stack_size);
-    }
-  this->advance_gc_offset(offset);
-}
-
 // Mangled name.
 
 void
@@ -7998,120 +7823,6 @@
   this->append_reflection(this->element_type_, gogo, ret);
 }
 
-// GC Symbol construction for array types.
-
-void
-Array_type::do_gc_symbol(Gogo* gogo, Expression_list** vals,
-			 Expression** offset, int stack_size)
-{
-  if (this->length_ == NULL)
-    this->slice_gc_symbol(gogo, vals, offset, stack_size);
-  else
-    this->array_gc_symbol(gogo, vals, offset, stack_size);
-}
-
-// Generate the GC Symbol for a slice.
-
-void
-Array_type::slice_gc_symbol(Gogo* gogo, Expression_list** vals,
-			    Expression** offset, int)
-{
-  Location bloc = Linemap::predeclared_location();
-
-  // Differentiate between slices with zero-length and non-zero-length values.
-  Type* element_type = this->element_type();
-  int64_t element_size;
-  bool ok = element_type->backend_type_size(gogo, &element_size);
-  if (!ok) {
-    go_assert(saw_errors());
-    element_size = 4;
-  }
-
-  Type* uintptr_type = Type::lookup_integer_type("uintptr");
-  unsigned long opval = element_size == 0 ? GC_APTR : GC_SLICE;
-  (*vals)->push_back(Expression::make_integer_ul(opval, uintptr_type, bloc));
-  (*vals)->push_back(*offset);
-
-  if (element_size != 0 && ok)
-    (*vals)->push_back(to_uintptr(Expression::make_gc_symbol(element_type)));
-  this->advance_gc_offset(offset);
-}
-
-// Generate the GC symbol for an array.
-
-void
-Array_type::array_gc_symbol(Gogo* gogo, Expression_list** vals,
-			    Expression** offset, int stack_size)
-{
-  Location bloc = Linemap::predeclared_location();
-
-  Numeric_constant nc;
-  unsigned long bound;
-  if (!this->length_->numeric_constant_value(&nc)
-      || nc.to_unsigned_long(&bound) == Numeric_constant::NC_UL_NOTINT)
-    {
-      go_assert(saw_errors());
-      return;
-    }
-
-  Btype* pbtype = gogo->backend()->pointer_type(gogo->backend()->void_type());
-  int64_t pwidth = gogo->backend()->type_size(pbtype);
-  int64_t iwidth;
-  bool ok = this->backend_type_size(gogo, &iwidth);
-  if (!ok)
-    {
-      go_assert(saw_errors());
-      iwidth = 4;
-    }
-
-  Type* element_type = this->element_type();
-  if (bound < 1 || !element_type->has_pointer())
-    this->advance_gc_offset(offset);
-  else if (ok && (bound == 1 || iwidth <= 4 * pwidth))
-    {
-      for (unsigned int i = 0; i < bound; ++i)
-	Type::gc_symbol(gogo, element_type, vals, offset, stack_size);
-    }
-  else
-    {
-      Type* uintptr_type = Type::lookup_integer_type("uintptr");
-
-      if (stack_size < GC_STACK_CAPACITY)
-  	{
-	  (*vals)->push_back(Expression::make_integer_ul(GC_ARRAY_START,
-							 uintptr_type, bloc));
-  	  (*vals)->push_back(*offset);
-	  Expression* uintptr_len =
-	    Expression::make_cast(uintptr_type, this->length_, bloc);
-  	  (*vals)->push_back(uintptr_len);
-
-	  Expression* width =
-	    Expression::make_type_info(element_type,
-				       Expression::TYPE_INFO_SIZE);
-  	  (*vals)->push_back(width);
-
-	  Expression* offset2 = Expression::make_integer_ul(0, uintptr_type,
-							    bloc);
-
-	  Type::gc_symbol(gogo, element_type, vals, &offset2, stack_size + 1);
-	  (*vals)->push_back(Expression::make_integer_ul(GC_ARRAY_NEXT,
-							 uintptr_type, bloc));
-  	}
-      else
-  	{
-	  (*vals)->push_back(Expression::make_integer_ul(GC_REGION,
-							 uintptr_type, bloc));
-	  (*vals)->push_back(*offset);
-
-	  Expression* width =
-	    Expression::make_type_info(this, Expression::TYPE_INFO_SIZE);
-  	  (*vals)->push_back(width);
-	  (*vals)->push_back(to_uintptr(Expression::make_gc_symbol(this)));
-  	}
-      this->advance_gc_offset(offset);
-    }
-}
-
 // Mangled name.
 
 void
@@ -8737,21 +8448,6 @@
   this->append_reflection(this->val_type_, gogo, ret);
 }
 
-// Generate GC symbol for a map.
-
-void
-Map_type::do_gc_symbol(Gogo*, Expression_list** vals,
-		       Expression** offset, int)
-{
-  // TODO(cmang): Generate GC data for the Map elements.
-  Location bloc = Linemap::predeclared_location();
-  Type* uintptr_type = Type::lookup_integer_type("uintptr");
-
-  (*vals)->push_back(Expression::make_integer_ul(GC_APTR, uintptr_type, bloc));
-  (*vals)->push_back(*offset);
-  this->advance_gc_offset(offset);
-}
-
 // Mangled name for a map.
 
 void
@@ -8922,28 +8618,6 @@
   this->append_reflection(this->element_type_, gogo, ret);
 }
 
-// Generate GC symbol for channels.
-
-void
-Channel_type::do_gc_symbol(Gogo*, Expression_list** vals,
-			   Expression** offset, int)
-{
-  Location bloc = Linemap::predeclared_location();
-  Type* uintptr_type = Type::lookup_integer_type("uintptr");
-
-  (*vals)->push_back(Expression::make_integer_ul(GC_CHAN_PTR, uintptr_type,
-						 bloc));
-  (*vals)->push_back(*offset);
- 
-  Type* unsafeptr_type = Type::make_pointer_type(Type::make_void_type());
-  Expression* type_descriptor =
-    Expression::make_type_descriptor(this, bloc);
-  type_descriptor =
-    Expression::make_unsafe_cast(unsafeptr_type, type_descriptor, bloc);
-  (*vals)->push_back(type_descriptor);
-  this->advance_gc_offset(offset);
-}
-
 // Mangled name.
 
 void
@@ -9880,21 +9554,6 @@
   ret->append("}");
 }
 
-// Generate GC symbol for interface types.
-
-void
-Interface_type::do_gc_symbol(Gogo*, Expression_list** vals,
-			     Expression** offset, int)
-{
-  Location bloc = Linemap::predeclared_location();
-  Type* uintptr_type = Type::lookup_integer_type("uintptr");
-
-  unsigned long opval = this->is_empty() ? GC_EFACE : GC_IFACE;
-  (*vals)->push_back(Expression::make_integer_ul(opval, uintptr_type, bloc));
-  (*vals)->push_back(*offset);
-  this->advance_gc_offset(offset);
-}
-
 // Mangled name.
 
 void
@@ -11369,20 +11028,6 @@
   ret->append(Gogo::unpack_hidden_name(this->named_object_->name()));
 }
 
-// Generate GC symbol for named types.
-
-void
-Named_type::do_gc_symbol(Gogo* gogo, Expression_list** vals,
-			 Expression** offset, int stack)
-{
-  if (!this->seen_)
-    {
-      this->seen_ = true;
-      Type::gc_symbol(gogo, this->real_type(), vals, offset, stack);
-      this->seen_ = false;
-    }
-}
-
 // Get the mangled name.
 
 void
diff --git a/go/types.h b/go/types.h
index f76fab8..e0fcf0c 100644
--- a/go/types.h
+++ b/go/types.h
@@ -89,28 +89,6 @@
 static const int RUNTIME_TYPE_KIND_GC_PROG = (1 << 6);
 static const int RUNTIME_TYPE_KIND_NO_POINTERS = (1 << 7);
 
-// GC instruction opcodes.  These must match the values in libgo/runtime/mgc0.h.
-enum GC_Opcode
-{
-  GC_END = 0,     // End of object, loop or subroutine.
-  GC_PTR,         // A typed pointer.
-  GC_APTR,        // Pointer to an arbitrary object.
-  GC_ARRAY_START, // Start an array with a fixed length.
-  GC_ARRAY_NEXT,  // The next element of an array.
-  GC_CALL,        // Call a subroutine.
-  GC_CHAN_PTR,    // Go channel.
-  GC_STRING,      // Go string.
-  GC_EFACE,       // interface{}.
-  GC_IFACE,       // interface{...}.
-  GC_SLICE,       // Go slice.
-  GC_REGION,      // A region/part of the current object.
-
-  GC_NUM_INSTR    // Number of instruction opcodes
-};
-
-// The GC Stack Capacity must match the value in libgo/runtime/mgc0.h.
-static const int GC_STACK_CAPACITY = 8;
-
 // To build the complete list of methods for a named type we need to
 // gather all methods from anonymous fields.  Those methods may
 // require an arbitrary set of indirections and field offsets.  There
@@ -1067,9 +1045,6 @@
   do_type_descriptor(Gogo*, Named_type* name) = 0;
 
   virtual void
-  do_gc_symbol(Gogo*, Expression_list**, Expression**, int) = 0;
-
-  virtual void
   do_reflection(Gogo*, std::string*) const = 0;
 
   virtual void
@@ -1124,22 +1099,6 @@
   type_descriptor_constructor(Gogo*, int runtime_type_kind, Named_type*,
 			      const Methods*, bool only_value_methods);
 
-  // Generate the GC symbol for this TYPE.  VALS is the data so far in this
-  // symbol; extra values will be appended in do_gc_symbol.  OFFSET is the
-  // offset into the symbol where the GC data is located.  STACK_SIZE is the
-  // size of the GC stack when dealing with array types.
-  static void
-  gc_symbol(Gogo*, Type* type, Expression_list** vals, Expression** offset,
-	    int stack_size);
-
-  // Build a composite literal for the GC symbol of this type.
-  Expression*
-  gc_symbol_constructor(Gogo*);
-
-  // Advance the OFFSET of the GC symbol by the size of this type.
-  void
-  advance_gc_offset(Expression** offset);
-
   // For the benefit of child class reflection string generation.
   void
   append_reflection(const Type* type, Gogo* gogo, std::string* ret) const
@@ -1639,10 +1598,6 @@
   do_reflection(Gogo*, std::string*) const;
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression** offset, int)
-  { this->advance_gc_offset(offset); }
-
-  void
   do_mangled_name(Gogo*, std::string*) const;
 
  private:
@@ -1729,10 +1684,6 @@
   do_reflection(Gogo*, std::string*) const;
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression** offset, int)
-  { this->advance_gc_offset(offset); }
-
-  void
   do_mangled_name(Gogo*, std::string*) const;
 
  private:
@@ -1811,10 +1762,6 @@
   do_reflection(Gogo*, std::string*) const;
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression** offset, int)
-  { this->advance_gc_offset(offset); }
-
-  void
   do_mangled_name(Gogo*, std::string*) const;
 
  private:
@@ -1869,9 +1816,6 @@
   do_reflection(Gogo*, std::string*) const;
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression**, int);
-
-  void
   do_mangled_name(Gogo*, std::string* ret) const;
 
  private:
@@ -2027,9 +1971,6 @@
   do_reflection(Gogo*, std::string*) const;
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression**, int);
-
-  void
   do_mangled_name(Gogo*, std::string*) const;
 
   void
@@ -2153,9 +2094,6 @@
   do_reflection(Gogo*, std::string*) const;
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression**, int);
-
-  void
   do_mangled_name(Gogo*, std::string*) const;
 
   void
@@ -2475,9 +2413,6 @@
   do_reflection(Gogo*, std::string*) const;
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression**, int);
-
-  void
   do_mangled_name(Gogo*, std::string*) const;
 
   void
@@ -2650,9 +2585,6 @@
   do_reflection(Gogo*, std::string*) const;
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression**, int);
-
-  void
   do_mangled_name(Gogo*, std::string*) const;
 
   void
@@ -2668,12 +2600,6 @@
   Expression*
   slice_type_descriptor(Gogo*, Named_type*);
 
-  void
-  slice_gc_symbol(Gogo*, Expression_list**, Expression**, int);
-
-  void
-  array_gc_symbol(Gogo*, Expression_list**, Expression**, int);
-
   // The type of elements of the array.
   Type* element_type_;
   // The number of elements.  This may be NULL.
@@ -2773,9 +2699,6 @@
   do_reflection(Gogo*, std::string*) const;
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression**, int);
-
-  void
   do_mangled_name(Gogo*, std::string*) const;
 
   void
@@ -2888,9 +2811,6 @@
   do_reflection(Gogo*, std::string*) const;
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression**, int);
-
-  void
   do_mangled_name(Gogo*, std::string*) const;
 
   void
@@ -3035,9 +2955,6 @@
   do_reflection(Gogo*, std::string*) const;
 
   void
-  do_gc_symbol(Gogo*, Expression_list**, Expression**, int);
-
-  void
   do_mangled_name(Gogo*, std::string*) const;
 
   void
@@ -3343,10 +3260,6 @@
   do_reflection(Gogo*, std::string*) const;
 
   void
-  do_gc_symbol(Gogo* gogo, Expression_list** vals, Expression** offset,
-	       int stack);
-
-  void
   do_mangled_name(Gogo*, std::string* ret) const;
 
   void
@@ -3503,11 +3416,6 @@
   do_reflection(Gogo*, std::string*) const;
 
   void
-  do_gc_symbol(Gogo* gogo, Expression_list** vals, Expression** offset,
-	       int stack_size)
-  { Type::gc_symbol(gogo, this->real_type(), vals, offset, stack_size); }
-
-  void
   do_mangled_name(Gogo*, std::string* ret) const;
 
   void
diff --git a/libgo/Makefile.am b/libgo/Makefile.am
index 3c9d1ff..545ca05 100644
--- a/libgo/Makefile.am
+++ b/libgo/Makefile.am
@@ -392,12 +392,6 @@
 	unicode/utf16.gox \
 	unicode/utf8.gox
 
-if HAVE_SYS_MMAN_H
-runtime_mem_file = runtime/mem.c
-else
-runtime_mem_file = runtime/mem_posix_memalign.c
-endif
-
 if LIBGO_IS_RTEMS
 rtems_task_variable_add_file = runtime/rtems-task-variable-add.c
 else
@@ -457,7 +451,6 @@
 	runtime/go-memmove.c \
 	runtime/go-nanotime.c \
 	runtime/go-now.c \
-	runtime/go-new.c \
 	runtime/go-nosys.c \
 	runtime/go-reflect-call.c \
 	runtime/go-runtime-error.c \
@@ -465,31 +458,20 @@
 	runtime/go-signal.c \
 	runtime/go-strslice.c \
 	runtime/go-typedesc-equal.c \
-	runtime/go-unsafe-new.c \
-	runtime/go-unsafe-newarray.c \
 	runtime/go-unsafe-pointer.c \
 	runtime/go-unsetenv.c \
 	runtime/go-unwind.c \
 	runtime/go-varargs.c \
 	runtime/env_posix.c \
-	runtime/heapdump.c \
-	runtime/mcache.c \
-	runtime/mcentral.c \
-	$(runtime_mem_file) \
-	runtime/mfixalloc.c \
-	runtime/mgc0.c \
-	runtime/mheap.c \
-	runtime/msize.c \
 	runtime/panic.c \
-	runtime/parfor.c \
 	runtime/print.c \
 	runtime/proc.c \
 	runtime/runtime_c.c \
+	runtime/stack.c \
 	runtime/thread.c \
 	$(runtime_thread_files) \
 	runtime/yield.c \
 	$(rtems_task_variable_add_file) \
-	malloc.c \
 	$(runtime_getncpu_file)
 
 goc2c.$(OBJEXT): runtime/goc2c.c
@@ -498,10 +480,6 @@
 goc2c: goc2c.$(OBJEXT)
 	$(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) -o $@ $<
 
-malloc.c: $(srcdir)/runtime/malloc.goc goc2c
-	./goc2c $< > $@.tmp
-	mv -f $@.tmp $@
-
 %.c: $(srcdir)/runtime/%.goc goc2c
 	./goc2c $< > $@.tmp
 	mv -f $@.tmp $@
@@ -579,7 +557,7 @@
 runtime.inc: s-runtime-inc; @true
 s-runtime-inc: runtime.lo Makefile
 	rm -f runtime.inc.tmp2
-	grep -v "#define _" runtime.inc.tmp | grep -v "#define [cm][01234] " > runtime.inc.tmp2
+	grep -v "#define _" runtime.inc.tmp | grep -v "#define [cm][01234] " | grep -v "#define empty " > runtime.inc.tmp2
 	for pattern in '_[GP][a-z]' _Max _Lock _Sig _Trace _MHeap _Num; do \
 	  grep "#define $$pattern" runtime.inc.tmp >> runtime.inc.tmp2; \
 	done
diff --git a/libgo/Makefile.in b/libgo/Makefile.in
index b9c5b92..fdb162b 100644
--- a/libgo/Makefile.in
+++ b/libgo/Makefile.in
@@ -183,40 +183,36 @@
 	$(am__DEPENDENCIES_3) $(am__DEPENDENCIES_3) \
 	$(am__DEPENDENCIES_3) $(am__DEPENDENCIES_3)
 libgo_llgo_la_DEPENDENCIES = $(am__DEPENDENCIES_4)
-@HAVE_SYS_MMAN_H_FALSE@am__objects_1 = mem_posix_memalign.lo
-@HAVE_SYS_MMAN_H_TRUE@am__objects_1 = mem.lo
-@LIBGO_IS_LINUX_FALSE@am__objects_2 = thread-sema.lo
-@LIBGO_IS_LINUX_TRUE@am__objects_2 = thread-linux.lo
-@LIBGO_IS_RTEMS_TRUE@am__objects_3 = rtems-task-variable-add.lo
-@LIBGO_IS_AIX_FALSE@@LIBGO_IS_DARWIN_FALSE@@LIBGO_IS_FREEBSD_FALSE@@LIBGO_IS_IRIX_FALSE@@LIBGO_IS_LINUX_FALSE@@LIBGO_IS_NETBSD_FALSE@@LIBGO_IS_SOLARIS_FALSE@am__objects_4 = getncpu-none.lo
-@LIBGO_IS_AIX_TRUE@@LIBGO_IS_DARWIN_FALSE@@LIBGO_IS_FREEBSD_FALSE@@LIBGO_IS_IRIX_FALSE@@LIBGO_IS_LINUX_FALSE@@LIBGO_IS_NETBSD_FALSE@@LIBGO_IS_SOLARIS_FALSE@am__objects_4 = getncpu-aix.lo
-@LIBGO_IS_DARWIN_FALSE@@LIBGO_IS_FREEBSD_FALSE@@LIBGO_IS_IRIX_FALSE@@LIBGO_IS_LINUX_FALSE@@LIBGO_IS_NETBSD_TRUE@@LIBGO_IS_SOLARIS_FALSE@am__objects_4 = getncpu-bsd.lo
-@LIBGO_IS_DARWIN_FALSE@@LIBGO_IS_FREEBSD_TRUE@@LIBGO_IS_IRIX_FALSE@@LIBGO_IS_LINUX_FALSE@@LIBGO_IS_SOLARIS_FALSE@am__objects_4 = getncpu-bsd.lo
-@LIBGO_IS_DARWIN_FALSE@@LIBGO_IS_IRIX_FALSE@@LIBGO_IS_LINUX_FALSE@@LIBGO_IS_SOLARIS_TRUE@am__objects_4 = getncpu-solaris.lo
-@LIBGO_IS_DARWIN_FALSE@@LIBGO_IS_IRIX_TRUE@@LIBGO_IS_LINUX_FALSE@am__objects_4 = getncpu-irix.lo
-@LIBGO_IS_DARWIN_TRUE@@LIBGO_IS_LINUX_FALSE@am__objects_4 =  \
+@LIBGO_IS_LINUX_FALSE@am__objects_1 = thread-sema.lo
+@LIBGO_IS_LINUX_TRUE@am__objects_1 = thread-linux.lo
+@LIBGO_IS_RTEMS_TRUE@am__objects_2 = rtems-task-variable-add.lo
+@LIBGO_IS_AIX_FALSE@@LIBGO_IS_DARWIN_FALSE@@LIBGO_IS_FREEBSD_FALSE@@LIBGO_IS_IRIX_FALSE@@LIBGO_IS_LINUX_FALSE@@LIBGO_IS_NETBSD_FALSE@@LIBGO_IS_SOLARIS_FALSE@am__objects_3 = getncpu-none.lo
+@LIBGO_IS_AIX_TRUE@@LIBGO_IS_DARWIN_FALSE@@LIBGO_IS_FREEBSD_FALSE@@LIBGO_IS_IRIX_FALSE@@LIBGO_IS_LINUX_FALSE@@LIBGO_IS_NETBSD_FALSE@@LIBGO_IS_SOLARIS_FALSE@am__objects_3 = getncpu-aix.lo
+@LIBGO_IS_DARWIN_FALSE@@LIBGO_IS_FREEBSD_FALSE@@LIBGO_IS_IRIX_FALSE@@LIBGO_IS_LINUX_FALSE@@LIBGO_IS_NETBSD_TRUE@@LIBGO_IS_SOLARIS_FALSE@am__objects_3 = getncpu-bsd.lo
+@LIBGO_IS_DARWIN_FALSE@@LIBGO_IS_FREEBSD_TRUE@@LIBGO_IS_IRIX_FALSE@@LIBGO_IS_LINUX_FALSE@@LIBGO_IS_SOLARIS_FALSE@am__objects_3 = getncpu-bsd.lo
+@LIBGO_IS_DARWIN_FALSE@@LIBGO_IS_IRIX_FALSE@@LIBGO_IS_LINUX_FALSE@@LIBGO_IS_SOLARIS_TRUE@am__objects_3 = getncpu-solaris.lo
+@LIBGO_IS_DARWIN_FALSE@@LIBGO_IS_IRIX_TRUE@@LIBGO_IS_LINUX_FALSE@am__objects_3 = getncpu-irix.lo
+@LIBGO_IS_DARWIN_TRUE@@LIBGO_IS_LINUX_FALSE@am__objects_3 =  \
 @LIBGO_IS_DARWIN_TRUE@@LIBGO_IS_LINUX_FALSE@	getncpu-bsd.lo
-@LIBGO_IS_LINUX_TRUE@am__objects_4 = getncpu-linux.lo
-am__objects_5 = aeshash.lo go-assert.lo go-caller.lo go-callers.lo \
+@LIBGO_IS_LINUX_TRUE@am__objects_3 = getncpu-linux.lo
+am__objects_4 = aeshash.lo go-assert.lo go-caller.lo go-callers.lo \
 	go-cdiv.lo go-cgo.lo go-construct-map.lo go-ffi.lo \
 	go-fieldtrack.lo go-matherr.lo go-memclr.lo go-memcmp.lo \
 	go-memequal.lo go-memmove.lo go-nanotime.lo go-now.lo \
-	go-new.lo go-nosys.lo go-reflect-call.lo go-runtime-error.lo \
+	go-nosys.lo go-reflect-call.lo go-runtime-error.lo \
 	go-setenv.lo go-signal.lo go-strslice.lo go-typedesc-equal.lo \
-	go-unsafe-new.lo go-unsafe-newarray.lo go-unsafe-pointer.lo \
-	go-unsetenv.lo go-unwind.lo go-varargs.lo env_posix.lo \
-	heapdump.lo mcache.lo mcentral.lo $(am__objects_1) \
-	mfixalloc.lo mgc0.lo mheap.lo msize.lo panic.lo parfor.lo \
-	print.lo proc.lo runtime_c.lo thread.lo $(am__objects_2) \
-	yield.lo $(am__objects_3) malloc.lo $(am__objects_4)
-am_libgo_llgo_la_OBJECTS = $(am__objects_5)
+	go-unsafe-pointer.lo go-unsetenv.lo go-unwind.lo go-varargs.lo \
+	env_posix.lo panic.lo print.lo proc.lo runtime_c.lo stack.lo \
+	thread.lo $(am__objects_1) yield.lo $(am__objects_2) \
+	$(am__objects_3)
+am_libgo_llgo_la_OBJECTS = $(am__objects_4)
 libgo_llgo_la_OBJECTS = $(am_libgo_llgo_la_OBJECTS)
 libgo_llgo_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
 	$(libgo_llgo_la_LDFLAGS) $(LDFLAGS) -o $@
 @GOC_IS_LLGO_TRUE@am_libgo_llgo_la_rpath = -rpath $(toolexeclibdir)
 libgo_la_DEPENDENCIES = $(am__DEPENDENCIES_4)
-am_libgo_la_OBJECTS = $(am__objects_5)
+am_libgo_la_OBJECTS = $(am__objects_4)
 libgo_la_OBJECTS = $(am_libgo_la_OBJECTS)
 libgo_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
 	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(libgo_la_LDFLAGS) \
@@ -767,8 +763,6 @@
 	unicode/utf16.gox \
 	unicode/utf8.gox
 
-@HAVE_SYS_MMAN_H_FALSE@runtime_mem_file = runtime/mem_posix_memalign.c
-@HAVE_SYS_MMAN_H_TRUE@runtime_mem_file = runtime/mem.c
 @LIBGO_IS_RTEMS_FALSE@rtems_task_variable_add_file = 
 @LIBGO_IS_RTEMS_TRUE@rtems_task_variable_add_file = runtime/rtems-task-variable-add.c
 @LIBGO_IS_LINUX_FALSE@runtime_thread_files = runtime/thread-sema.c
@@ -798,7 +792,6 @@
 	runtime/go-memmove.c \
 	runtime/go-nanotime.c \
 	runtime/go-now.c \
-	runtime/go-new.c \
 	runtime/go-nosys.c \
 	runtime/go-reflect-call.c \
 	runtime/go-runtime-error.c \
@@ -806,31 +799,20 @@
 	runtime/go-signal.c \
 	runtime/go-strslice.c \
 	runtime/go-typedesc-equal.c \
-	runtime/go-unsafe-new.c \
-	runtime/go-unsafe-newarray.c \
 	runtime/go-unsafe-pointer.c \
 	runtime/go-unsetenv.c \
 	runtime/go-unwind.c \
 	runtime/go-varargs.c \
 	runtime/env_posix.c \
-	runtime/heapdump.c \
-	runtime/mcache.c \
-	runtime/mcentral.c \
-	$(runtime_mem_file) \
-	runtime/mfixalloc.c \
-	runtime/mgc0.c \
-	runtime/mheap.c \
-	runtime/msize.c \
 	runtime/panic.c \
-	runtime/parfor.c \
 	runtime/print.c \
 	runtime/proc.c \
 	runtime/runtime_c.c \
+	runtime/stack.c \
 	runtime/thread.c \
 	$(runtime_thread_files) \
 	runtime/yield.c \
 	$(rtems_task_variable_add_file) \
-	malloc.c \
 	$(runtime_getncpu_file)
 
 noinst_DATA = zstdpkglist.go
@@ -1516,7 +1498,6 @@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-memequal.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-memmove.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-nanotime.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-new.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-nosys.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-now.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-reflect-call.Plo@am__quote@
@@ -1525,31 +1506,19 @@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-signal.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-strslice.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-typedesc-equal.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-unsafe-new.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-unsafe-newarray.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-unsafe-pointer.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-unsetenv.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-unwind.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/go-varargs.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/heapdump.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgobegin_a-go-main.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgobegin_llgo_a-go-main.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgolibbegin_a-go-libmain.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/malloc.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mcache.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mcentral.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mem.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mem_posix_memalign.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mfixalloc.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mgc0.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mheap.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/msize.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/panic.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parfor.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/print.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/proc.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rtems-task-variable-add.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/runtime_c.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/stack.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/thread-linux.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/thread-sema.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/thread.Plo@am__quote@
@@ -1730,13 +1699,6 @@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o go-now.lo `test -f 'runtime/go-now.c' || echo '$(srcdir)/'`runtime/go-now.c
 
-go-new.lo: runtime/go-new.c
-@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT go-new.lo -MD -MP -MF $(DEPDIR)/go-new.Tpo -c -o go-new.lo `test -f 'runtime/go-new.c' || echo '$(srcdir)/'`runtime/go-new.c
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/go-new.Tpo $(DEPDIR)/go-new.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='runtime/go-new.c' object='go-new.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o go-new.lo `test -f 'runtime/go-new.c' || echo '$(srcdir)/'`runtime/go-new.c
-
 go-nosys.lo: runtime/go-nosys.c
 @am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT go-nosys.lo -MD -MP -MF $(DEPDIR)/go-nosys.Tpo -c -o go-nosys.lo `test -f 'runtime/go-nosys.c' || echo '$(srcdir)/'`runtime/go-nosys.c
 @am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/go-nosys.Tpo $(DEPDIR)/go-nosys.Plo
@@ -1786,20 +1748,6 @@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o go-typedesc-equal.lo `test -f 'runtime/go-typedesc-equal.c' || echo '$(srcdir)/'`runtime/go-typedesc-equal.c
 
-go-unsafe-new.lo: runtime/go-unsafe-new.c
-@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT go-unsafe-new.lo -MD -MP -MF $(DEPDIR)/go-unsafe-new.Tpo -c -o go-unsafe-new.lo `test -f 'runtime/go-unsafe-new.c' || echo '$(srcdir)/'`runtime/go-unsafe-new.c
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/go-unsafe-new.Tpo $(DEPDIR)/go-unsafe-new.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='runtime/go-unsafe-new.c' object='go-unsafe-new.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o go-unsafe-new.lo `test -f 'runtime/go-unsafe-new.c' || echo '$(srcdir)/'`runtime/go-unsafe-new.c
-
-go-unsafe-newarray.lo: runtime/go-unsafe-newarray.c
-@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT go-unsafe-newarray.lo -MD -MP -MF $(DEPDIR)/go-unsafe-newarray.Tpo -c -o go-unsafe-newarray.lo `test -f 'runtime/go-unsafe-newarray.c' || echo '$(srcdir)/'`runtime/go-unsafe-newarray.c
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/go-unsafe-newarray.Tpo $(DEPDIR)/go-unsafe-newarray.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='runtime/go-unsafe-newarray.c' object='go-unsafe-newarray.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o go-unsafe-newarray.lo `test -f 'runtime/go-unsafe-newarray.c' || echo '$(srcdir)/'`runtime/go-unsafe-newarray.c
-
 go-unsafe-pointer.lo: runtime/go-unsafe-pointer.c
 @am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT go-unsafe-pointer.lo -MD -MP -MF $(DEPDIR)/go-unsafe-pointer.Tpo -c -o go-unsafe-pointer.lo `test -f 'runtime/go-unsafe-pointer.c' || echo '$(srcdir)/'`runtime/go-unsafe-pointer.c
 @am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/go-unsafe-pointer.Tpo $(DEPDIR)/go-unsafe-pointer.Plo
@@ -1835,69 +1783,6 @@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o env_posix.lo `test -f 'runtime/env_posix.c' || echo '$(srcdir)/'`runtime/env_posix.c
 
-heapdump.lo: runtime/heapdump.c
-@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT heapdump.lo -MD -MP -MF $(DEPDIR)/heapdump.Tpo -c -o heapdump.lo `test -f 'runtime/heapdump.c' || echo '$(srcdir)/'`runtime/heapdump.c
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/heapdump.Tpo $(DEPDIR)/heapdump.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='runtime/heapdump.c' object='heapdump.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o heapdump.lo `test -f 'runtime/heapdump.c' || echo '$(srcdir)/'`runtime/heapdump.c
-
-mcache.lo: runtime/mcache.c
-@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT mcache.lo -MD -MP -MF $(DEPDIR)/mcache.Tpo -c -o mcache.lo `test -f 'runtime/mcache.c' || echo '$(srcdir)/'`runtime/mcache.c
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/mcache.Tpo $(DEPDIR)/mcache.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='runtime/mcache.c' object='mcache.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mcache.lo `test -f 'runtime/mcache.c' || echo '$(srcdir)/'`runtime/mcache.c
-
-mcentral.lo: runtime/mcentral.c
-@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT mcentral.lo -MD -MP -MF $(DEPDIR)/mcentral.Tpo -c -o mcentral.lo `test -f 'runtime/mcentral.c' || echo '$(srcdir)/'`runtime/mcentral.c
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/mcentral.Tpo $(DEPDIR)/mcentral.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='runtime/mcentral.c' object='mcentral.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mcentral.lo `test -f 'runtime/mcentral.c' || echo '$(srcdir)/'`runtime/mcentral.c
-
-mem_posix_memalign.lo: runtime/mem_posix_memalign.c
-@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT mem_posix_memalign.lo -MD -MP -MF $(DEPDIR)/mem_posix_memalign.Tpo -c -o mem_posix_memalign.lo `test -f 'runtime/mem_posix_memalign.c' || echo '$(srcdir)/'`runtime/mem_posix_memalign.c
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/mem_posix_memalign.Tpo $(DEPDIR)/mem_posix_memalign.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='runtime/mem_posix_memalign.c' object='mem_posix_memalign.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mem_posix_memalign.lo `test -f 'runtime/mem_posix_memalign.c' || echo '$(srcdir)/'`runtime/mem_posix_memalign.c
-
-mem.lo: runtime/mem.c
-@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT mem.lo -MD -MP -MF $(DEPDIR)/mem.Tpo -c -o mem.lo `test -f 'runtime/mem.c' || echo '$(srcdir)/'`runtime/mem.c
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/mem.Tpo $(DEPDIR)/mem.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='runtime/mem.c' object='mem.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mem.lo `test -f 'runtime/mem.c' || echo '$(srcdir)/'`runtime/mem.c
-
-mfixalloc.lo: runtime/mfixalloc.c
-@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT mfixalloc.lo -MD -MP -MF $(DEPDIR)/mfixalloc.Tpo -c -o mfixalloc.lo `test -f 'runtime/mfixalloc.c' || echo '$(srcdir)/'`runtime/mfixalloc.c
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/mfixalloc.Tpo $(DEPDIR)/mfixalloc.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='runtime/mfixalloc.c' object='mfixalloc.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mfixalloc.lo `test -f 'runtime/mfixalloc.c' || echo '$(srcdir)/'`runtime/mfixalloc.c
-
-mgc0.lo: runtime/mgc0.c
-@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT mgc0.lo -MD -MP -MF $(DEPDIR)/mgc0.Tpo -c -o mgc0.lo `test -f 'runtime/mgc0.c' || echo '$(srcdir)/'`runtime/mgc0.c
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/mgc0.Tpo $(DEPDIR)/mgc0.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='runtime/mgc0.c' object='mgc0.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mgc0.lo `test -f 'runtime/mgc0.c' || echo '$(srcdir)/'`runtime/mgc0.c
-
-mheap.lo: runtime/mheap.c
-@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT mheap.lo -MD -MP -MF $(DEPDIR)/mheap.Tpo -c -o mheap.lo `test -f 'runtime/mheap.c' || echo '$(srcdir)/'`runtime/mheap.c
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/mheap.Tpo $(DEPDIR)/mheap.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='runtime/mheap.c' object='mheap.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mheap.lo `test -f 'runtime/mheap.c' || echo '$(srcdir)/'`runtime/mheap.c
-
-msize.lo: runtime/msize.c
-@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT msize.lo -MD -MP -MF $(DEPDIR)/msize.Tpo -c -o msize.lo `test -f 'runtime/msize.c' || echo '$(srcdir)/'`runtime/msize.c
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/msize.Tpo $(DEPDIR)/msize.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='runtime/msize.c' object='msize.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o msize.lo `test -f 'runtime/msize.c' || echo '$(srcdir)/'`runtime/msize.c
-
 panic.lo: runtime/panic.c
 @am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT panic.lo -MD -MP -MF $(DEPDIR)/panic.Tpo -c -o panic.lo `test -f 'runtime/panic.c' || echo '$(srcdir)/'`runtime/panic.c
 @am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/panic.Tpo $(DEPDIR)/panic.Plo
@@ -1905,13 +1790,6 @@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o panic.lo `test -f 'runtime/panic.c' || echo '$(srcdir)/'`runtime/panic.c
 
-parfor.lo: runtime/parfor.c
-@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT parfor.lo -MD -MP -MF $(DEPDIR)/parfor.Tpo -c -o parfor.lo `test -f 'runtime/parfor.c' || echo '$(srcdir)/'`runtime/parfor.c
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/parfor.Tpo $(DEPDIR)/parfor.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='runtime/parfor.c' object='parfor.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o parfor.lo `test -f 'runtime/parfor.c' || echo '$(srcdir)/'`runtime/parfor.c
-
 print.lo: runtime/print.c
 @am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT print.lo -MD -MP -MF $(DEPDIR)/print.Tpo -c -o print.lo `test -f 'runtime/print.c' || echo '$(srcdir)/'`runtime/print.c
 @am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/print.Tpo $(DEPDIR)/print.Plo
@@ -1933,6 +1811,13 @@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o runtime_c.lo `test -f 'runtime/runtime_c.c' || echo '$(srcdir)/'`runtime/runtime_c.c
 
+stack.lo: runtime/stack.c
+@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT stack.lo -MD -MP -MF $(DEPDIR)/stack.Tpo -c -o stack.lo `test -f 'runtime/stack.c' || echo '$(srcdir)/'`runtime/stack.c
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/stack.Tpo $(DEPDIR)/stack.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='runtime/stack.c' object='stack.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o stack.lo `test -f 'runtime/stack.c' || echo '$(srcdir)/'`runtime/stack.c
+
 thread.lo: runtime/thread.c
 @am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT thread.lo -MD -MP -MF $(DEPDIR)/thread.Tpo -c -o thread.lo `test -f 'runtime/thread.c' || echo '$(srcdir)/'`runtime/thread.c
 @am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/thread.Tpo $(DEPDIR)/thread.Plo
@@ -3138,10 +3023,6 @@
 goc2c: goc2c.$(OBJEXT)
 	$(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) -o $@ $<
 
-malloc.c: $(srcdir)/runtime/malloc.goc goc2c
-	./goc2c $< > $@.tmp
-	mv -f $@.tmp $@
-
 %.c: $(srcdir)/runtime/%.goc goc2c
 	./goc2c $< > $@.tmp
 	mv -f $@.tmp $@
@@ -3219,7 +3100,7 @@
 runtime.inc: s-runtime-inc; @true
 s-runtime-inc: runtime.lo Makefile
 	rm -f runtime.inc.tmp2
-	grep -v "#define _" runtime.inc.tmp | grep -v "#define [cm][01234] " > runtime.inc.tmp2
+	grep -v "#define _" runtime.inc.tmp | grep -v "#define [cm][01234] " | grep -v "#define empty " > runtime.inc.tmp2
 	for pattern in '_[GP][a-z]' _Max _Lock _Sig _Trace _MHeap _Num; do \
 	  grep "#define $$pattern" runtime.inc.tmp >> runtime.inc.tmp2; \
 	done
diff --git a/libgo/go/reflect/type.go b/libgo/go/reflect/type.go
index 905c867..3ae0f18 100644
--- a/libgo/go/reflect/type.go
+++ b/libgo/go/reflect/type.go
@@ -383,24 +383,6 @@
 	fields []structField // sorted by offset
 }
 
-// NOTE: These are copied from ../runtime/mgc0.h.
-// They must be kept in sync.
-const (
-	_GC_END = iota
-	_GC_PTR
-	_GC_APTR
-	_GC_ARRAY_START
-	_GC_ARRAY_NEXT
-	_GC_CALL
-	_GC_CHAN_PTR
-	_GC_STRING
-	_GC_EFACE
-	_GC_IFACE
-	_GC_SLICE
-	_GC_REGION
-	_GC_NUM_INSTR
-)
-
 /*
  * The compiler knows the exact layout of all the data structures above.
  * The compiler does not know about the data structures and methods below.
@@ -1099,32 +1081,6 @@
 	m map[*rtype]*ptrType
 }
 
-// garbage collection bytecode program for pointer to memory without pointers.
-// See ../../cmd/gc/reflect.c:/^dgcsym1 and :/^dgcsym.
-type ptrDataGC struct {
-	width uintptr // sizeof(ptr)
-	op    uintptr // _GC_APTR
-	off   uintptr // 0
-	end   uintptr // _GC_END
-}
-
-var ptrDataGCProg = ptrDataGC{
-	width: unsafe.Sizeof((*byte)(nil)),
-	op:    _GC_APTR,
-	off:   0,
-	end:   _GC_END,
-}
-
-// garbage collection bytecode program for pointer to memory with pointers.
-// See ../../cmd/gc/reflect.c:/^dgcsym1 and :/^dgcsym.
-type ptrGC struct {
-	width  uintptr        // sizeof(ptr)
-	op     uintptr        // _GC_PTR
-	off    uintptr        // 0
-	elemgc unsafe.Pointer // element gc type
-	end    uintptr        // _GC_END
-}
-
 // PtrTo returns the pointer type with element t.
 // For example, if t represents type Foo, PtrTo(t) represents *Foo.
 func PtrTo(t Type) Type {
@@ -1190,18 +1146,6 @@
 	pp.ptrToThis = nil
 	pp.elem = t
 
-	if t.kind&kindNoPointers != 0 {
-		pp.gcdata = (*byte)(unsafe.Pointer(&ptrDataGCProg))
-	} else {
-		pp.gcdata = (*byte)(unsafe.Pointer(&ptrGC{
-			width:  pp.size,
-			op:     _GC_PTR,
-			off:    0,
-			elemgc: unsafe.Pointer(t.gcdata),
-			end:    _GC_END,
-		}))
-	}
-
 	q := canonicalize(&pp.rtype)
 	p = (*ptrType)(unsafe.Pointer(q.(*rtype)))
 
@@ -1508,16 +1452,6 @@
 	return t
 }
 
-// garbage collection bytecode program for chan.
-// See ../../cmd/gc/reflect.c:/^dgcsym1 and :/^dgcsym.
-type chanGC struct {
-	width uintptr // sizeof(map)
-	op    uintptr // _GC_CHAN_PTR
-	off   uintptr // 0
-	typ   *rtype  // map type
-	end   uintptr // _GC_END
-}
-
 // The funcLookupCache caches FuncOf lookups.
 // FuncOf does not share the common lookupCache since cacheKey is not
 // sufficient to represent functions unambiguously.
@@ -1585,17 +1519,6 @@
 	ch.uncommonType = nil
 	ch.ptrToThis = nil
 
-	ch.gcdata = (*byte)(unsafe.Pointer(&chanGC{
-		width: ch.size,
-		op:    _GC_CHAN_PTR,
-		off:   0,
-		typ:   &ch.rtype,
-		end:   _GC_END,
-	}))
-
-	// INCORRECT. Uncomment to check that TestChanOfGC fails when ch.gcdata is wrong.
-	// ch.gcdata = unsafe.Pointer(&badGC{width: ch.size, end: _GC_END})
-
 	return cachePut(ckey, &ch.rtype)
 }
 
@@ -1734,9 +1657,6 @@
 	ft.uncommonType = nil
 	ft.ptrToThis = nil
 
-	// TODO(cmang): Generate GC data for funcs.
-	ft.gcdata = (*byte)(unsafe.Pointer(&ptrDataGCProg))
-
 	funcLookupCache.m[hash] = append(funcLookupCache.m[hash], &ft.rtype)
 
 	return toType(&ft.rtype)
@@ -1860,8 +1780,8 @@
 	// and it's easier to generate a pointer bitmap than a GC program.
 	// Note that since the key and value are known to be <= 128 bytes,
 	// they're guaranteed to have bitmaps instead of GC programs.
-	// var gcdata *byte
-	// var ptrdata uintptr
+	var gcdata *byte
+	var ptrdata uintptr
 
 	size := bucketSize
 	size = align(size, uintptr(ktyp.fieldAlign))
@@ -1876,37 +1796,63 @@
 	if maxAlign > ptrSize {
 		size = align(size, maxAlign)
 		size += align(ptrSize, maxAlign) - ptrSize
+	} else if maxAlign < ptrSize {
+		size = align(size, ptrSize)
+		maxAlign = ptrSize
 	}
 
 	ovoff := size
 	size += ptrSize
-	if maxAlign < ptrSize {
-		maxAlign = ptrSize
-	}
 
-	var gcPtr unsafe.Pointer
 	if kind != kindNoPointers {
-		gc := []uintptr{size}
-		base := bucketSize
-		base = align(base, uintptr(ktyp.fieldAlign))
+		nptr := size / ptrSize
+		mask := make([]byte, (nptr+7)/8)
+		psize := bucketSize
+		psize = align(psize, uintptr(ktyp.fieldAlign))
+		base := psize / ptrSize
+
 		if ktyp.kind&kindNoPointers == 0 {
-			gc = append(gc, _GC_ARRAY_START, base, bucketSize, ktyp.size)
-			gc = appendGCProgram(gc, ktyp, 0)
-			gc = append(gc, _GC_ARRAY_NEXT)
+			if ktyp.kind&kindGCProg != 0 {
+				panic("reflect: unexpected GC program in MapOf")
+			}
+			kmask := (*[16]byte)(unsafe.Pointer(ktyp.gcdata))
+			for i := uintptr(0); i < ktyp.size/ptrSize; i++ {
+				if (kmask[i/8]>>(i%8))&1 != 0 {
+					for j := uintptr(0); j < bucketSize; j++ {
+						word := base + j*ktyp.size/ptrSize + i
+						mask[word/8] |= 1 << (word % 8)
+					}
+				}
+			}
 		}
-		base += ktyp.size * bucketSize
-		base = align(base, uintptr(etyp.fieldAlign))
+		psize += bucketSize * ktyp.size
+		psize = align(psize, uintptr(etyp.fieldAlign))
+		base = psize / ptrSize
+
 		if etyp.kind&kindNoPointers == 0 {
-			gc = append(gc, _GC_ARRAY_START, base, bucketSize, etyp.size)
-			gc = appendGCProgram(gc, etyp, 0)
-			gc = append(gc, _GC_ARRAY_NEXT)
+			if etyp.kind&kindGCProg != 0 {
+				panic("reflect: unexpected GC program in MapOf")
+			}
+			emask := (*[16]byte)(unsafe.Pointer(etyp.gcdata))
+			for i := uintptr(0); i < etyp.size/ptrSize; i++ {
+				if (emask[i/8]>>(i%8))&1 != 0 {
+					for j := uintptr(0); j < bucketSize; j++ {
+						word := base + j*etyp.size/ptrSize + i
+						mask[word/8] |= 1 << (word % 8)
+					}
+				}
+			}
 		}
-		gc = append(gc, _GC_APTR, ovoff, _GC_END)
-		gcPtr = unsafe.Pointer(&gc[0])
-	} else {
-		// No pointers in bucket.
-		gc := [...]uintptr{size, _GC_END}
-		gcPtr = unsafe.Pointer(&gc[0])
+
+		word := ovoff / ptrSize
+		mask[word/8] |= 1 << (word % 8)
+		gcdata = &mask[0]
+		ptrdata = (word + 1) * ptrSize
+
+		// overflow word must be last
+		if ptrdata != size {
+			panic("reflect: bad layout computation in MapOf")
+		}
 	}
 
 	b := &rtype{
@@ -1914,102 +1860,14 @@
 		fieldAlign: uint8(maxAlign),
 		size:       size,
 		kind:       kind,
-		gcdata:     (*byte)(gcPtr),
+		ptrdata:    ptrdata,
+		gcdata:     gcdata,
 	}
 	s := "bucket(" + *ktyp.string + "," + *etyp.string + ")"
 	b.string = &s
 	return b
 }
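
For illustration only, here is how the mask built above works out for a hypothetical map[string]int bucket on a 64-bit target (assuming bucketSize is 8 and a string is a {pointer, length} pair of 16 bytes):

	word 0          tophash bytes (8 bytes)        no pointer
	words 1..16     8 keys, 2 words per string     pointer bits at words 1, 3, ..., 15
	words 17..24    8 int values                   no pointers
	word 25         overflow pointer               pointer bit set

That gives ptrdata = (25+1)*8 = 208 bytes, which equals size, so the "overflow word must be last" check passes.
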
 
-// Take the GC program for "t" and append it to the GC program "gc".
-func appendGCProgram(gc []uintptr, t *rtype, offset uintptr) []uintptr {
-	p := unsafe.Pointer(t.gcdata)
-	p = unsafe.Pointer(uintptr(p) + unsafe.Sizeof(uintptr(0))) // skip size
-loop:
-	for {
-		var argcnt int
-		switch *(*uintptr)(p) {
-		case _GC_END:
-			// Note: _GC_END not included in append
-			break loop
-		case _GC_ARRAY_NEXT:
-			argcnt = 0
-		case _GC_APTR, _GC_STRING, _GC_EFACE, _GC_IFACE:
-			argcnt = 1
-		case _GC_PTR, _GC_CALL, _GC_CHAN_PTR, _GC_SLICE:
-			argcnt = 2
-		case _GC_ARRAY_START, _GC_REGION:
-			argcnt = 3
-		default:
-			panic("unknown GC program op for " + *t.string + ": " + strconv.FormatUint(*(*uint64)(p), 10))
-		}
-		for i := 0; i < argcnt+1; i++ {
-			v := *(*uintptr)(p)
-			if i == 1 {
-				v += offset
-			}
-			gc = append(gc, v)
-			p = unsafe.Pointer(uintptr(p) + unsafe.Sizeof(uintptr(0)))
-		}
-	}
-	return gc
-}
-func hMapOf(bucket *rtype) *rtype {
-	ptrsize := unsafe.Sizeof(uintptr(0))
-
-	// make gc program & compute hmap size
-	gc := make([]uintptr, 1)           // first entry is size, filled in at the end
-	offset := unsafe.Sizeof(uint(0))   // count
-	offset += unsafe.Sizeof(uint32(0)) // flags
-	offset += unsafe.Sizeof(uint32(0)) // hash0
-	offset += unsafe.Sizeof(uint8(0))  // B
-	offset += unsafe.Sizeof(uint8(0))  // keysize
-	offset += unsafe.Sizeof(uint8(0))  // valuesize
-	offset = (offset + 1) / 2 * 2
-	offset += unsafe.Sizeof(uint16(0)) // bucketsize
-	offset = (offset + ptrsize - 1) / ptrsize * ptrsize
-	// gc = append(gc, _GC_PTR, offset, uintptr(bucket.gcdata)) // buckets
-	offset += ptrsize
-	// gc = append(gc, _GC_PTR, offset, uintptr(bucket.gcdata)) // oldbuckets
-	offset += ptrsize
-	offset += ptrsize // nevacuate
-	gc = append(gc, _GC_END)
-	gc[0] = offset
-
-	h := new(rtype)
-	h.size = offset
-	// h.gcdata = unsafe.Pointer(&gc[0])
-	s := "hmap(" + *bucket.string + ")"
-	h.string = &s
-	return h
-}
-
-// garbage collection bytecode program for slice of non-zero-length values.
-// See ../../cmd/gc/reflect.c:/^dgcsym1 and :/^dgcsym.
-type sliceGC struct {
-	width  uintptr        // sizeof(slice)
-	op     uintptr        // _GC_SLICE
-	off    uintptr        // 0
-	elemgc unsafe.Pointer // element gc program
-	end    uintptr        // _GC_END
-}
-
-// garbage collection bytecode program for slice of zero-length values.
-// See ../../cmd/gc/reflect.c:/^dgcsym1 and :/^dgcsym.
-type sliceEmptyGC struct {
-	width uintptr // sizeof(slice)
-	op    uintptr // _GC_APTR
-	off   uintptr // 0
-	end   uintptr // _GC_END
-}
-
-var sliceEmptyGCProg = sliceEmptyGC{
-	width: unsafe.Sizeof([]byte(nil)),
-	op:    _GC_APTR,
-	off:   0,
-	end:   _GC_END,
-}
-
 // SliceOf returns the slice type with element type t.
 // For example, if t represents int, SliceOf(t) represents []int.
 func SliceOf(t Type) Type {
@@ -2038,21 +1896,6 @@
 	slice.uncommonType = nil
 	slice.ptrToThis = nil
 
-	if typ.size == 0 {
-		slice.gcdata = (*byte)(unsafe.Pointer(&sliceEmptyGCProg))
-	} else {
-		slice.gcdata = (*byte)(unsafe.Pointer(&sliceGC{
-			width:  slice.size,
-			op:     _GC_SLICE,
-			off:    0,
-			elemgc: unsafe.Pointer(typ.gcdata),
-			end:    _GC_END,
-		}))
-	}
-
-	// INCORRECT. Uncomment to check that TestSliceOfOfGC fails when slice.gcdata is wrong.
-	// slice.gcdata = unsafe.Pointer(&badGC{width: slice.size, end: _GC_END})
-
 	return cachePut(ckey, &slice.rtype)
 }
 
@@ -2084,7 +1927,8 @@
 		repr = make([]byte, 0, 64)
 		fset = map[string]struct{}{} // fields' names
 
-		hasPtr = false // records whether at least one struct-field is a pointer
+		hasPtr    = false // records whether at least one struct-field is a pointer
+		hasGCProg = false // records whether a struct-field type has a GCProg
 	)
 
 	lastzero := uintptr(0)
@@ -2095,6 +1939,9 @@
 		}
 		f := runtimeStructField(field)
 		ft := f.typ
+		if ft.kind&kindGCProg != 0 {
+			hasGCProg = true
+		}
 		if ft.pointers() {
 			hasPtr = true
 		}
@@ -2234,18 +2081,69 @@
 	typ.fieldAlign = uint8(typalign)
 	if !hasPtr {
 		typ.kind |= kindNoPointers
-		gc := [...]uintptr{size, _GC_END}
-		typ.gcdata = (*byte)(unsafe.Pointer(&gc[0]))
 	} else {
 		typ.kind &^= kindNoPointers
-		gc := []uintptr{size}
-		for _, ft := range fs {
-			gc = appendGCProgram(gc, ft.typ, ft.offset)
-		}
-		gc = append(gc, _GC_END)
-		typ.gcdata = (*byte)(unsafe.Pointer(&gc[0]))
 	}
 
+	if hasGCProg {
+		lastPtrField := 0
+		for i, ft := range fs {
+			if ft.typ.pointers() {
+				lastPtrField = i
+			}
+		}
+		prog := []byte{0, 0, 0, 0} // will be length of prog
+		for i, ft := range fs {
+			if i > lastPtrField {
+				// gcprog should not include anything for any field after
+				// the last field that contains pointer data
+				break
+			}
+			// FIXME(sbinet) handle padding, fields smaller than a word
+			elemGC := (*[1 << 30]byte)(unsafe.Pointer(ft.typ.gcdata))[:]
+			elemPtrs := ft.typ.ptrdata / ptrSize
+			switch {
+			case ft.typ.kind&kindGCProg == 0 && ft.typ.ptrdata != 0:
+				// Element is small with pointer mask; use as literal bits.
+				mask := elemGC
+				// Emit 120-bit chunks of full bytes (max is 127 but we avoid using partial bytes).
+				var n uintptr
+				for n = elemPtrs; n > 120; n -= 120 {
+					prog = append(prog, 120)
+					prog = append(prog, mask[:15]...)
+					mask = mask[15:]
+				}
+				prog = append(prog, byte(n))
+				prog = append(prog, mask[:(n+7)/8]...)
+			case ft.typ.kind&kindGCProg != 0:
+				// Element has GC program; emit one element.
+				elemProg := elemGC[4 : 4+*(*uint32)(unsafe.Pointer(&elemGC[0]))-1]
+				prog = append(prog, elemProg...)
+			}
+			// Pad from ptrdata to size.
+			elemWords := ft.typ.size / ptrSize
+			if elemPtrs < elemWords {
+				// Emit literal 0 bit, then repeat as needed.
+				prog = append(prog, 0x01, 0x00)
+				if elemPtrs+1 < elemWords {
+					prog = append(prog, 0x81)
+					prog = appendVarint(prog, elemWords-elemPtrs-1)
+				}
+			}
+		}
+		*(*uint32)(unsafe.Pointer(&prog[0])) = uint32(len(prog) - 4)
+		typ.kind |= kindGCProg
+		typ.gcdata = &prog[0]
+	} else {
+		typ.kind &^= kindGCProg
+		bv := new(bitVector)
+		addTypeBits(bv, 0, typ.common())
+		if len(bv.data) > 0 {
+			typ.gcdata = &bv.data[0]
+		}
+	}
+	typ.ptrdata = typeptrdata(typ.common())
+
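
The byte values used above (and in ArrayOf further down) follow the runtime's GC program encoding, a small bytecode that is, roughly:

	0x00              end of program
	0x01..0x7f = n    emit n literal pointer bits, read from the next (n+7)/8 bytes
	0x80 n c          repeat the previous n bits c times; n and c are varints
	0x80|n c          repeat the previous n bits c times; c is a varint

appendVarint is not shown in this hunk; a minimal sketch consistent with these call sites (the real helper is defined elsewhere in the package) would be:

	// appendVarint appends v to x in the base-128 varint form the
	// GC program machine expects.
	func appendVarint(x []byte, v uintptr) []byte {
		for ; v >= 0x80; v >>= 7 {
			x = append(x, byte(v|0x80))
		}
		x = append(x, byte(v))
		return x
	}
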
 	if hashable {
 		typ.hashfn = func(p unsafe.Pointer, seed uintptr) uintptr {
 			o := seed
@@ -2322,6 +2220,35 @@
 	}
 }
 
+// typeptrdata returns the length in bytes of the prefix of t
+// containing pointer data. Anything after this offset is scalar data.
+// keep in sync with ../cmd/compile/internal/gc/reflect.go
+func typeptrdata(t *rtype) uintptr {
+	if !t.pointers() {
+		return 0
+	}
+	switch t.Kind() {
+	case Struct:
+		st := (*structType)(unsafe.Pointer(t))
+		// find the last field that has pointers.
+		field := 0
+		for i := range st.fields {
+			ft := st.fields[i].typ
+			if ft.pointers() {
+				field = i
+			}
+		}
+		f := st.fields[field]
+		return f.offset + f.typ.ptrdata
+
+	default:
+		panic("reflect.typeptrdata: unexpected type, " + t.String())
+	}
+}
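
An illustrative example (not from the patch) of what typeptrdata returns, assuming a 64-bit target:

	type T struct {
		a uintptr  // offset 0, no pointers
		b *int     // offset 8, ptrdata 8
		c [4]int64 // offset 16, no pointers
	}

The last pointer-bearing field is b, so typeptrdata yields 8 + 8 = 16; the trailing 32 bytes of c never need to be scanned by the collector.
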
+
+// See cmd/compile/internal/gc/reflect.go for derivation of constant.
+const maxPtrmaskBytes = 2048
+
 // ArrayOf returns the array type with the given count and element type.
 // For example, if t represents int, ArrayOf(5, t) represents [5]int.
 //
@@ -2364,9 +2291,9 @@
 		panic("reflect.ArrayOf: array size would exceed virtual address space")
 	}
 	array.size = typ.size * uintptr(count)
-	// if count > 0 && typ.ptrdata != 0 {
-	// 	array.ptrdata = typ.size*uintptr(count-1) + typ.ptrdata
-	// }
+	if count > 0 && typ.ptrdata != 0 {
+		array.ptrdata = typ.size*uintptr(count-1) + typ.ptrdata
+	}
 	array.align = typ.align
 	array.fieldAlign = typ.fieldAlign
 	array.uncommonType = nil
@@ -2378,19 +2305,78 @@
 	case typ.kind&kindNoPointers != 0 || array.size == 0:
 		// No pointers.
 		array.kind |= kindNoPointers
-		gc := [...]uintptr{array.size, _GC_END}
-		array.gcdata = (*byte)(unsafe.Pointer(&gc[0]))
+		array.gcdata = nil
+		array.ptrdata = 0
 
 	case count == 1:
 		// In memory, 1-element array looks just like the element.
 		array.kind |= typ.kind & kindGCProg
 		array.gcdata = typ.gcdata
+		array.ptrdata = typ.ptrdata
+
+	case typ.kind&kindGCProg == 0 && array.size <= maxPtrmaskBytes*8*ptrSize:
+		// Element is small with pointer mask; array is still small.
+		// Create direct pointer mask by turning each 1 bit in elem
+		// into count 1 bits in larger mask.
+		mask := make([]byte, (array.ptrdata/ptrSize+7)/8)
+		elemMask := (*[1 << 30]byte)(unsafe.Pointer(typ.gcdata))[:]
+		elemWords := typ.size / ptrSize
+		for j := uintptr(0); j < typ.ptrdata/ptrSize; j++ {
+			if (elemMask[j/8]>>(j%8))&1 != 0 {
+				for i := uintptr(0); i < array.len; i++ {
+					k := i*elemWords + j
+					mask[k/8] |= 1 << (k % 8)
+				}
+			}
+		}
+		array.gcdata = &mask[0]
 
 	default:
-		gc := []uintptr{array.size, _GC_ARRAY_START, 0, uintptr(count), typ.size}
-		gc = appendGCProgram(gc, typ, 0)
-		gc = append(gc, _GC_ARRAY_NEXT, _GC_END)
-		array.gcdata = (*byte)(unsafe.Pointer(&gc[0]))
+		// Create program that emits one element
+		// and then repeats to make the array.
+		prog := []byte{0, 0, 0, 0} // will be length of prog
+		elemGC := (*[1 << 30]byte)(unsafe.Pointer(typ.gcdata))[:]
+		elemPtrs := typ.ptrdata / ptrSize
+		if typ.kind&kindGCProg == 0 {
+			// Element is small with pointer mask; use as literal bits.
+			mask := elemGC
+			// Emit 120-bit chunks of full bytes (max is 127 but we avoid using partial bytes).
+			var n uintptr
+			for n = elemPtrs; n > 120; n -= 120 {
+				prog = append(prog, 120)
+				prog = append(prog, mask[:15]...)
+				mask = mask[15:]
+			}
+			prog = append(prog, byte(n))
+			prog = append(prog, mask[:(n+7)/8]...)
+		} else {
+			// Element has GC program; emit one element.
+			elemProg := elemGC[4 : 4+*(*uint32)(unsafe.Pointer(&elemGC[0]))-1]
+			prog = append(prog, elemProg...)
+		}
+		// Pad from ptrdata to size.
+		elemWords := typ.size / ptrSize
+		if elemPtrs < elemWords {
+			// Emit literal 0 bit, then repeat as needed.
+			prog = append(prog, 0x01, 0x00)
+			if elemPtrs+1 < elemWords {
+				prog = append(prog, 0x81)
+				prog = appendVarint(prog, elemWords-elemPtrs-1)
+			}
+		}
+		// Repeat count-1 times.
+		if elemWords < 0x80 {
+			prog = append(prog, byte(elemWords|0x80))
+		} else {
+			prog = append(prog, 0x80)
+			prog = appendVarint(prog, elemWords)
+		}
+		prog = appendVarint(prog, uintptr(count)-1)
+		prog = append(prog, 0)
+		*(*uint32)(unsafe.Pointer(&prog[0])) = uint32(len(prog) - 4)
+		array.kind |= kindGCProg
+		array.gcdata = &prog[0]
+		array.ptrdata = array.size // overestimate but ok; must match program
 	}
 
 	array.kind &^= kindDirectIface
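
To make the default case above concrete: for a hypothetical element that is two words long with a pointer in its first word, and a count large enough to require a program, the bytes emitted after the 4-byte length prefix would be

	01 01       emit 1 literal bit: 1   (the element's pointer word)
	01 00       emit 1 literal bit: 0   (padding from ptrdata to size)
	82 <c>      repeat the previous 2 bits c = count-1 times (c is a varint)
	00          end of program

which reproduces the element's one-pointer-per-two-words pattern across the whole array.
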
diff --git a/libgo/go/runtime/cgocall.go b/libgo/go/runtime/cgocall.go
new file mode 100644
index 0000000..2e0e591
--- /dev/null
+++ b/libgo/go/runtime/cgocall.go
@@ -0,0 +1,307 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Cgo call and callback support.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// Pointer checking for cgo code.
+
+// We want to detect all cases where a program that does not use
+// unsafe makes a cgo call passing a Go pointer to memory that
+// contains a Go pointer. Here a Go pointer is defined as a pointer
+// to memory allocated by the Go runtime. Programs that use unsafe
+// can evade this restriction easily, so we don't try to catch them.
+// The cgo program will rewrite all possibly bad pointer arguments to
+// call cgoCheckPointer, where we can catch cases of a Go pointer
+// pointing to a Go pointer.
+
+// Complicating matters, taking the address of a slice or array
+// element permits the C program to access all elements of the slice
+// or array. In that case we will see a pointer to a single element,
+// but we need to check the entire data structure.
+
+// The cgoCheckPointer call takes additional arguments indicating that
+// it was called on an address expression. An additional argument of
+// true means that it only needs to check a single element. An
+// additional argument of a slice or array means that it needs to
+// check the entire slice/array, but nothing else. Otherwise, the
+// pointer could be anything, and we check the entire heap object,
+// which is conservative but safe.
+
+// When and if we implement a moving garbage collector,
+// cgoCheckPointer will pin the pointer for the duration of the cgo
+// call.  (This is necessary but not sufficient; the cgo program will
+// also have to change to pin Go pointers that cannot point to Go
+// pointers.)
+
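
A hypothetical illustration of the rule (assuming a cgo file that declares a C function take): with GODEBUG=cgocheck=1, the default, the call below panics with "cgo argument has Go pointer to Go pointer", because the struct passed to C itself contains a pointer into Go memory.

	type S struct{ p *int }
	s := S{p: new(int)}
	// cgo rewrites this call so the argument is routed through
	// cgoCheckPointer, which finds s.p pointing into Go memory and panics.
	C.take(unsafe.Pointer(&s))
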
+// cgoCheckPointer checks if the argument contains a Go pointer that
+// points to a Go pointer, and panics if it does.
+func cgoCheckPointer(ptr interface{}, args ...interface{}) {
+	if debug.cgocheck == 0 {
+		return
+	}
+
+	ep := (*eface)(unsafe.Pointer(&ptr))
+	t := ep._type
+
+	top := true
+	if len(args) > 0 && (t.kind&kindMask == kindPtr || t.kind&kindMask == kindUnsafePointer) {
+		p := ep.data
+		if t.kind&kindDirectIface == 0 {
+			p = *(*unsafe.Pointer)(p)
+		}
+		if !cgoIsGoPointer(p) {
+			return
+		}
+		aep := (*eface)(unsafe.Pointer(&args[0]))
+		switch aep._type.kind & kindMask {
+		case kindBool:
+			if t.kind&kindMask == kindUnsafePointer {
+				// We don't know the type of the element.
+				break
+			}
+			pt := (*ptrtype)(unsafe.Pointer(t))
+			cgoCheckArg(pt.elem, p, true, false, cgoCheckPointerFail)
+			return
+		case kindSlice:
+			// Check the slice rather than the pointer.
+			ep = aep
+			t = ep._type
+		case kindArray:
+			// Check the array rather than the pointer.
+			// Pass top as false since we have a pointer
+			// to the array.
+			ep = aep
+			t = ep._type
+			top = false
+		default:
+			throw("can't happen")
+		}
+	}
+
+	cgoCheckArg(t, ep.data, t.kind&kindDirectIface == 0, top, cgoCheckPointerFail)
+}
+
+const cgoCheckPointerFail = "cgo argument has Go pointer to Go pointer"
+const cgoResultFail = "cgo result has Go pointer"
+
+// cgoCheckArg is the real work of cgoCheckPointer. The argument p
+// is either a pointer to the value (of type t), or the value itself,
+// depending on indir. The top parameter is whether we are at the top
+// level, where Go pointers are allowed.
+func cgoCheckArg(t *_type, p unsafe.Pointer, indir, top bool, msg string) {
+	if t.kind&kindNoPointers != 0 {
+		// If the type has no pointers there is nothing to do.
+		return
+	}
+
+	switch t.kind & kindMask {
+	default:
+		throw("can't happen")
+	case kindArray:
+		at := (*arraytype)(unsafe.Pointer(t))
+		if !indir {
+			if at.len != 1 {
+				throw("can't happen")
+			}
+			cgoCheckArg(at.elem, p, at.elem.kind&kindDirectIface == 0, top, msg)
+			return
+		}
+		for i := uintptr(0); i < at.len; i++ {
+			cgoCheckArg(at.elem, p, true, top, msg)
+			p = add(p, at.elem.size)
+		}
+	case kindChan, kindMap:
+		// These types contain internal pointers that will
+		// always be allocated in the Go heap. It's never OK
+		// to pass them to C.
+		panic(errorString(msg))
+	case kindFunc:
+		if indir {
+			p = *(*unsafe.Pointer)(p)
+		}
+		if !cgoIsGoPointer(p) {
+			return
+		}
+		panic(errorString(msg))
+	case kindInterface:
+		it := *(**_type)(p)
+		if it == nil {
+			return
+		}
+		// A type known at compile time is OK since it's
+		// constant. A type not known at compile time will be
+		// in the heap and will not be OK.
+		if inheap(uintptr(unsafe.Pointer(it))) {
+			panic(errorString(msg))
+		}
+		p = *(*unsafe.Pointer)(add(p, sys.PtrSize))
+		if !cgoIsGoPointer(p) {
+			return
+		}
+		if !top {
+			panic(errorString(msg))
+		}
+		cgoCheckArg(it, p, it.kind&kindDirectIface == 0, false, msg)
+	case kindSlice:
+		st := (*slicetype)(unsafe.Pointer(t))
+		s := (*slice)(p)
+		p = s.array
+		if !cgoIsGoPointer(p) {
+			return
+		}
+		if !top {
+			panic(errorString(msg))
+		}
+		if st.elem.kind&kindNoPointers != 0 {
+			return
+		}
+		for i := 0; i < s.cap; i++ {
+			cgoCheckArg(st.elem, p, true, false, msg)
+			p = add(p, st.elem.size)
+		}
+	case kindString:
+		ss := (*stringStruct)(p)
+		if !cgoIsGoPointer(ss.str) {
+			return
+		}
+		if !top {
+			panic(errorString(msg))
+		}
+	case kindStruct:
+		st := (*structtype)(unsafe.Pointer(t))
+		if !indir {
+			if len(st.fields) != 1 {
+				throw("can't happen")
+			}
+			cgoCheckArg(st.fields[0].typ, p, st.fields[0].typ.kind&kindDirectIface == 0, top, msg)
+			return
+		}
+		for _, f := range st.fields {
+			cgoCheckArg(f.typ, add(p, f.offset), true, top, msg)
+		}
+	case kindPtr, kindUnsafePointer:
+		if indir {
+			p = *(*unsafe.Pointer)(p)
+		}
+
+		if !cgoIsGoPointer(p) {
+			return
+		}
+		if !top {
+			panic(errorString(msg))
+		}
+
+		cgoCheckUnknownPointer(p, msg)
+	}
+}
+
+// cgoCheckUnknownPointer is called for an arbitrary pointer into Go
+// memory. It checks whether that Go memory contains any other
+// pointer into Go memory. If it does, we panic.
+// The return values are unused but useful to see in panic tracebacks.
+func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) {
+	if cgoInRange(p, mheap_.arena_start, mheap_.arena_used) {
+		if !inheap(uintptr(p)) {
+			// On 32-bit systems it is possible for C's allocated memory
+			// to have addresses between arena_start and arena_used.
+			// Either this pointer is a stack or an unused span or it's
+			// a C allocation. Escape analysis should prevent the first,
+			// garbage collection should prevent the second,
+			// and the third is completely OK.
+			return
+		}
+
+		b, hbits, span, _ := heapBitsForObject(uintptr(p), 0, 0, false)
+		base = b
+		if base == 0 {
+			return
+		}
+		n := span.elemsize
+		for i = uintptr(0); i < n; i += sys.PtrSize {
+			if i != 1*sys.PtrSize && !hbits.morePointers() {
+				// No more possible pointers.
+				break
+			}
+			if hbits.isPointer() {
+				if cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(base + i))) {
+					panic(errorString(msg))
+				}
+			}
+			hbits = hbits.next()
+		}
+
+		return
+	}
+
+	roots := gcRoots
+	for roots != nil {
+		for j := 0; j < roots.count; j++ {
+			pr := roots.roots[j]
+			addr := uintptr(pr.decl)
+			if cgoInRange(p, addr, addr+pr.size) {
+				cgoCheckBits(pr.decl, pr.gcdata, 0, pr.ptrdata)
+				return
+			}
+		}
+		roots = roots.next
+	}
+
+	return
+}
+
+// cgoIsGoPointer returns whether the pointer is a Go pointer--a
+// pointer to Go memory. We only care about Go memory that might
+// contain pointers.
+//go:nosplit
+//go:nowritebarrierrec
+func cgoIsGoPointer(p unsafe.Pointer) bool {
+	if p == nil {
+		return false
+	}
+
+	if inHeapOrStack(uintptr(p)) {
+		return true
+	}
+
+	roots := gcRoots
+	for roots != nil {
+		for i := 0; i < roots.count; i++ {
+			pr := roots.roots[i]
+			addr := uintptr(pr.decl)
+			if cgoInRange(p, addr, addr+pr.size) {
+				return true
+			}
+		}
+		roots = roots.next
+	}
+
+	return false
+}
+
+// cgoInRange returns whether p is between start and end.
+//go:nosplit
+//go:nowritebarrierrec
+func cgoInRange(p unsafe.Pointer, start, end uintptr) bool {
+	return start <= uintptr(p) && uintptr(p) < end
+}
+
+// cgoCheckResult is called to check the result parameter of an
+// exported Go function. It panics if the result is or contains a Go
+// pointer.
+func cgoCheckResult(val interface{}) {
+	if debug.cgocheck == 0 {
+		return
+	}
+
+	ep := (*eface)(unsafe.Pointer(&val))
+	t := ep._type
+	cgoCheckArg(t, ep.data, t.kind&kindDirectIface == 0, false, cgoResultFail)
+}
diff --git a/libgo/go/runtime/cgocheck.go b/libgo/go/runtime/cgocheck.go
index fec3646..09d444d 100644
--- a/libgo/go/runtime/cgocheck.go
+++ b/libgo/go/runtime/cgocheck.go
@@ -2,8 +2,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build ignore
-
 // Code to check that pointer writes follow the cgo rules.
 // These functions are invoked via the write barrier when debug.cgocheck > 1.
 
@@ -110,17 +108,18 @@
 	}
 
 	// The type has a GC program. Try to find GC bits somewhere else.
-	for _, datap := range activeModules() {
-		if cgoInRange(src, datap.data, datap.edata) {
-			doff := uintptr(src) - datap.data
-			cgoCheckBits(add(src, -doff), datap.gcdatamask.bytedata, off+doff, size)
-			return
+	roots := gcRoots
+	for roots != nil {
+		for i := 0; i < roots.count; i++ {
+			pr := roots.roots[i]
+			addr := uintptr(pr.decl)
+			if cgoInRange(src, addr, addr+pr.size) {
+				doff := uintptr(src) - addr
+				cgoCheckBits(add(src, -doff), pr.gcdata, off+doff, size)
+				return
+			}
 		}
-		if cgoInRange(src, datap.bss, datap.ebss) {
-			boff := uintptr(src) - datap.bss
-			cgoCheckBits(add(src, -boff), datap.gcbssmask.bytedata, off+boff, size)
-			return
-		}
+		roots = roots.next
 	}
 
 	aoff := uintptr(src) - mheap_.arena_start
diff --git a/libgo/go/runtime/export_test.go b/libgo/go/runtime/export_test.go
index 69a13ac..bf435f4 100644
--- a/libgo/go/runtime/export_test.go
+++ b/libgo/go/runtime/export_test.go
@@ -183,7 +183,6 @@
 
 // For benchmarking.
 
-/*
 func BenchSetType(n int, x interface{}) {
 	e := *efaceOf(&x)
 	t := e._type
@@ -214,7 +213,6 @@
 const PtrSize = sys.PtrSize
 
 var ForceGCPeriod = &forcegcperiod
-*/
 
 // SetTracebackEnv is like runtime/debug.SetTraceback, but it raises
 // the "environment" traceback level, so later calls to
@@ -227,7 +225,6 @@
 var ReadUnaligned32 = readUnaligned32
 var ReadUnaligned64 = readUnaligned64
 
-/*
 func CountPagesInUse() (pagesInUse, counted uintptr) {
 	stopTheWorld("CountPagesInUse")
 
@@ -243,7 +240,6 @@
 
 	return
 }
-*/
 
 // BlockOnSystemStack switches to the system stack, prints "x\n" to
 // stderr, and blocks in a stack containing
diff --git a/libgo/go/runtime/gc_test.go b/libgo/go/runtime/gc_test.go
index 2a6acf0..ec043ed 100644
--- a/libgo/go/runtime/gc_test.go
+++ b/libgo/go/runtime/gc_test.go
@@ -434,8 +434,6 @@
 	}
 }
 
-/*
-
 func TestPageAccounting(t *testing.T) {
 	// Grow the heap in small increments. This used to drop the
 	// pages-in-use count below zero because of a rounding
@@ -452,5 +450,3 @@
 		t.Fatalf("mheap_.pagesInUse is %d, but direct count is %d", pagesInUse, counted)
 	}
 }
-
-*/
diff --git a/libgo/go/runtime/heapdump.go b/libgo/go/runtime/heapdump.go
new file mode 100644
index 0000000..0db53f5
--- /dev/null
+++ b/libgo/go/runtime/heapdump.go
@@ -0,0 +1,594 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Implementation of runtime/debug.WriteHeapDump. Writes all
+// objects in the heap plus additional info (roots, threads,
+// finalizers, etc.) to a file.
+
+// The format of the dumped file is described at
+// https://golang.org/s/go15heapdump.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+//go:linkname runtime_debug_WriteHeapDump runtime_debug.WriteHeapDump
+func runtime_debug_WriteHeapDump(fd uintptr) {
+	stopTheWorld("write heap dump")
+
+	systemstack(func() {
+		writeheapdump_m(fd)
+	})
+
+	startTheWorld()
+}
+
+const (
+	fieldKindEol       = 0
+	fieldKindPtr       = 1
+	fieldKindIface     = 2
+	fieldKindEface     = 3
+	tagEOF             = 0
+	tagObject          = 1
+	tagOtherRoot       = 2
+	tagType            = 3
+	tagGoroutine       = 4
+	tagStackFrame      = 5
+	tagParams          = 6
+	tagFinalizer       = 7
+	tagItab            = 8
+	tagOSThread        = 9
+	tagMemStats        = 10
+	tagQueuedFinalizer = 11
+	tagData            = 12
+	tagBSS             = 13
+	tagDefer           = 14
+	tagPanic           = 15
+	tagMemProf         = 16
+	tagAllocSample     = 17
+)
+
+var dumpfd uintptr // fd to write the dump to.
+var tmpbuf []byte
+
+// buffer of pending write data
+const (
+	bufSize = 4096
+)
+
+var buf [bufSize]byte
+var nbuf uintptr
+
+func dwrite(data unsafe.Pointer, len uintptr) {
+	if len == 0 {
+		return
+	}
+	if nbuf+len <= bufSize {
+		copy(buf[nbuf:], (*[bufSize]byte)(data)[:len])
+		nbuf += len
+		return
+	}
+
+	write(dumpfd, unsafe.Pointer(&buf), int32(nbuf))
+	if len >= bufSize {
+		write(dumpfd, data, int32(len))
+		nbuf = 0
+	} else {
+		copy(buf[:], (*[bufSize]byte)(data)[:len])
+		nbuf = len
+	}
+}
+
+func dwritebyte(b byte) {
+	dwrite(unsafe.Pointer(&b), 1)
+}
+
+func flush() {
+	write(dumpfd, unsafe.Pointer(&buf), int32(nbuf))
+	nbuf = 0
+}
+
+// Cache of types that have been serialized already.
+// We use a type's hash field to pick a bucket.
+// Inside a bucket, we keep a list of types that
+// have been serialized so far, most recently used first.
+// Note: when a bucket overflows we may end up
+// serializing a type more than once. That's ok.
+const (
+	typeCacheBuckets = 256
+	typeCacheAssoc   = 4
+)
+
+type typeCacheBucket struct {
+	t [typeCacheAssoc]*_type
+}
+
+var typecache [typeCacheBuckets]typeCacheBucket
+
+// dump a uint64 in a varint format parseable by encoding/binary
+func dumpint(v uint64) {
+	var buf [10]byte
+	var n int
+	for v >= 0x80 {
+		buf[n] = byte(v | 0x80)
+		n++
+		v >>= 7
+	}
+	buf[n] = byte(v)
+	n++
+	dwrite(unsafe.Pointer(&buf), uintptr(n))
+}
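
Because this is the standard base-128 varint, a dump reader can decode each value with encoding/binary; a minimal sketch, assuming the dump file has been opened as f:

	r := bufio.NewReader(f)
	v, err := binary.ReadUvarint(r)
	if err != nil {
		// truncated or corrupt dump
	}
	_ = v
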
+
+func dumpbool(b bool) {
+	if b {
+		dumpint(1)
+	} else {
+		dumpint(0)
+	}
+}
+
+// dump varint uint64 length followed by memory contents
+func dumpmemrange(data unsafe.Pointer, len uintptr) {
+	dumpint(uint64(len))
+	dwrite(data, len)
+}
+
+func dumpslice(b []byte) {
+	dumpint(uint64(len(b)))
+	if len(b) > 0 {
+		dwrite(unsafe.Pointer(&b[0]), uintptr(len(b)))
+	}
+}
+
+func dumpstr(s string) {
+	sp := stringStructOf(&s)
+	dumpmemrange(sp.str, uintptr(sp.len))
+}
+
+// dump information for a type
+func dumptype(t *_type) {
+	if t == nil {
+		return
+	}
+
+	// If we've definitely serialized the type before,
+	// no need to do it again.
+	b := &typecache[t.hash&(typeCacheBuckets-1)]
+	if t == b.t[0] {
+		return
+	}
+	for i := 1; i < typeCacheAssoc; i++ {
+		if t == b.t[i] {
+			// Move-to-front
+			for j := i; j > 0; j-- {
+				b.t[j] = b.t[j-1]
+			}
+			b.t[0] = t
+			return
+		}
+	}
+
+	// Might not have been dumped yet. Dump it and
+	// remember we did so.
+	for j := typeCacheAssoc - 1; j > 0; j-- {
+		b.t[j] = b.t[j-1]
+	}
+	b.t[0] = t
+
+	// dump the type
+	dumpint(tagType)
+	dumpint(uint64(uintptr(unsafe.Pointer(t))))
+	dumpint(uint64(t.size))
+	if x := t.uncommontype; x == nil || t.pkgPath == nil || *t.pkgPath == "" {
+		dumpstr(*t.string)
+	} else {
+		pkgpathstr := *t.pkgPath
+		pkgpath := stringStructOf(&pkgpathstr)
+		namestr := *t.name
+		name := stringStructOf(&namestr)
+		dumpint(uint64(uintptr(pkgpath.len) + 1 + uintptr(name.len)))
+		dwrite(pkgpath.str, uintptr(pkgpath.len))
+		dwritebyte('.')
+		dwrite(name.str, uintptr(name.len))
+	}
+	dumpbool(t.kind&kindDirectIface == 0 || t.kind&kindNoPointers == 0)
+}
+
+// dump an object
+func dumpobj(obj unsafe.Pointer, size uintptr, bv bitvector) {
+	dumpbvtypes(&bv, obj)
+	dumpint(tagObject)
+	dumpint(uint64(uintptr(obj)))
+	dumpmemrange(obj, size)
+	dumpfields(bv)
+}
+
+func dumpotherroot(description string, to unsafe.Pointer) {
+	dumpint(tagOtherRoot)
+	dumpstr(description)
+	dumpint(uint64(uintptr(to)))
+}
+
+func dumpfinalizer(obj unsafe.Pointer, fn *funcval, ft *functype, ot *ptrtype) {
+	dumpint(tagFinalizer)
+	dumpint(uint64(uintptr(obj)))
+	dumpint(uint64(uintptr(unsafe.Pointer(fn))))
+	dumpint(uint64(uintptr(unsafe.Pointer(fn.fn))))
+	dumpint(uint64(uintptr(unsafe.Pointer(ft))))
+	dumpint(uint64(uintptr(unsafe.Pointer(ot))))
+}
+
+type childInfo struct {
+	// Information passed up from the callee frame about
+	// the layout of the outargs region.
+	argoff uintptr   // where the arguments start in the frame
+	arglen uintptr   // size of args region
+	args   bitvector // if args.n >= 0, pointer map of args region
+	sp     *uint8    // callee sp
+	depth  uintptr   // depth in call stack (0 == most recent)
+}
+
+// dump kinds & offsets of interesting fields in bv
+func dumpbv(cbv *bitvector, offset uintptr) {
+	bv := gobv(*cbv)
+	for i := uintptr(0); i < bv.n; i++ {
+		if bv.bytedata[i/8]>>(i%8)&1 == 1 {
+			dumpint(fieldKindPtr)
+			dumpint(uint64(offset + i*sys.PtrSize))
+		}
+	}
+}
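
For example (illustrative), a bitvector with bits 0 and 3 set yields, on a 64-bit target,

	// dumpint(fieldKindPtr); dumpint(offset + 0)
	// dumpint(fieldKindPtr); dumpint(offset + 24)   // 3 * sys.PtrSize

and dumpfields later in this file then closes the list with fieldKindEol.
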
+
+func dumpgoroutine(gp *g) {
+	sp := gp.syscallsp
+
+	dumpint(tagGoroutine)
+	dumpint(uint64(uintptr(unsafe.Pointer(gp))))
+	dumpint(uint64(sp))
+	dumpint(uint64(gp.goid))
+	dumpint(uint64(gp.gopc))
+	dumpint(uint64(readgstatus(gp)))
+	dumpbool(isSystemGoroutine(gp))
+	dumpbool(false) // isbackground
+	dumpint(uint64(gp.waitsince))
+	dumpstr(gp.waitreason)
+	dumpint(0)
+	dumpint(uint64(uintptr(unsafe.Pointer(gp.m))))
+	dumpint(uint64(uintptr(unsafe.Pointer(gp._defer))))
+	dumpint(uint64(uintptr(unsafe.Pointer(gp._panic))))
+
+	// dump defer & panic records
+	for d := gp._defer; d != nil; d = d.link {
+		dumpint(tagDefer)
+		dumpint(uint64(uintptr(unsafe.Pointer(d))))
+		dumpint(uint64(uintptr(unsafe.Pointer(gp))))
+		dumpint(0)
+		dumpint(0)
+		dumpint(uint64(uintptr(unsafe.Pointer(d.pfn))))
+		dumpint(0)
+		dumpint(uint64(uintptr(unsafe.Pointer(d.link))))
+	}
+	for p := gp._panic; p != nil; p = p.link {
+		dumpint(tagPanic)
+		dumpint(uint64(uintptr(unsafe.Pointer(p))))
+		dumpint(uint64(uintptr(unsafe.Pointer(gp))))
+		eface := efaceOf(&p.arg)
+		dumpint(uint64(uintptr(unsafe.Pointer(eface._type))))
+		dumpint(uint64(uintptr(unsafe.Pointer(eface.data))))
+		dumpint(0) // was p->defer, no longer recorded
+		dumpint(uint64(uintptr(unsafe.Pointer(p.link))))
+	}
+}
+
+func dumpgs() {
+	// goroutines & stacks
+	for i := 0; uintptr(i) < allglen; i++ {
+		gp := allgs[i]
+		status := readgstatus(gp) // The world is stopped so gp will not be in a scan state.
+		switch status {
+		default:
+			print("runtime: unexpected G.status ", hex(status), "\n")
+			throw("dumpgs in STW - bad status")
+		case _Gdead:
+			// ok
+		case _Grunnable,
+			_Gsyscall,
+			_Gwaiting:
+			dumpgoroutine(gp)
+		}
+	}
+}
+
+func finq_callback(fn *funcval, obj unsafe.Pointer, ft *functype, ot *ptrtype) {
+	dumpint(tagQueuedFinalizer)
+	dumpint(uint64(uintptr(obj)))
+	dumpint(uint64(uintptr(unsafe.Pointer(fn))))
+	dumpint(uint64(uintptr(unsafe.Pointer(fn.fn))))
+	dumpint(uint64(uintptr(unsafe.Pointer(ft))))
+	dumpint(uint64(uintptr(unsafe.Pointer(ot))))
+}
+
+func dumproots() {
+	// MSpan.types
+	for _, s := range mheap_.allspans {
+		if s.state == _MSpanInUse {
+			// Finalizers
+			for sp := s.specials; sp != nil; sp = sp.next {
+				if sp.kind != _KindSpecialFinalizer {
+					continue
+				}
+				spf := (*specialfinalizer)(unsafe.Pointer(sp))
+				p := unsafe.Pointer(s.base() + uintptr(spf.special.offset))
+				dumpfinalizer(p, spf.fn, spf.ft, spf.ot)
+			}
+		}
+	}
+
+	// Finalizer queue
+	iterate_finq(finq_callback)
+}
+
+// Bit vector of free marks.
+// Needs to be as big as the largest number of objects per span.
+var freemark [_PageSize / 8]bool
+
+func dumpobjs() {
+	for _, s := range mheap_.allspans {
+		if s.state != _MSpanInUse {
+			continue
+		}
+		p := s.base()
+		size := s.elemsize
+		n := (s.npages << _PageShift) / size
+		if n > uintptr(len(freemark)) {
+			throw("freemark array doesn't have enough entries")
+		}
+
+		for freeIndex := uintptr(0); freeIndex < s.nelems; freeIndex++ {
+			if s.isFree(freeIndex) {
+				freemark[freeIndex] = true
+			}
+		}
+
+		for j := uintptr(0); j < n; j, p = j+1, p+size {
+			if freemark[j] {
+				freemark[j] = false
+				continue
+			}
+			dumpobj(unsafe.Pointer(p), size, makeheapobjbv(p, size))
+		}
+	}
+}
+
+func dumpparams() {
+	dumpint(tagParams)
+	x := uintptr(1)
+	if *(*byte)(unsafe.Pointer(&x)) == 1 {
+		dumpbool(false) // little-endian ptrs
+	} else {
+		dumpbool(true) // big-endian ptrs
+	}
+	dumpint(sys.PtrSize)
+	dumpint(uint64(mheap_.arena_start))
+	dumpint(uint64(mheap_.arena_used))
+	dumpstr(sys.GOARCH)
+	dumpstr(sys.Goexperiment)
+	dumpint(uint64(ncpu))
+}
+
+func dumpms() {
+	for mp := allm; mp != nil; mp = mp.alllink {
+		dumpint(tagOSThread)
+		dumpint(uint64(uintptr(unsafe.Pointer(mp))))
+		dumpint(uint64(mp.id))
+		dumpint(mp.procid)
+	}
+}
+
+func dumpmemstats() {
+	dumpint(tagMemStats)
+	dumpint(memstats.alloc)
+	dumpint(memstats.total_alloc)
+	dumpint(memstats.sys)
+	dumpint(memstats.nlookup)
+	dumpint(memstats.nmalloc)
+	dumpint(memstats.nfree)
+	dumpint(memstats.heap_alloc)
+	dumpint(memstats.heap_sys)
+	dumpint(memstats.heap_idle)
+	dumpint(memstats.heap_inuse)
+	dumpint(memstats.heap_released)
+	dumpint(memstats.heap_objects)
+	dumpint(memstats.stacks_inuse)
+	dumpint(memstats.stacks_sys)
+	dumpint(memstats.mspan_inuse)
+	dumpint(memstats.mspan_sys)
+	dumpint(memstats.mcache_inuse)
+	dumpint(memstats.mcache_sys)
+	dumpint(memstats.buckhash_sys)
+	dumpint(memstats.gc_sys)
+	dumpint(memstats.other_sys)
+	dumpint(memstats.next_gc)
+	dumpint(memstats.last_gc)
+	dumpint(memstats.pause_total_ns)
+	for i := 0; i < 256; i++ {
+		dumpint(memstats.pause_ns[i])
+	}
+	dumpint(uint64(memstats.numgc))
+}
+
+func dumpmemprof_callback(b *bucket, nstk uintptr, pstk *location, size, allocs, frees uintptr) {
+	stk := (*[100000]location)(unsafe.Pointer(pstk))
+	dumpint(tagMemProf)
+	dumpint(uint64(uintptr(unsafe.Pointer(b))))
+	dumpint(uint64(size))
+	dumpint(uint64(nstk))
+	for i := uintptr(0); i < nstk; i++ {
+		pc := stk[i].pc
+		fn := stk[i].function
+		file := stk[i].filename
+		line := stk[i].lineno
+		if fn == "" {
+			var buf [64]byte
+			n := len(buf)
+			n--
+			buf[n] = ')'
+			if pc == 0 {
+				n--
+				buf[n] = '0'
+			} else {
+				for pc > 0 {
+					n--
+					buf[n] = "0123456789abcdef"[pc&15]
+					pc >>= 4
+				}
+			}
+			n--
+			buf[n] = 'x'
+			n--
+			buf[n] = '0'
+			n--
+			buf[n] = '('
+			dumpslice(buf[n:])
+			dumpstr("?")
+			dumpint(0)
+		} else {
+			dumpstr(fn)
+			dumpstr(file)
+			dumpint(uint64(line))
+		}
+	}
+	dumpint(uint64(allocs))
+	dumpint(uint64(frees))
+}
+
+func dumpmemprof() {
+	iterate_memprof(dumpmemprof_callback)
+	for _, s := range mheap_.allspans {
+		if s.state != _MSpanInUse {
+			continue
+		}
+		for sp := s.specials; sp != nil; sp = sp.next {
+			if sp.kind != _KindSpecialProfile {
+				continue
+			}
+			spp := (*specialprofile)(unsafe.Pointer(sp))
+			p := s.base() + uintptr(spp.special.offset)
+			dumpint(tagAllocSample)
+			dumpint(uint64(p))
+			dumpint(uint64(uintptr(unsafe.Pointer(spp.b))))
+		}
+	}
+}
+
+var dumphdr = []byte("go1.7 heap dump\n")
+
+func mdump() {
+	// make sure we're done sweeping
+	for _, s := range mheap_.allspans {
+		if s.state == _MSpanInUse {
+			s.ensureSwept()
+		}
+	}
+	memclrNoHeapPointers(unsafe.Pointer(&typecache), unsafe.Sizeof(typecache))
+	dwrite(unsafe.Pointer(&dumphdr[0]), uintptr(len(dumphdr)))
+	dumpparams()
+	dumpobjs()
+	dumpgs()
+	dumpms()
+	dumproots()
+	dumpmemstats()
+	dumpmemprof()
+	dumpint(tagEOF)
+	flush()
+}
+
+func writeheapdump_m(fd uintptr) {
+	_g_ := getg()
+	casgstatus(_g_.m.curg, _Grunning, _Gwaiting)
+	_g_.waitreason = "dumping heap"
+
+	// Update stats so we can dump them.
+	// As a side effect, flushes all the MCaches so the MSpan.freelist
+	// lists contain all the free objects.
+	updatememstats(nil)
+
+	// Set dump file.
+	dumpfd = fd
+
+	// Call dump routine.
+	mdump()
+
+	// Reset dump file.
+	dumpfd = 0
+	if tmpbuf != nil {
+		sysFree(unsafe.Pointer(&tmpbuf[0]), uintptr(len(tmpbuf)), &memstats.other_sys)
+		tmpbuf = nil
+	}
+
+	casgstatus(_g_.m.curg, _Gwaiting, _Grunning)
+}
+
+// dumpint() the kind & offset of each field in an object.
+func dumpfields(bv bitvector) {
+	dumpbv(&bv, 0)
+	dumpint(fieldKindEol)
+}
+
+// The heap dump reader needs to be able to disambiguate
+// Eface entries. So it needs to know every type that might
+// appear in such an entry. The following routine accomplishes that.
+// TODO(rsc, khr): Delete - no longer possible.
+
+// Dump all the types that appear in the type field of
+// any Eface described by this bit vector.
+func dumpbvtypes(bv *bitvector, base unsafe.Pointer) {
+}
+
+func makeheapobjbv(p uintptr, size uintptr) bitvector {
+	// Extend the temp buffer if necessary.
+	nptr := size / sys.PtrSize
+	if uintptr(len(tmpbuf)) < nptr/8+1 {
+		if tmpbuf != nil {
+			sysFree(unsafe.Pointer(&tmpbuf[0]), uintptr(len(tmpbuf)), &memstats.other_sys)
+		}
+		n := nptr/8 + 1
+		p := sysAlloc(n, &memstats.other_sys)
+		if p == nil {
+			throw("heapdump: out of memory")
+		}
+		tmpbuf = (*[1 << 30]byte)(p)[:n]
+	}
+	// Convert heap bitmap to pointer bitmap.
+	for i := uintptr(0); i < nptr/8+1; i++ {
+		tmpbuf[i] = 0
+	}
+	i := uintptr(0)
+	hbits := heapBitsForAddr(p)
+	for ; i < nptr; i++ {
+		if i != 1 && !hbits.morePointers() {
+			break // end of object
+		}
+		if hbits.isPointer() {
+			tmpbuf[i/8] |= 1 << (i % 8)
+		}
+		hbits = hbits.next()
+	}
+	return bitvector{int32(i), &tmpbuf[0]}
+}
+
+type gobitvector struct {
+	n        uintptr
+	bytedata []uint8
+}
+
+func gobv(bv bitvector) gobitvector {
+	return gobitvector{
+		uintptr(bv.n),
+		(*[1 << 30]byte)(unsafe.Pointer(bv.bytedata))[:(bv.n+7)/8],
+	}
+}
diff --git a/libgo/go/runtime/malloc.go b/libgo/go/runtime/malloc.go
new file mode 100644
index 0000000..ed25782
--- /dev/null
+++ b/libgo/go/runtime/malloc.go
@@ -0,0 +1,998 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Memory allocator.
+//
+// This was originally based on tcmalloc, but has diverged quite a bit.
+// http://goog-perftools.sourceforge.net/doc/tcmalloc.html
+
+// The main allocator works in runs of pages.
+// Small allocation sizes (up to and including 32 kB) are
+// rounded to one of about 70 size classes, each of which
+// has its own free set of objects of exactly that size.
+// Any free page of memory can be split into a set of objects
+// of one size class, which are then managed using a free bitmap.
+//
+// The allocator's data structures are:
+//
+//	fixalloc: a free-list allocator for fixed-size off-heap objects,
+//		used to manage storage used by the allocator.
+//	mheap: the malloc heap, managed at page (8192-byte) granularity.
+//	mspan: a run of pages managed by the mheap.
+//	mcentral: collects all spans of a given size class.
+//	mcache: a per-P cache of mspans with free space.
+//	mstats: allocation statistics.
+//
+// Allocating a small object proceeds up a hierarchy of caches:
+//
+//	1. Round the size up to one of the small size classes
+//	   and look in the corresponding mspan in this P's mcache.
+//	   Scan the mspan's free bitmap to find a free slot.
+//	   If there is a free slot, allocate it.
+//	   This can all be done without acquiring a lock.
+//
+//	2. If the mspan has no free slots, obtain a new mspan
+//	   from the mcentral's list of mspans of the required size
+//	   class that have free space.
+//	   Obtaining a whole span amortizes the cost of locking
+//	   the mcentral.
+//
+//	3. If the mcentral's mspan list is empty, obtain a run
+//	   of pages from the mheap to use for the mspan.
+//
+//	4. If the mheap is empty or has no page runs large enough,
+//	   allocate a new group of pages (at least 1MB) from the
+//	   operating system. Allocating a large run of pages
+//	   amortizes the cost of talking to the operating system.
+//
+// Sweeping an mspan and freeing objects on it proceeds up a similar
+// hierarchy:
+//
+//	1. If the mspan is being swept in response to allocation, it
+//	   is returned to the mcache to satisfy the allocation.
+//
+//	2. Otherwise, if the mspan still has allocated objects in it,
+//	   it is placed on the mcentral free list for the mspan's size
+//	   class.
+//
+//	3. Otherwise, if all objects in the mspan are free, the mspan
+//	   is now "idle", so it is returned to the mheap and no longer
+//	   has a size class.
+//	   This may coalesce it with adjacent idle mspans.
+//
+//	4. If an mspan remains idle for long enough, return its pages
+//	   to the operating system.
+//
+// Allocating and freeing a large object uses the mheap
+// directly, bypassing the mcache and mcentral.
+//
+// Free object slots in an mspan are zeroed only if mspan.needzero is
+// false. If needzero is true, objects are zeroed as they are
+// allocated. There are various benefits to delaying zeroing this way:
+//
+//	1. Stack frame allocation can avoid zeroing altogether.
+//
+//	2. It exhibits better temporal locality, since the program is
+//	   probably about to write to the memory.
+//
+//	3. We don't zero pages that never get reused.
+
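
As an illustrative instance of step 1 above: on a 64-bit target, a 33-byte allocation is rounded up to the 48-byte size class, so a single 8192-byte span of that class holds 8192/48 = 170 objects, leaving 32 bytes of tail waste.
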
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// C function to get the end of the program's memory.
+func getEnd() uintptr
+
+// For gccgo, use go:linkname to rename compiler-called functions to
+// themselves, so that the compiler will export them.
+//
+//go:linkname newobject runtime.newobject
+
+// Functions called by C code.
+//go:linkname mallocgc runtime.mallocgc
+
+const (
+	debugMalloc = false
+
+	maxTinySize   = _TinySize
+	tinySizeClass = _TinySizeClass
+	maxSmallSize  = _MaxSmallSize
+
+	pageShift = _PageShift
+	pageSize  = _PageSize
+	pageMask  = _PageMask
+	// By construction, single page spans of the smallest object class
+	// have the most objects per span.
+	maxObjsPerSpan = pageSize / 8
+
+	mSpanInUse = _MSpanInUse
+
+	concurrentSweep = _ConcurrentSweep
+
+	_PageSize = 1 << _PageShift
+	_PageMask = _PageSize - 1
+
+	// _64bit = 1 on 64-bit systems, 0 on 32-bit systems
+	_64bit = 1 << (^uintptr(0) >> 63) / 2
+
+	// Tiny allocator parameters, see "Tiny allocator" comment in malloc.go.
+	_TinySize      = 16
+	_TinySizeClass = 2
+
+	_FixAllocChunk  = 16 << 10               // Chunk size for FixAlloc
+	_MaxMHeapList   = 1 << (20 - _PageShift) // Maximum page length for fixed-size list in MHeap.
+	_HeapAllocChunk = 1 << 20                // Chunk size for heap growth
+
+	// Per-P, per order stack segment cache size.
+	_StackCacheSize = 32 * 1024
+
+	// Number of orders that get caching. Order 0 is FixedStack
+	// and each successive order is twice as large.
+	// We want to cache 2KB, 4KB, 8KB, and 16KB stacks. Larger stacks
+	// will be allocated directly.
+	// Since FixedStack is different on different systems, we
+	// must vary NumStackOrders to keep the same maximum cached size.
+	//   OS               | FixedStack | NumStackOrders
+	//   -----------------+------------+---------------
+	//   linux/darwin/bsd | 2KB        | 4
+	//   windows/32       | 4KB        | 3
+	//   windows/64       | 8KB        | 2
+	//   plan9            | 4KB        | 3
+	_NumStackOrders = 4 - sys.PtrSize/4*sys.GoosWindows - 1*sys.GoosPlan9
+
+	// Number of bits in page to span calculations (4k pages).
+	// On Windows 64-bit we limit the arena to 32GB or 35 bits.
+	// Windows counts memory used by page table into committed memory
+	// of the process, so we can't reserve too much memory.
+	// See https://golang.org/issue/5402 and https://golang.org/issue/5236.
+	// On other 64-bit platforms, we limit the arena to 512GB, or 39 bits.
+	// On 32-bit, we don't bother limiting anything, so we use the full 32-bit address.
+	// The only exception is mips32 which only has access to low 2GB of virtual memory.
+	// On Darwin/arm64, we cannot reserve more than ~5GB of virtual memory,
+	// but as most devices have less than 4GB of physical memory anyway, we
+	// try to be conservative here, and only ask for a 2GB heap.
+	_MHeapMap_TotalBits = (_64bit*sys.GoosWindows)*35 + (_64bit*(1-sys.GoosWindows)*(1-sys.GoosDarwin*sys.GoarchArm64))*39 + sys.GoosDarwin*sys.GoarchArm64*31 + (1-_64bit)*(32-(sys.GoarchMips+sys.GoarchMipsle))
+	_MHeapMap_Bits      = _MHeapMap_TotalBits - _PageShift
+
+	_MaxMem = uintptr(1<<_MHeapMap_TotalBits - 1)
+
+	// Max number of threads to run garbage collection.
+	// 2, 3, and 4 are all plausible maximums depending
+	// on the hardware details of the machine. The garbage
+	// collector scales well to 32 cpus.
+	_MaxGcproc = 32
+
+	_MaxArena32 = 1<<32 - 1
+
+	// minLegalPointer is the smallest possible legal pointer.
+	// This is the smallest possible architectural page size,
+	// since we assume that the first page is never mapped.
+	//
+	// This should agree with minZeroPage in the compiler.
+	minLegalPointer uintptr = 4096
+)
+
+// physPageSize is the size in bytes of the OS's physical pages.
+// Mapping and unmapping operations must be done at multiples of
+// physPageSize.
+//
+// This must be set by the OS init code (typically in osinit) before
+// mallocinit.
+var physPageSize uintptr
+
+// OS-defined helpers:
+//
+// sysAlloc obtains a large chunk of zeroed memory from the
+// operating system, typically on the order of a hundred kilobytes
+// or a megabyte.
+// NOTE: sysAlloc returns OS-aligned memory, but the heap allocator
+// may use larger alignment, so the caller must be careful to realign the
+// memory obtained by sysAlloc.
+//
+// SysUnused notifies the operating system that the contents
+// of the memory region are no longer needed and can be reused
+// for other purposes.
+// SysUsed notifies the operating system that the contents
+// of the memory region are needed again.
+//
+// SysFree returns it unconditionally; this is only used if
+// an out-of-memory error has been detected midway through
+// an allocation. It is okay if SysFree is a no-op.
+//
+// SysReserve reserves address space without allocating memory.
+// If the pointer passed to it is non-nil, the caller wants the
+// reservation there, but SysReserve can still choose another
+// location if that one is unavailable. On some systems and in some
+// cases SysReserve will simply check that the address space is
+// available and not actually reserve it. If SysReserve returns
+// non-nil, it sets *reserved to true if the address space is
+// reserved, false if it has merely been checked.
+// NOTE: SysReserve returns OS-aligned memory, but the heap allocator
+// may use larger alignment, so the caller must be careful to realign the
+// memory obtained by sysAlloc.
+//
+// SysMap maps previously reserved address space for use.
+// The reserved argument is true if the address space was really
+// reserved, not merely checked.
+//
+// SysFault marks a (already sysAlloc'd) region to fault
+// if accessed. Used only for debugging the runtime.
+
+func mallocinit() {
+	if class_to_size[_TinySizeClass] != _TinySize {
+		throw("bad TinySizeClass")
+	}
+
+	// Not used for gccgo.
+	// testdefersizes()
+
+	// Copy class sizes out for statistics table.
+	for i := range class_to_size {
+		memstats.by_size[i].size = uint32(class_to_size[i])
+	}
+
+	// Check physPageSize.
+	if physPageSize == 0 {
+		// The OS init code failed to fetch the physical page size.
+		throw("failed to get system page size")
+	}
+	if physPageSize < minPhysPageSize {
+		print("system page size (", physPageSize, ") is smaller than minimum page size (", minPhysPageSize, ")\n")
+		throw("bad system page size")
+	}
+	if physPageSize&(physPageSize-1) != 0 {
+		print("system page size (", physPageSize, ") must be a power of 2\n")
+		throw("bad system page size")
+	}
+
+	var p, bitmapSize, spansSize, pSize, limit uintptr
+	var reserved bool
+
+	// limit = runtime.memlimit();
+	// See https://golang.org/issue/5049
+	// TODO(rsc): Fix after 1.1.
+	limit = 0
+
+	// Set up the allocation arena, a contiguous area of memory where
+	// allocated data will be found. The arena begins with a bitmap large
+	// enough to hold 2 bits per allocated word.
+	if sys.PtrSize == 8 && (limit == 0 || limit > 1<<30) {
+		// On a 64-bit machine, allocate from a single contiguous reservation.
+		// 512 GB (MaxMem) should be big enough for now.
+		//
+		// The code will work with the reservation at any address, but ask
+		// SysReserve to use 0x0000XXc000000000 if possible (XX=00...7f).
+		// Allocating a 512 GB region takes away 39 bits, and the amd64
+		// doesn't let us choose the top 17 bits, so that leaves the 9 bits
+		// in the middle of 0x00c0 for us to choose. Choosing 0x00c0 means
+		// that the valid memory addresses will begin 0x00c0, 0x00c1, ..., 0x00df.
+		// In little-endian, that's c0 00, c1 00, ..., df 00. None of those are valid
+		// UTF-8 sequences, and they are otherwise as far away from
+		// ff (likely a common byte) as possible. If that fails, we try other 0xXXc0
+		// addresses. An earlier attempt to use 0x11f8 caused out of memory errors
+		// on OS X during thread allocations.  0x00c0 causes conflicts with
+		// AddressSanitizer which reserves all memory up to 0x0100.
+		// These choices are both for debuggability and to reduce the
+		// odds of a conservative garbage collector (as is still used in gccgo)
+		// not collecting memory because some non-pointer block of memory
+		// had a bit pattern that matched a memory address.
+		//
+		// Actually we reserve 544 GB (because the bitmap ends up being 32 GB)
+		// but it hardly matters: e0 00 is not valid UTF-8 either.
+		//
+		// If this fails we fall back to the 32 bit memory mechanism
+		//
+		// However, on arm64, we ignore all this advice above and slam the
+		// allocation at 0x40 << 32 because when using 4k pages with 3-level
+		// translation buffers, the user address space is limited to 39 bits
+		// On darwin/arm64, the address space is even smaller.
+		arenaSize := round(_MaxMem, _PageSize)
+		bitmapSize = arenaSize / (sys.PtrSize * 8 / 2)
+		spansSize = arenaSize / _PageSize * sys.PtrSize
+		spansSize = round(spansSize, _PageSize)
+		for i := 0; i <= 0x7f; i++ {
+			switch {
+			case GOARCH == "arm64" && GOOS == "darwin":
+				p = uintptr(i)<<40 | uintptrMask&(0x0013<<28)
+			case GOARCH == "arm64":
+				p = uintptr(i)<<40 | uintptrMask&(0x0040<<32)
+			default:
+				p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
+			}
+			pSize = bitmapSize + spansSize + arenaSize + _PageSize
+			p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
+			if p != 0 {
+				break
+			}
+		}
+	}
+
+	if p == 0 {
+		// On a 32-bit machine, we can't typically get away
+		// with a giant virtual address space reservation.
+		// Instead we map the memory information bitmap
+		// immediately after the data segment, large enough
+		// to handle the entire 4GB address space (256 MB),
+		// along with a reservation for an initial arena.
+		// When that gets used up, we'll start asking the kernel
+		// for any memory anywhere.
+
+		// If we fail to allocate, try again with a smaller arena.
+		// This is necessary on Android L where we share a process
+		// with ART, which reserves virtual memory aggressively.
+		// In the worst case, fall back to a 0-sized initial arena,
+		// in the hope that subsequent reservations will succeed.
+		arenaSizes := [...]uintptr{
+			512 << 20,
+			256 << 20,
+			128 << 20,
+			0,
+		}
+
+		for _, arenaSize := range &arenaSizes {
+			bitmapSize = (_MaxArena32 + 1) / (sys.PtrSize * 8 / 2)
+			spansSize = (_MaxArena32 + 1) / _PageSize * sys.PtrSize
+			if limit > 0 && arenaSize+bitmapSize+spansSize > limit {
+				bitmapSize = (limit / 9) &^ ((1 << _PageShift) - 1)
+				arenaSize = bitmapSize * 8
+				spansSize = arenaSize / _PageSize * sys.PtrSize
+			}
+			spansSize = round(spansSize, _PageSize)
+
+			// SysReserve treats the address we ask for, end, as a hint,
+			// not as an absolute requirement. If we ask for the end
+			// of the data segment but the operating system requires
+			// a little more space before we can start allocating, it will
+			// give out a slightly higher pointer. Except QEMU, which
+			// is buggy, as usual: it won't adjust the pointer upward.
+			// So adjust it upward a little bit ourselves: 1/4 MB to get
+			// away from the running binary image and then round up
+			// to a MB boundary.
+			p = round(getEnd()+(1<<18), 1<<20)
+			pSize = bitmapSize + spansSize + arenaSize + _PageSize
+			p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
+			if p != 0 {
+				break
+			}
+		}
+		if p == 0 {
+			throw("runtime: cannot reserve arena virtual address space")
+		}
+	}
+
+	// PageSize can be larger than OS definition of page size,
+	// so SysReserve can give us a PageSize-unaligned pointer.
+	// To overcome this we ask for PageSize more and round up the pointer.
+	p1 := round(p, _PageSize)
+
+	spansStart := p1
+	mheap_.bitmap = p1 + spansSize + bitmapSize
+	if sys.PtrSize == 4 {
+		// Set arena_start such that we can accept memory
+		// reservations located anywhere in the 4GB virtual space.
+		mheap_.arena_start = 0
+	} else {
+		mheap_.arena_start = p1 + (spansSize + bitmapSize)
+	}
+	mheap_.arena_end = p + pSize
+	mheap_.arena_used = p1 + (spansSize + bitmapSize)
+	mheap_.arena_reserved = reserved
+
+	if mheap_.arena_start&(_PageSize-1) != 0 {
+		println("bad pagesize", hex(p), hex(p1), hex(spansSize), hex(bitmapSize), hex(_PageSize), "start", hex(mheap_.arena_start))
+		throw("misrounded allocation in mallocinit")
+	}
+
+	// Initialize the rest of the allocator.
+	mheap_.init(spansStart, spansSize)
+	_g_ := getg()
+	_g_.m.mcache = allocmcache()
+}
+
+// sysAlloc allocates the next n bytes from the heap arena. The
+// returned pointer is always _PageSize aligned and between
+// h.arena_start and h.arena_end. sysAlloc returns nil on failure.
+// There is no corresponding free function.
+func (h *mheap) sysAlloc(n uintptr) unsafe.Pointer {
+	if n > h.arena_end-h.arena_used {
+		// We are in 32-bit mode, maybe we didn't use all possible address space yet.
+		// Reserve some more space.
+		p_size := round(n+_PageSize, 256<<20)
+		new_end := h.arena_end + p_size // Careful: can overflow
+		if h.arena_end <= new_end && new_end-h.arena_start-1 <= _MaxArena32 {
+			// TODO: It would be bad if part of the arena
+			// is reserved and part is not.
+			var reserved bool
+			p := uintptr(sysReserve(unsafe.Pointer(h.arena_end), p_size, &reserved))
+			if p == 0 {
+				return nil
+			}
+			if p == h.arena_end {
+				h.arena_end = new_end
+				h.arena_reserved = reserved
+			} else if h.arena_start <= p && p+p_size-h.arena_start-1 <= _MaxArena32 {
+				// Keep everything page-aligned.
+				// Our pages are bigger than hardware pages.
+				h.arena_end = p + p_size
+				used := p + (-p & (_PageSize - 1))
+				h.mapBits(used)
+				h.mapSpans(used)
+				h.arena_used = used
+				h.arena_reserved = reserved
+			} else {
+				// We haven't added this allocation to
+				// the stats, so subtract it from a
+				// fake stat (but avoid underflow).
+				stat := uint64(p_size)
+				sysFree(unsafe.Pointer(p), p_size, &stat)
+			}
+		}
+	}
+
+	if n <= h.arena_end-h.arena_used {
+		// Keep taking from our reservation.
+		p := h.arena_used
+		sysMap(unsafe.Pointer(p), n, h.arena_reserved, &memstats.heap_sys)
+		h.mapBits(p + n)
+		h.mapSpans(p + n)
+		h.arena_used = p + n
+		if raceenabled {
+			racemapshadow(unsafe.Pointer(p), n)
+		}
+
+		if p&(_PageSize-1) != 0 {
+			throw("misrounded allocation in MHeap_SysAlloc")
+		}
+		return unsafe.Pointer(p)
+	}
+
+	// If using 64-bit, our reservation is all we have.
+	if h.arena_end-h.arena_start > _MaxArena32 {
+		return nil
+	}
+
+	// On 32-bit, once the reservation is gone we can
+	// try to get memory at a location chosen by the OS.
+	p_size := round(n, _PageSize) + _PageSize
+	p := uintptr(sysAlloc(p_size, &memstats.heap_sys))
+	if p == 0 {
+		return nil
+	}
+
+	if p < h.arena_start || p+p_size-h.arena_start > _MaxArena32 {
+		top := ^uintptr(0)
+		if top-h.arena_start-1 > _MaxArena32 {
+			top = h.arena_start + _MaxArena32 + 1
+		}
+		print("runtime: memory allocated by OS (", hex(p), ") not in usable range [", hex(h.arena_start), ",", hex(top), ")\n")
+		sysFree(unsafe.Pointer(p), p_size, &memstats.heap_sys)
+		return nil
+	}
+
+	p_end := p + p_size
+	p += -p & (_PageSize - 1)
+	if p+n > h.arena_used {
+		h.mapBits(p + n)
+		h.mapSpans(p + n)
+		h.arena_used = p + n
+		if p_end > h.arena_end {
+			h.arena_end = p_end
+		}
+		if raceenabled {
+			racemapshadow(unsafe.Pointer(p), n)
+		}
+	}
+
+	if p&(_PageSize-1) != 0 {
+		throw("misrounded allocation in MHeap_SysAlloc")
+	}
+	return unsafe.Pointer(p)
+}
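
The arena code above repeatedly aligns addresses upward to _PageSize, either with round(p, _PageSize) or with the equivalent p += -p & (_PageSize-1) trick. A standalone sketch of that arithmetic (hypothetical names, assuming the runtime's 8 KiB page size):

package main

import "fmt"

// roundUp rounds n up to a multiple of a, where a is a power of two,
// the same way the runtime's round helper does.
func roundUp(n, a uintptr) uintptr {
	return (n + a - 1) &^ (a - 1)
}

func main() {
	const pageSize = 8192 // assumed: the runtime's _PageSize
	p := uintptr(0x1234567)
	// Two equivalent ways to align p upward to a page boundary.
	fmt.Printf("%#x %#x\n", roundUp(p, pageSize), p+(-p&(pageSize-1)))
}
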
+
+// base address for all 0-byte allocations
+var zerobase uintptr
+
+// nextFreeFast returns the next free object if one is quickly available.
+// Otherwise it returns 0.
+func nextFreeFast(s *mspan) gclinkptr {
+	theBit := sys.Ctz64(s.allocCache) // Is there a free object in the allocCache?
+	if theBit < 64 {
+		result := s.freeindex + uintptr(theBit)
+		if result < s.nelems {
+			freeidx := result + 1
+			if freeidx%64 == 0 && freeidx != s.nelems {
+				return 0
+			}
+			s.allocCache >>= (theBit + 1)
+			s.freeindex = freeidx
+			v := gclinkptr(result*s.elemsize + s.base())
+			s.allocCount++
+			return v
+		}
+	}
+	return 0
+}
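
nextFreeFast treats allocCache as a 64-bit window over the span's allocation bits, with a set bit meaning "free", and finds the next free slot with a count-trailing-zeros instruction. A minimal standalone model of that scan, using math/bits in place of the runtime's sys.Ctz64 (the span type here is illustrative, not the runtime's mspan):

package main

import (
	"fmt"
	"math/bits"
)

// span models only the fields the scan needs: bit i of allocCache set
// means object freeindex+i is free.
type span struct {
	allocCache uint64
	freeindex  uint64
	nelems     uint64
}

// nextFree returns the index of the next free object and advances the
// cache, or nelems when the cache has no usable bit left (the runtime
// would refill the cache from allocBits at that point).
func (s *span) nextFree() uint64 {
	bit := uint64(bits.TrailingZeros64(s.allocCache))
	if bit == 64 {
		return s.nelems
	}
	idx := s.freeindex + bit
	if idx >= s.nelems {
		return s.nelems
	}
	s.allocCache >>= bit + 1
	s.freeindex = idx + 1
	return idx
}

func main() {
	s := &span{allocCache: 0b10110, nelems: 64}
	for i := 0; i < 4; i++ {
		fmt.Println(s.nextFree()) // 1, 2, 4, then 64 (cache exhausted)
	}
}
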
+
+// nextFree returns the next free object from the cached span if one is available.
+// Otherwise it refills the cache with a span with an available object and
+// returns that object along with a flag indicating that this was a heavy
+// weight allocation. If it is a heavy weight allocation the caller must
+// determine whether a new GC cycle needs to be started or if the GC is active
+// whether this goroutine needs to assist the GC.
+func (c *mcache) nextFree(sizeclass uint8) (v gclinkptr, s *mspan, shouldhelpgc bool) {
+	s = c.alloc[sizeclass]
+	shouldhelpgc = false
+	freeIndex := s.nextFreeIndex()
+	if freeIndex == s.nelems {
+		// The span is full.
+		if uintptr(s.allocCount) != s.nelems {
+			println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
+			throw("s.allocCount != s.nelems && freeIndex == s.nelems")
+		}
+		systemstack(func() {
+			c.refill(int32(sizeclass))
+		})
+		shouldhelpgc = true
+		s = c.alloc[sizeclass]
+
+		freeIndex = s.nextFreeIndex()
+	}
+
+	if freeIndex >= s.nelems {
+		throw("freeIndex is not valid")
+	}
+
+	v = gclinkptr(freeIndex*s.elemsize + s.base())
+	s.allocCount++
+	if uintptr(s.allocCount) > s.nelems {
+		println("s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
+		throw("s.allocCount > s.nelems")
+	}
+	return
+}
+
+// Allocate an object of size bytes.
+// Small objects are allocated from the per-P cache's free lists.
+// Large objects (> 32 kB) are allocated straight from the heap.
+func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
+	if gcphase == _GCmarktermination {
+		throw("mallocgc called with gcphase == _GCmarktermination")
+	}
+
+	if size == 0 {
+		return unsafe.Pointer(&zerobase)
+	}
+
+	if debug.sbrk != 0 {
+		align := uintptr(16)
+		if typ != nil {
+			align = uintptr(typ.align)
+		}
+		return persistentalloc(size, align, &memstats.other_sys)
+	}
+
+	// When using gccgo, when a cgo or SWIG function has an
+	// interface return type and the function returns a
+	// non-pointer, memory allocation occurs after syscall.Cgocall
+	// but before syscall.CgocallDone. Treat this allocation as a
+	// callback.
+	incallback := false
+	if gomcache() == nil && getg().m.ncgo > 0 {
+		exitsyscall(0)
+		incallback = true
+	}
+
+	// assistG is the G to charge for this allocation, or nil if
+	// GC is not currently active.
+	var assistG *g
+	if gcBlackenEnabled != 0 {
+		// Charge the current user G for this allocation.
+		assistG = getg()
+		if assistG.m.curg != nil {
+			assistG = assistG.m.curg
+		}
+		// Charge the allocation against the G. We'll account
+		// for internal fragmentation at the end of mallocgc.
+		assistG.gcAssistBytes -= int64(size)
+
+		if assistG.gcAssistBytes < 0 {
+			// This G is in debt. Assist the GC to correct
+			// this before allocating. This must happen
+			// before disabling preemption.
+			gcAssistAlloc(assistG)
+		}
+	}
+
+	// Set mp.mallocing to keep from being preempted by GC.
+	mp := acquirem()
+	if mp.mallocing != 0 {
+		throw("malloc deadlock")
+	}
+	if mp.gsignal == getg() {
+		throw("malloc during signal")
+	}
+	mp.mallocing = 1
+
+	shouldhelpgc := false
+	dataSize := size
+	c := gomcache()
+	var x unsafe.Pointer
+	noscan := typ == nil || typ.kind&kindNoPointers != 0
+	if size <= maxSmallSize {
+		if noscan && size < maxTinySize {
+			// Tiny allocator.
+			//
+			// Tiny allocator combines several tiny allocation requests
+			// into a single memory block. The resulting memory block
+			// is freed when all subobjects are unreachable. The subobjects
+			// must be noscan (don't have pointers); this ensures that
+			// the amount of potentially wasted memory is bounded.
+			//
+			// Size of the memory block used for combining (maxTinySize) is tunable.
+			// Current setting is 16 bytes, which relates to 2x worst case memory
+			// wastage (when all but one subobjects are unreachable).
+			// 8 bytes would result in no wastage at all, but provides less
+			// opportunities for combining.
+			// 32 bytes provides more opportunities for combining,
+			// but can lead to 4x worst case wastage.
+			// The best case winning is 8x regardless of block size.
+			//
+			// Objects obtained from tiny allocator must not be freed explicitly.
+			// So when an object will be freed explicitly, we ensure that
+			// its size >= maxTinySize.
+			//
+			// SetFinalizer has a special case for objects potentially coming
+			// from the tiny allocator; in such a case it allows setting finalizers
+			// for an inner byte of a memory block.
+			//
+			// The main targets of tiny allocator are small strings and
+			// standalone escaping variables. On a json benchmark
+			// the allocator reduces number of allocations by ~12% and
+			// reduces heap size by ~20%.
+			off := c.tinyoffset
+			// Align tiny pointer for required (conservative) alignment.
+			if size&7 == 0 {
+				off = round(off, 8)
+			} else if size&3 == 0 {
+				off = round(off, 4)
+			} else if size&1 == 0 {
+				off = round(off, 2)
+			}
+			if off+size <= maxTinySize && c.tiny != 0 {
+				// The object fits into existing tiny block.
+				x = unsafe.Pointer(c.tiny + off)
+				c.tinyoffset = off + size
+				c.local_tinyallocs++
+				mp.mallocing = 0
+				releasem(mp)
+				if incallback {
+					entersyscall(0)
+				}
+				return x
+			}
+			// Allocate a new maxTinySize block.
+			span := c.alloc[tinySizeClass]
+			v := nextFreeFast(span)
+			if v == 0 {
+				v, _, shouldhelpgc = c.nextFree(tinySizeClass)
+			}
+			x = unsafe.Pointer(v)
+			(*[2]uint64)(x)[0] = 0
+			(*[2]uint64)(x)[1] = 0
+			// See if we need to replace the existing tiny block with the new one
+			// based on amount of remaining free space.
+			if size < c.tinyoffset || c.tiny == 0 {
+				c.tiny = uintptr(x)
+				c.tinyoffset = size
+			}
+			size = maxTinySize
+		} else {
+			var sizeclass uint8
+			if size <= smallSizeMax-8 {
+				sizeclass = size_to_class8[(size+smallSizeDiv-1)/smallSizeDiv]
+			} else {
+				sizeclass = size_to_class128[(size-smallSizeMax+largeSizeDiv-1)/largeSizeDiv]
+			}
+			size = uintptr(class_to_size[sizeclass])
+			span := c.alloc[sizeclass]
+			v := nextFreeFast(span)
+			if v == 0 {
+				v, span, shouldhelpgc = c.nextFree(sizeclass)
+			}
+			x = unsafe.Pointer(v)
+			if needzero && span.needzero != 0 {
+				memclrNoHeapPointers(unsafe.Pointer(v), size)
+			}
+		}
+	} else {
+		var s *mspan
+		shouldhelpgc = true
+		systemstack(func() {
+			s = largeAlloc(size, needzero)
+		})
+		s.freeindex = 1
+		s.allocCount = 1
+		x = unsafe.Pointer(s.base())
+		size = s.elemsize
+	}
+
+	var scanSize uintptr
+	if noscan {
+		heapBitsSetTypeNoScan(uintptr(x))
+	} else {
+		heapBitsSetType(uintptr(x), size, dataSize, typ)
+		if dataSize > typ.size {
+			// Array allocation. If there are any
+			// pointers, GC has to scan to the last
+			// element.
+			if typ.ptrdata != 0 {
+				scanSize = dataSize - typ.size + typ.ptrdata
+			}
+		} else {
+			scanSize = typ.ptrdata
+		}
+		c.local_scan += scanSize
+	}
+
+	// Ensure that the stores above that initialize x to
+	// type-safe memory and set the heap bits occur before
+	// the caller can make x observable to the garbage
+	// collector. Otherwise, on weakly ordered machines,
+	// the garbage collector could follow a pointer to x,
+	// but see uninitialized memory or stale heap bits.
+	publicationBarrier()
+
+	// Allocate black during GC.
+	// All slots hold nil so no scanning is needed.
+	// This may be racing with GC so do it atomically if there can be
+	// a race marking the bit.
+	if gcphase != _GCoff {
+		gcmarknewobject(uintptr(x), size, scanSize)
+	}
+
+	if raceenabled {
+		racemalloc(x, size)
+	}
+
+	if msanenabled {
+		msanmalloc(x, size)
+	}
+
+	mp.mallocing = 0
+	releasem(mp)
+
+	if debug.allocfreetrace != 0 {
+		tracealloc(x, size, typ)
+	}
+
+	if rate := MemProfileRate; rate > 0 {
+		if size < uintptr(rate) && int32(size) < c.next_sample {
+			c.next_sample -= int32(size)
+		} else {
+			mp := acquirem()
+			profilealloc(mp, x, size)
+			releasem(mp)
+		}
+	}
+
+	if assistG != nil {
+		// Account for internal fragmentation in the assist
+		// debt now that we know it.
+		assistG.gcAssistBytes -= int64(size - dataSize)
+	}
+
+	if shouldhelpgc && gcShouldStart(false) {
+		gcStart(gcBackgroundMode, false)
+	}
+
+	if getg().preempt {
+		checkPreempt()
+	}
+
+	if incallback {
+		entersyscall(0)
+	}
+
+	return x
+}
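
The tiny allocator above bump-allocates within a 16-byte block, first rounding its offset to the natural alignment implied by the request size (8, 4, or 2 bytes). A small self-contained sketch of that packing policy (names are hypothetical; the block size mirrors maxTinySize):

package main

import "fmt"

const tinyBlockSize = 16 // mirrors maxTinySize

// tinyOffset returns the aligned offset at which a request of the
// given size would land in the current tiny block, and whether it
// still fits.
func tinyOffset(off, size uintptr) (uintptr, bool) {
	switch {
	case size&7 == 0:
		off = (off + 7) &^ 7
	case size&3 == 0:
		off = (off + 3) &^ 3
	case size&1 == 0:
		off = (off + 1) &^ 1
	}
	return off, off+size <= tinyBlockSize
}

func main() {
	var off uintptr
	for _, size := range []uintptr{1, 2, 4, 8} {
		aligned, fits := tinyOffset(off, size)
		fmt.Printf("size %d -> offset %d (fits %v)\n", size, aligned, fits)
		if fits {
			off = aligned + size
		}
	}
}
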
+
+func largeAlloc(size uintptr, needzero bool) *mspan {
+	// print("largeAlloc size=", size, "\n")
+
+	if size+_PageSize < size {
+		throw("out of memory")
+	}
+	npages := size >> _PageShift
+	if size&_PageMask != 0 {
+		npages++
+	}
+
+	// Deduct credit for this span allocation and sweep if
+	// necessary. mHeap_Alloc will also sweep npages, so this only
+	// pays the debt down to npage pages.
+	deductSweepCredit(npages*_PageSize, npages)
+
+	s := mheap_.alloc(npages, 0, true, needzero)
+	if s == nil {
+		throw("out of memory")
+	}
+	s.limit = s.base() + size
+	heapBitsForSpan(s.base()).initSpan(s)
+	return s
+}
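
largeAlloc converts the byte size into whole pages, rounding up and refusing a size so large that size+_PageSize would wrap around. A tiny sketch of the same computation, assuming the runtime's 8 KiB pages:

package main

import "fmt"

const pageShift = 13            // assumed, like _PageShift
const pageSize = 1 << pageShift // _PageSize
const pageMask = pageSize - 1   // _PageMask

// pagesFor returns the number of pages needed for size bytes, or
// false if the rounding itself would overflow.
func pagesFor(size uintptr) (uintptr, bool) {
	if size+pageSize < size {
		return 0, false
	}
	npages := size >> pageShift
	if size&pageMask != 0 {
		npages++
	}
	return npages, true
}

func main() {
	for _, sz := range []uintptr{1, pageSize, pageSize + 1, 3 * pageSize} {
		n, ok := pagesFor(sz)
		fmt.Println(sz, "->", n, ok)
	}
}
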
+
+// implementation of new builtin
+// compiler (both frontend and SSA backend) knows the signature
+// of this function
+func newobject(typ *_type) unsafe.Pointer {
+	return mallocgc(typ.size, typ, true)
+}
+
+//go:linkname reflect_unsafe_New reflect.unsafe_New
+func reflect_unsafe_New(typ *_type) unsafe.Pointer {
+	return newobject(typ)
+}
+
+// newarray allocates an array of n elements of type typ.
+func newarray(typ *_type, n int) unsafe.Pointer {
+	if n < 0 || uintptr(n) > maxSliceCap(typ.size) {
+		panic(plainError("runtime: allocation size out of range"))
+	}
+	return mallocgc(typ.size*uintptr(n), typ, true)
+}
+
+//go:linkname reflect_unsafe_NewArray reflect.unsafe_NewArray
+func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer {
+	return newarray(typ, n)
+}
+
+func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
+	mp.mcache.next_sample = nextSample()
+	mProf_Malloc(x, size)
+}
+
+// nextSample returns the next sampling point for heap profiling.
+// It produces a random variable with a geometric distribution and
+// mean MemProfileRate. This is done by generating a uniformly
+// distributed random number and applying the cumulative distribution
+// function for an exponential.
+func nextSample() int32 {
+	if GOOS == "plan9" {
+		// Plan 9 doesn't support floating point in note handler.
+		if g := getg(); g == g.m.gsignal {
+			return nextSampleNoFP()
+		}
+	}
+
+	period := MemProfileRate
+
+	// make nextSample not overflow. Maximum possible step is
+	// -ln(1/(1<<kRandomBitCount)) * period, approximately 20 * period.
+	switch {
+	case period > 0x7000000:
+		period = 0x7000000
+	case period == 0:
+		return 0
+	}
+
+	// Let m be the sample rate,
+	// the probability distribution function is m*exp(-mx), so the CDF is
+	// p = 1 - exp(-mx), so
+	// q = 1 - p == exp(-mx)
+	// log_e(q) = -mx
+	// -log_e(q)/m = x
+	// x = -log_e(q) * period
+	// x = log_2(q) * (-log_e(2)) * period    ; Using log_2 for efficiency
+	const randomBitCount = 26
+	q := fastrand()%(1<<randomBitCount) + 1
+	qlog := fastlog2(float64(q)) - randomBitCount
+	if qlog > 0 {
+		qlog = 0
+	}
+	const minusLog2 = -0.6931471805599453 // -ln(2)
+	return int32(qlog*(minusLog2*float64(period))) + 1
+}
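
nextSample inverts the exponential CDF with a base-2 logarithm to draw the distance to the next profiled allocation with mean MemProfileRate. A standalone model of the same formula, using math.Log2 and math/rand instead of the runtime's fastlog2 and fastrand (this checks the math; it is not the runtime's code path):

package main

import (
	"fmt"
	"math"
	"math/rand"
)

// nextSample returns a sample distance with an approximately geometric
// distribution and mean period, using the x = log2(q) * (-ln 2) * period
// identity from the comment above.
func nextSample(period int) int32 {
	const randomBitCount = 26
	q := rand.Intn(1<<randomBitCount) + 1
	qlog := math.Log2(float64(q)) - randomBitCount
	if qlog > 0 {
		qlog = 0
	}
	const minusLog2 = -0.6931471805599453 // -ln(2)
	return int32(qlog*(minusLog2*float64(period))) + 1
}

func main() {
	const period = 512 * 1024 // the default MemProfileRate
	var sum float64
	const n = 100000
	for i := 0; i < n; i++ {
		sum += float64(nextSample(period))
	}
	fmt.Printf("mean sample distance ~ %.0f (target %d)\n", sum/n, period)
}
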
+
+// nextSampleNoFP is similar to nextSample, but uses older,
+// simpler code to avoid floating point.
+func nextSampleNoFP() int32 {
+	// Set first allocation sample size.
+	rate := MemProfileRate
+	if rate > 0x3fffffff { // make 2*rate not overflow
+		rate = 0x3fffffff
+	}
+	if rate != 0 {
+		return int32(int(fastrand()) % (2 * rate))
+	}
+	return 0
+}
+
+type persistentAlloc struct {
+	base unsafe.Pointer
+	off  uintptr
+}
+
+var globalAlloc struct {
+	mutex
+	persistentAlloc
+}
+
+// Wrapper around sysAlloc that can allocate small chunks.
+// There is no associated free operation.
+// Intended for things like function/type/debug-related persistent data.
+// If align is 0, uses default align (currently 8).
+// The returned memory will be zeroed.
+//
+// Consider marking persistentalloc'd types go:notinheap.
+func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer {
+	var p unsafe.Pointer
+	systemstack(func() {
+		p = persistentalloc1(size, align, sysStat)
+	})
+	return p
+}
+
+// Must run on system stack because stack growth can (re)invoke it.
+// See issue 9174.
+//go:systemstack
+func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer {
+	const (
+		chunk    = 256 << 10
+		maxBlock = 64 << 10 // VM reservation granularity is 64K on windows
+	)
+
+	if size == 0 {
+		throw("persistentalloc: size == 0")
+	}
+	if align != 0 {
+		if align&(align-1) != 0 {
+			throw("persistentalloc: align is not a power of 2")
+		}
+		if align > _PageSize {
+			throw("persistentalloc: align is too large")
+		}
+	} else {
+		align = 8
+	}
+
+	if size >= maxBlock {
+		return sysAlloc(size, sysStat)
+	}
+
+	mp := acquirem()
+	var persistent *persistentAlloc
+	if mp != nil && mp.p != 0 {
+		persistent = &mp.p.ptr().palloc
+	} else {
+		lock(&globalAlloc.mutex)
+		persistent = &globalAlloc.persistentAlloc
+	}
+	persistent.off = round(persistent.off, align)
+	if persistent.off+size > chunk || persistent.base == nil {
+		persistent.base = sysAlloc(chunk, &memstats.other_sys)
+		if persistent.base == nil {
+			if persistent == &globalAlloc.persistentAlloc {
+				unlock(&globalAlloc.mutex)
+			}
+			throw("runtime: cannot allocate memory")
+		}
+		persistent.off = 0
+	}
+	p := add(persistent.base, persistent.off)
+	persistent.off += size
+	releasem(mp)
+	if persistent == &globalAlloc.persistentAlloc {
+		unlock(&globalAlloc.mutex)
+	}
+
+	if sysStat != &memstats.other_sys {
+		mSysStatInc(sysStat, size)
+		mSysStatDec(&memstats.other_sys, size)
+	}
+	return p
+}
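
persistentalloc1 is a bump allocator over 256 KB chunks: it rounds the offset up to the requested alignment, grabs a fresh chunk when the current one cannot hold the request, and never frees. A user-space model of that policy, using plain byte slices where the runtime calls sysAlloc (names are illustrative):

package main

import "fmt"

const chunkSize = 256 << 10 // the same chunk size as persistentalloc1

// bumpAlloc carves aligned, never-freed pieces out of successively
// allocated chunks.
type bumpAlloc struct {
	chunk []byte
	off   int
}

func roundUp(n, a int) int { return (n + a - 1) &^ (a - 1) }

func (b *bumpAlloc) alloc(size, align int) []byte {
	b.off = roundUp(b.off, align)
	if b.chunk == nil || b.off+size > len(b.chunk) {
		b.chunk = make([]byte, chunkSize) // the runtime uses sysAlloc here
		b.off = 0
	}
	p := b.chunk[b.off : b.off+size]
	b.off += size
	return p
}

func main() {
	var b bumpAlloc
	a := b.alloc(24, 8)
	c := b.alloc(100, 16) // the offset is rounded from 24 up to 32 first
	fmt.Println(len(a), len(c), b.off) // 24 100 132
}
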
diff --git a/libgo/go/runtime/mbarrier.go b/libgo/go/runtime/mbarrier.go
new file mode 100644
index 0000000..3a463c8
--- /dev/null
+++ b/libgo/go/runtime/mbarrier.go
@@ -0,0 +1,418 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Garbage collector: write barriers.
+//
+// For the concurrent garbage collector, the Go compiler implements
+// updates to pointer-valued fields that may be in heap objects by
+// emitting calls to write barriers. This file contains the actual write barrier
+// implementation, gcmarkwb_m, and the various wrappers called by the
+// compiler to implement pointer assignment, slice assignment,
+// typed memmove, and so on.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// For gccgo, use go:linkname to rename compiler-called functions to
+// themselves, so that the compiler will export them.
+//
+//go:linkname writebarrierptr runtime.writebarrierptr
+//go:linkname typedmemmove runtime.typedmemmove
+//go:linkname typedslicecopy runtime.typedslicecopy
+
+// gcmarkwb_m is the mark-phase write barrier, the only barrier we have.
+// The rest of this file exists only to make calls to this function.
+//
+// This is a hybrid barrier that combines a Yuasa-style deletion
+// barrier—which shades the object whose reference is being
+// overwritten—with Dijkstra insertion barrier—which shades the object
+// whose reference is being written. The insertion part of the barrier
+// is necessary while the calling goroutine's stack is grey. In
+// pseudocode, the barrier is:
+//
+//     writePointer(slot, ptr):
+//         shade(*slot)
+//         if current stack is grey:
+//             shade(ptr)
+//         *slot = ptr
+//
+// slot is the destination in Go code.
+// ptr is the value that goes into the slot in Go code.
+//
+// Shade indicates that it has seen a white pointer by adding the referent
+// to wbuf as well as marking it.
+//
+// The two shades and the condition work together to prevent a mutator
+// from hiding an object from the garbage collector:
+//
+// 1. shade(*slot) prevents a mutator from hiding an object by moving
+// the sole pointer to it from the heap to its stack. If it attempts
+// to unlink an object from the heap, this will shade it.
+//
+// 2. shade(ptr) prevents a mutator from hiding an object by moving
+// the sole pointer to it from its stack into a black object in the
+// heap. If it attempts to install the pointer into a black object,
+// this will shade it.
+//
+// 3. Once a goroutine's stack is black, the shade(ptr) becomes
+// unnecessary. shade(ptr) prevents hiding an object by moving it from
+// the stack to the heap, but this requires first having a pointer
+// hidden on the stack. Immediately after a stack is scanned, it only
+// points to shaded objects, so it's not hiding anything, and the
+// shade(*slot) prevents it from hiding any other pointers on its
+// stack.
+//
+// For a detailed description of this barrier and proof of
+// correctness, see https://github.com/golang/proposal/blob/master/design/17503-eliminate-rescan.md
+//
+//
+//
+// Dealing with memory ordering:
+//
+// Both the Yuasa and Dijkstra barriers can be made conditional on the
+// color of the object containing the slot. We chose not to make these
+// conditional because the cost of ensuring that the object holding
+// the slot doesn't concurrently change color without the mutator
+// noticing seems prohibitive.
+//
+// Consider the following example where the mutator writes into
+// a slot and then loads the slot's mark bit while the GC thread
+// writes to the slot's mark bit and then as part of scanning reads
+// the slot.
+//
+// Initially both [slot] and [slotmark] are 0 (nil)
+// Mutator thread          GC thread
+// st [slot], ptr          st [slotmark], 1
+//
+// ld r1, [slotmark]       ld r2, [slot]
+//
+// Without an expensive memory barrier between the st and the ld, the final
+// result on most HW (including 386/amd64) can be r1==r2==0. This is a classic
+// example of what can happen when loads are allowed to be reordered with older
+// stores (avoiding such reorderings lies at the heart of the classic
+// Peterson/Dekker algorithms for mutual exclusion). Rather than require memory
+// barriers, which will slow down both the mutator and the GC, we always grey
+// the ptr object regardless of the slot's color.
+//
+// Another place where we intentionally omit memory barriers is when
+// accessing mheap_.arena_used to check if a pointer points into the
+// heap. On relaxed memory machines, it's possible for a mutator to
+// extend the size of the heap by updating arena_used, allocate an
+// object from this new region, and publish a pointer to that object,
+// but for tracing running on another processor to observe the pointer
+// but use the old value of arena_used. In this case, tracing will not
+// mark the object, even though it's reachable. However, the mutator
+// is guaranteed to execute a write barrier when it publishes the
+// pointer, so it will take care of marking the object. A general
+// consequence of this is that the garbage collector may cache the
+// value of mheap_.arena_used. (See issue #9984.)
+//
+//
+// Stack writes:
+//
+// The compiler omits write barriers for writes to the current frame,
+// but if a stack pointer has been passed down the call stack, the
+// compiler will generate a write barrier for writes through that
+// pointer (because it doesn't know it's not a heap pointer).
+//
+// One might be tempted to ignore the write barrier if slot points
+// into the stack. Don't do it! Mark termination only re-scans
+// frames that have potentially been active since the concurrent scan,
+// so it depends on write barriers to track changes to pointers in
+// stack frames that have not been active.
+//
+//
+// Global writes:
+//
+// The Go garbage collector requires write barriers when heap pointers
+// are stored in globals. Many garbage collectors ignore writes to
+// globals and instead pick up global -> heap pointers during
+// termination. This increases pause time, so we instead rely on write
+// barriers for writes to globals so that we don't have to rescan
+// globals during mark termination.
+//
+//
+// Publication ordering:
+//
+// The write barrier is *pre-publication*, meaning that the write
+// barrier happens prior to the *slot = ptr write that may make ptr
+// reachable by some goroutine that currently cannot reach it.
+//
+//
+//go:nowritebarrierrec
+//go:systemstack
+func gcmarkwb_m(slot *uintptr, ptr uintptr) {
+	if writeBarrier.needed {
+		// Note: This turns bad pointer writes into bad
+		// pointer reads, which could be confusing. We avoid
+		// reading from obviously bad pointers, which should
+		// take care of the vast majority of these. We could
+		// patch this up in the signal handler, or use XCHG to
+		// combine the read and the write. Checking inheap is
+		// insufficient since we need to track changes to
+		// roots outside the heap.
+		if slot1 := uintptr(unsafe.Pointer(slot)); slot1 >= minPhysPageSize {
+			if optr := *slot; optr != 0 {
+				shade(optr)
+			}
+		}
+		// TODO: Make this conditional on the caller's stack color.
+		if ptr != 0 && inheap(ptr) {
+			shade(ptr)
+		}
+	}
+}
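
The pseudocode above is easier to internalize with a toy, single-threaded model of the hybrid barrier: a slice as the grey work list and a map as the mark set. This is purely illustrative and shares nothing with the runtime's real data structures:

package main

import "fmt"

// object is a toy heap object with a single pointer field.
type object struct {
	name string
	ptr  *object
}

var greyQueue []*object
var marked = map[*object]bool{}

// shade greys an object the first time it is seen, as shade does for
// white objects in the runtime.
func shade(o *object) {
	if o != nil && !marked[o] {
		marked[o] = true
		greyQueue = append(greyQueue, o)
	}
}

// writePointer is the hybrid barrier from the pseudocode above: shade
// the old value, and, while the stack is grey, the new value too.
func writePointer(slot **object, ptr *object, stackIsGrey bool) {
	shade(*slot) // Yuasa-style deletion barrier
	if stackIsGrey {
		shade(ptr) // Dijkstra-style insertion barrier
	}
	*slot = ptr
}

func main() {
	a := &object{name: "a"}
	b := &object{name: "b"}
	holder := &object{name: "holder", ptr: a}
	writePointer(&holder.ptr, b, true)
	for _, o := range greyQueue {
		fmt.Println("shaded:", o.name) // a (old value), then b (new value)
	}
}
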
+
+// writebarrierptr_prewrite1 invokes a write barrier for *dst = src
+// prior to the write happening.
+//
+// Write barrier calls must not happen during critical GC and scheduler
+// related operations. In particular there are times when the GC assumes
+// that the world is stopped but scheduler related code is still being
+// executed, dealing with syscalls, dealing with putting gs on runnable
+// queues and so forth. This code cannot execute write barriers because
+// the GC might drop them on the floor. Stopping the world involves removing
+// the p associated with an m. We use the fact that m.p == nil to indicate
+// that we are in one of these critical sections and throw if the write is of
+// a pointer to a heap object.
+//go:nosplit
+func writebarrierptr_prewrite1(dst *uintptr, src uintptr) {
+	mp := acquirem()
+	if mp.inwb || mp.dying > 0 {
+		releasem(mp)
+		return
+	}
+	systemstack(func() {
+		if mp.p == 0 && memstats.enablegc && !mp.inwb && inheap(src) {
+			throw("writebarrierptr_prewrite1 called with mp.p == nil")
+		}
+		mp.inwb = true
+		gcmarkwb_m(dst, src)
+	})
+	mp.inwb = false
+	releasem(mp)
+}
+
+// NOTE: Really dst *unsafe.Pointer, src unsafe.Pointer,
+// but if we do that, Go inserts a write barrier on *dst = src.
+//go:nosplit
+func writebarrierptr(dst *uintptr, src uintptr) {
+	if writeBarrier.cgo {
+		cgoCheckWriteBarrier(dst, src)
+	}
+	if !writeBarrier.needed {
+		*dst = src
+		return
+	}
+	if src != 0 && src < minPhysPageSize {
+		systemstack(func() {
+			print("runtime: writebarrierptr *", dst, " = ", hex(src), "\n")
+			throw("bad pointer in write barrier")
+		})
+	}
+	writebarrierptr_prewrite1(dst, src)
+	*dst = src
+}
+
+// writebarrierptr_prewrite is like writebarrierptr, but the store
+// will be performed by the caller after this call. The caller must
+// not allow preemption between this call and the write.
+//
+//go:nosplit
+func writebarrierptr_prewrite(dst *uintptr, src uintptr) {
+	if writeBarrier.cgo {
+		cgoCheckWriteBarrier(dst, src)
+	}
+	if !writeBarrier.needed {
+		return
+	}
+	if src != 0 && src < minPhysPageSize {
+		systemstack(func() { throw("bad pointer in write barrier") })
+	}
+	writebarrierptr_prewrite1(dst, src)
+}
+
+// typedmemmove copies a value of type t to dst from src.
+//go:nosplit
+func typedmemmove(typ *_type, dst, src unsafe.Pointer) {
+	if typ.kind&kindNoPointers == 0 {
+		bulkBarrierPreWrite(uintptr(dst), uintptr(src), typ.size)
+	}
+	// There's a race here: if some other goroutine can write to
+	// src, it may change some pointer in src after we've
+	// performed the write barrier but before we perform the
+	// memory copy. This is safe because the write performed by that
+	// other goroutine must also be accompanied by a write
+	// barrier, so at worst we've unnecessarily greyed the old
+	// pointer that was in src.
+	memmove(dst, src, typ.size)
+	if writeBarrier.cgo {
+		cgoCheckMemmove(typ, dst, src, 0, typ.size)
+	}
+}
+
+//go:linkname reflect_typedmemmove reflect.typedmemmove
+func reflect_typedmemmove(typ *_type, dst, src unsafe.Pointer) {
+	if raceenabled {
+		raceWriteObjectPC(typ, dst, getcallerpc(unsafe.Pointer(&typ)), funcPC(reflect_typedmemmove))
+		raceReadObjectPC(typ, src, getcallerpc(unsafe.Pointer(&typ)), funcPC(reflect_typedmemmove))
+	}
+	if msanenabled {
+		msanwrite(dst, typ.size)
+		msanread(src, typ.size)
+	}
+	typedmemmove(typ, dst, src)
+}
+
+// typedmemmovepartial is like typedmemmove but assumes that
+// dst and src point off bytes into the value and only copies size bytes.
+//go:linkname reflect_typedmemmovepartial reflect.typedmemmovepartial
+func reflect_typedmemmovepartial(typ *_type, dst, src unsafe.Pointer, off, size uintptr) {
+	if writeBarrier.needed && typ.kind&kindNoPointers == 0 && size >= sys.PtrSize {
+		// Pointer-align start address for bulk barrier.
+		adst, asrc, asize := dst, src, size
+		if frag := -off & (sys.PtrSize - 1); frag != 0 {
+			adst = add(dst, frag)
+			asrc = add(src, frag)
+			asize -= frag
+		}
+		bulkBarrierPreWrite(uintptr(adst), uintptr(asrc), asize&^(sys.PtrSize-1))
+	}
+
+	memmove(dst, src, size)
+	if writeBarrier.cgo {
+		cgoCheckMemmove(typ, dst, src, off, size)
+	}
+}
+
+//go:nosplit
+func typedslicecopy(typ *_type, dst, src slice) int {
+	// TODO(rsc): If typedslicecopy becomes faster than calling
+	// typedmemmove repeatedly, consider using it in growslice.
+	n := dst.len
+	if n > src.len {
+		n = src.len
+	}
+	if n == 0 {
+		return 0
+	}
+	dstp := dst.array
+	srcp := src.array
+
+	if raceenabled {
+		callerpc := getcallerpc(unsafe.Pointer(&typ))
+		pc := funcPC(slicecopy)
+		racewriterangepc(dstp, uintptr(n)*typ.size, callerpc, pc)
+		racereadrangepc(srcp, uintptr(n)*typ.size, callerpc, pc)
+	}
+	if msanenabled {
+		msanwrite(dstp, uintptr(n)*typ.size)
+		msanread(srcp, uintptr(n)*typ.size)
+	}
+
+	if writeBarrier.cgo {
+		cgoCheckSliceCopy(typ, dst, src, n)
+	}
+
+	// Note: No point in checking typ.kind&kindNoPointers here:
+	// compiler only emits calls to typedslicecopy for types with pointers,
+	// and growslice and reflect_typedslicecopy check for pointers
+	// before calling typedslicecopy.
+	if !writeBarrier.needed {
+		memmove(dstp, srcp, uintptr(n)*typ.size)
+		return n
+	}
+
+	systemstack(func() {
+		if uintptr(srcp) < uintptr(dstp) && uintptr(srcp)+uintptr(n)*typ.size > uintptr(dstp) {
+			// Overlap with src before dst.
+			// Copy backward, being careful not to move dstp/srcp
+			// out of the array they point into.
+			dstp = add(dstp, uintptr(n-1)*typ.size)
+			srcp = add(srcp, uintptr(n-1)*typ.size)
+			i := 0
+			for {
+				typedmemmove(typ, dstp, srcp)
+				if i++; i >= n {
+					break
+				}
+				dstp = add(dstp, -typ.size)
+				srcp = add(srcp, -typ.size)
+			}
+		} else {
+			// Copy forward, being careful not to move dstp/srcp
+			// out of the array they point into.
+			i := 0
+			for {
+				typedmemmove(typ, dstp, srcp)
+				if i++; i >= n {
+					break
+				}
+				dstp = add(dstp, typ.size)
+				srcp = add(srcp, typ.size)
+			}
+		}
+	})
+	return n
+}
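
The direction test in typedslicecopy is what makes overlapping ranges safe: when src lies before dst and the ranges overlap, copying forward would overwrite elements before they are read, so the loop runs backward. The same decision on ordinary int slices (a sketch; memmove already handles plain memory, the point is the loop shape):

package main

import (
	"fmt"
	"unsafe"
)

// overlapCopy copies n ints from src to dst, choosing the direction
// the way typedslicecopy does.
func overlapCopy(dst, src []int, n int) {
	if n == 0 {
		return
	}
	dp := uintptr(unsafe.Pointer(&dst[0]))
	sp := uintptr(unsafe.Pointer(&src[0]))
	size := unsafe.Sizeof(dst[0])
	if sp < dp && sp+uintptr(n)*size > dp {
		for i := n - 1; i >= 0; i-- { // overlap with src before dst: copy backward
			dst[i] = src[i]
		}
		return
	}
	for i := 0; i < n; i++ { // otherwise copy forward
		dst[i] = src[i]
	}
}

func main() {
	buf := []int{1, 2, 3, 4, 5}
	// Shift right by one inside the same array: src (buf[0:4]) lies
	// before dst (buf[1:5]), so the copy must run backward.
	overlapCopy(buf[1:], buf[:4], 4)
	fmt.Println(buf) // [1 1 2 3 4]
}
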
+
+//go:linkname reflect_typedslicecopy reflect.typedslicecopy
+func reflect_typedslicecopy(elemType *_type, dst, src slice) int {
+	if elemType.kind&kindNoPointers != 0 {
+		n := dst.len
+		if n > src.len {
+			n = src.len
+		}
+		if n == 0 {
+			return 0
+		}
+
+		size := uintptr(n) * elemType.size
+		if raceenabled {
+			callerpc := getcallerpc(unsafe.Pointer(&elemType))
+			pc := funcPC(reflect_typedslicecopy)
+			racewriterangepc(dst.array, size, callerpc, pc)
+			racereadrangepc(src.array, size, callerpc, pc)
+		}
+		if msanenabled {
+			msanwrite(dst.array, size)
+			msanread(src.array, size)
+		}
+
+		memmove(dst.array, src.array, size)
+		return n
+	}
+	return typedslicecopy(elemType, dst, src)
+}
+
+// typedmemclr clears the typed memory at ptr with type typ. The
+// memory at ptr must already be initialized (and hence in type-safe
+// state). If the memory is being initialized for the first time, see
+// memclrNoHeapPointers.
+//
+// If the caller knows that typ has pointers, it can alternatively
+// call memclrHasPointers.
+//
+//go:nosplit
+func typedmemclr(typ *_type, ptr unsafe.Pointer) {
+	if typ.kind&kindNoPointers == 0 {
+		bulkBarrierPreWrite(uintptr(ptr), 0, typ.size)
+	}
+	memclrNoHeapPointers(ptr, typ.size)
+}
+
+// memclrHasPointers clears n bytes of typed memory starting at ptr.
+// The caller must ensure that the type of the object at ptr has
+// pointers, usually by checking typ.kind&kindNoPointers. However, ptr
+// does not have to point to the start of the allocation.
+//
+//go:nosplit
+func memclrHasPointers(ptr unsafe.Pointer, n uintptr) {
+	bulkBarrierPreWrite(uintptr(ptr), 0, n)
+	memclrNoHeapPointers(ptr, n)
+}
diff --git a/libgo/go/runtime/mbitmap.go b/libgo/go/runtime/mbitmap.go
new file mode 100644
index 0000000..2b00493
--- /dev/null
+++ b/libgo/go/runtime/mbitmap.go
@@ -0,0 +1,1874 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Garbage collector: type and heap bitmaps.
+//
+// Stack, data, and bss bitmaps
+//
+// Stack frames and global variables in the data and bss sections are described
+// by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
+// to be visited during GC. The bits in each byte are consumed starting with
+// the low bit: 1<<0, 1<<1, and so on.
+//
+// Heap bitmap
+//
+// The allocated heap comes from a subset of the memory in the range [start, used),
+// where start == mheap_.arena_start and used == mheap_.arena_used.
+// The heap bitmap comprises 2 bits for each pointer-sized word in that range,
+// stored in bytes indexed backward in memory from start.
+// That is, the byte at address start-1 holds the 2-bit entries for the four words
+// start through start+3*ptrSize, the byte at start-2 holds the entries for
+// start+4*ptrSize through start+7*ptrSize, and so on.
+//
+// In each 2-bit entry, the lower bit holds the same information as in the 1-bit
+// bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
+// The meaning of the high bit depends on the position of the word being described
+// in its allocated object. In all words *except* the second word, the
+// high bit indicates that the object is still being described. In
+// these words, if a bit pair with a high bit 0 is encountered, the
+// low bit can also be assumed to be 0, and the object description is
+// over. This 00 is called the ``dead'' encoding: it signals that the
+// rest of the words in the object are uninteresting to the garbage
+// collector.
+//
+// In the second word, the high bit is the GC ``checkmarked'' bit (see below).
+//
+// The 2-bit entries are split when written into the byte, so that the top half
+// of the byte contains 4 high bits and the bottom half contains 4 low (pointer)
+// bits.
+// This form allows a copy from the 1-bit to the 4-bit form to keep the
+// pointer bits contiguous, instead of having to space them out.
+//
+// The code makes use of the fact that the zero value for a heap bitmap
+// has no live pointer bit set and is (depending on position), not used,
+// not checkmarked, and is the dead encoding.
+// These properties must be preserved when modifying the encoding.
+//
+// Checkmarks
+//
+// In a concurrent garbage collector, one worries about failing to mark
+// a live object due to mutations without write barriers or bugs in the
+// collector implementation. As a sanity check, the GC has a 'checkmark'
+// mode that retraverses the object graph with the world stopped, to make
+// sure that everything that should be marked is marked.
+// In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
+// for the second word of the object holds the checkmark bit.
+// When not in checkmark mode, this bit is set to 1.
+//
+// The smallest possible allocation is 8 bytes. On a 32-bit machine, that
+// means every allocated object has two words, so there is room for the
+// checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
+// just one word, so the second bit pair is not available for encoding the
+// checkmark. However, because non-pointer allocations are combined
+// into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
+// must be a pointer, so the type bit in the first word is not actually needed.
+// It is still used in general, except in checkmark the type bit is repurposed
+// as the checkmark bit and then reinitialized (to 1) as the type bit when
+// finished.
+//
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+const (
+	bitPointer = 1 << 0
+	bitScan    = 1 << 4
+
+	heapBitsShift   = 1                     // shift offset between successive bitPointer or bitScan entries
+	heapBitmapScale = sys.PtrSize * (8 / 2) // number of data bytes described by one heap bitmap byte
+
+	// all scan/pointer bits in a byte
+	bitScanAll    = bitScan | bitScan<<heapBitsShift | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
+	bitPointerAll = bitPointer | bitPointer<<heapBitsShift | bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift)
+)
+
+// addb returns the byte pointer p+n.
+//go:nowritebarrier
+//go:nosplit
+func addb(p *byte, n uintptr) *byte {
+	// Note: wrote out full expression instead of calling add(p, n)
+	// to reduce the number of temporaries generated by the
+	// compiler for this trivial expression during inlining.
+	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + n))
+}
+
+// subtractb returns the byte pointer p-n.
+// subtractb is typically used when traversing the pointer tables referred to by hbits
+// which are arranged in reverse order.
+//go:nowritebarrier
+//go:nosplit
+func subtractb(p *byte, n uintptr) *byte {
+	// Note: wrote out full expression instead of calling add(p, -n)
+	// to reduce the number of temporaries generated by the
+	// compiler for this trivial expression during inlining.
+	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - n))
+}
+
+// add1 returns the byte pointer p+1.
+//go:nowritebarrier
+//go:nosplit
+func add1(p *byte) *byte {
+	// Note: wrote out full expression instead of calling addb(p, 1)
+	// to reduce the number of temporaries generated by the
+	// compiler for this trivial expression during inlining.
+	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + 1))
+}
+
+// subtract1 returns the byte pointer p-1.
+// subtract1 is typically used when traversing the pointer tables referred to by hbits
+// which are arranged in reverse order.
+//go:nowritebarrier
+//
+// nosplit because it is used during write barriers and must not be preempted.
+//go:nosplit
+func subtract1(p *byte) *byte {
+	// Note: wrote out full expression instead of calling subtractb(p, 1)
+	// to reduce the number of temporaries generated by the
+	// compiler for this trivial expression during inlining.
+	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
+}
+
+// mapBits is called each time arena_used is extended.
+// It maps any additional bitmap memory needed for the new arena memory.
+// It must be called with the expected new value of arena_used,
+// *before* h.arena_used has been updated.
+// Waiting to update arena_used until after the memory has been mapped
+// avoids faults when other threads try to access the bitmap immediately
+// after observing the change to arena_used.
+//
+//go:nowritebarrier
+func (h *mheap) mapBits(arena_used uintptr) {
+	// Caller has added extra mappings to the arena.
+	// Add extra mappings of bitmap words as needed.
+	// We allocate extra bitmap pieces in chunks of bitmapChunk.
+	const bitmapChunk = 8192
+
+	n := (arena_used - mheap_.arena_start) / heapBitmapScale
+	n = round(n, bitmapChunk)
+	n = round(n, physPageSize)
+	if h.bitmap_mapped >= n {
+		return
+	}
+
+	sysMap(unsafe.Pointer(h.bitmap-n), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys)
+	h.bitmap_mapped = n
+}
+
+// heapBits provides access to the bitmap bits for a single heap word.
+// The methods on heapBits take value receivers so that the compiler
+// can more easily inline calls to those methods and registerize the
+// struct fields independently.
+type heapBits struct {
+	bitp  *uint8
+	shift uint32
+}
+
+// markBits provides access to the mark bit for an object in the heap.
+// bytep points to the byte holding the mark bit.
+// mask is a byte with a single bit set that can be &ed with *bytep
+// to see if the bit has been set.
+// *m.byte&m.mask != 0 indicates the mark bit is set.
+// index can be used along with span information to generate
+// the address of the object in the heap.
+// We maintain one set of mark bits for allocation and one for
+// marking purposes.
+type markBits struct {
+	bytep *uint8
+	mask  uint8
+	index uintptr
+}
+
+//go:nosplit
+func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
+	whichByte := allocBitIndex / 8
+	whichBit := allocBitIndex % 8
+	bytePtr := addb(s.allocBits, whichByte)
+	return markBits{bytePtr, uint8(1 << whichBit), allocBitIndex}
+}
+
+// refillAllocCache takes 8 bytes of s.allocBits starting at whichByte
+// and negates them so that ctz (count trailing zeros) instructions
+// can be used. It then places these 8 bytes into the cached 64 bit
+// s.allocCache.
+func (s *mspan) refillAllocCache(whichByte uintptr) {
+	bytes := (*[8]uint8)(unsafe.Pointer(addb(s.allocBits, whichByte)))
+	aCache := uint64(0)
+	aCache |= uint64(bytes[0])
+	aCache |= uint64(bytes[1]) << (1 * 8)
+	aCache |= uint64(bytes[2]) << (2 * 8)
+	aCache |= uint64(bytes[3]) << (3 * 8)
+	aCache |= uint64(bytes[4]) << (4 * 8)
+	aCache |= uint64(bytes[5]) << (5 * 8)
+	aCache |= uint64(bytes[6]) << (6 * 8)
+	aCache |= uint64(bytes[7]) << (7 * 8)
+	s.allocCache = ^aCache
+}
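
The point of the negation in refillAllocCache is that allocBits marks allocated objects, while the scan wants to find free ones: after inverting, a set bit means "free" and a single count-trailing-zeros finds the next candidate. A tiny demonstration with math/bits:

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// In allocBits, bit i set means object i is allocated.
	// Here objects 0, 1 and 3 are allocated; 2 and 4..63 are free.
	allocBits := uint64(0b1011)

	// The runtime caches the negation, so a set bit means free and
	// TrailingZeros64 jumps straight to the first free object.
	allocCache := ^allocBits

	fmt.Println(bits.TrailingZeros64(allocCache)) // 2
}
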
+
+// nextFreeIndex returns the index of the next free object in s at
+// or after s.freeindex.
+// There are hardware instructions that can be used to make this
+// faster if profiling warrants it.
+func (s *mspan) nextFreeIndex() uintptr {
+	sfreeindex := s.freeindex
+	snelems := s.nelems
+	if sfreeindex == snelems {
+		return sfreeindex
+	}
+	if sfreeindex > snelems {
+		throw("s.freeindex > s.nelems")
+	}
+
+	aCache := s.allocCache
+
+	bitIndex := sys.Ctz64(aCache)
+	for bitIndex == 64 {
+		// Move index to start of next cached bits.
+		sfreeindex = (sfreeindex + 64) &^ (64 - 1)
+		if sfreeindex >= snelems {
+			s.freeindex = snelems
+			return snelems
+		}
+		whichByte := sfreeindex / 8
+		// Refill s.allocCache with the next 64 alloc bits.
+		s.refillAllocCache(whichByte)
+		aCache = s.allocCache
+		bitIndex = sys.Ctz64(aCache)
+		// nothing available in cached bits
+		// grab the next 8 bytes and try again.
+	}
+	result := sfreeindex + uintptr(bitIndex)
+	if result >= snelems {
+		s.freeindex = snelems
+		return snelems
+	}
+
+	s.allocCache >>= (bitIndex + 1)
+	sfreeindex = result + 1
+
+	if sfreeindex%64 == 0 && sfreeindex != snelems {
+		// We just incremented s.freeindex so it isn't 0.
+		// As each 1 in s.allocCache was encountered and used for allocation
+		// it was shifted away. At this point s.allocCache contains all 0s.
+		// Refill s.allocCache so that it corresponds
+		// to the bits at s.allocBits starting at s.freeindex.
+		whichByte := sfreeindex / 8
+		s.refillAllocCache(whichByte)
+	}
+	s.freeindex = sfreeindex
+	return result
+}
+
+// isFree returns whether the index'th object in s is unallocated.
+func (s *mspan) isFree(index uintptr) bool {
+	if index < s.freeindex {
+		return false
+	}
+	whichByte := index / 8
+	whichBit := index % 8
+	byteVal := *addb(s.allocBits, whichByte)
+	return byteVal&uint8(1<<whichBit) == 0
+}
+
+func (s *mspan) objIndex(p uintptr) uintptr {
+	byteOffset := p - s.base()
+	if byteOffset == 0 {
+		return 0
+	}
+	if s.baseMask != 0 {
+		// s.baseMask is not 0, elemsize is a power of two, so shift by s.divShift
+		return byteOffset >> s.divShift
+	}
+	return uintptr(((uint64(byteOffset) >> s.divShift) * uint64(s.divMul)) >> s.divShift2)
+}
+
+func markBitsForAddr(p uintptr) markBits {
+	s := spanOf(p)
+	objIndex := s.objIndex(p)
+	return s.markBitsForIndex(objIndex)
+}
+
+func (s *mspan) markBitsForIndex(objIndex uintptr) markBits {
+	whichByte := objIndex / 8
+	bitMask := uint8(1 << (objIndex % 8)) // low 3 bits hold the bit index
+	bytePtr := addb(s.gcmarkBits, whichByte)
+	return markBits{bytePtr, bitMask, objIndex}
+}
+
+func (s *mspan) markBitsForBase() markBits {
+	return markBits{s.gcmarkBits, uint8(1), 0}
+}
+
+// isMarked reports whether mark bit m is set.
+func (m markBits) isMarked() bool {
+	return *m.bytep&m.mask != 0
+}
+
+// setMarked sets the marked bit in the markbits, atomically. Some compilers
+// are not able to inline the atomic.Or8 function, so if it appears as a hot spot consider
+// inlining it manually.
+func (m markBits) setMarked() {
+	// Might be racing with other updates, so use atomic update always.
+	// We used to be clever here and use a non-atomic update in certain
+	// cases, but it's not worth the risk.
+	atomic.Or8(m.bytep, m.mask)
+}
+
+// setMarkedNonAtomic sets the marked bit in the markbits, non-atomically.
+func (m markBits) setMarkedNonAtomic() {
+	*m.bytep |= m.mask
+}
+
+// clearMarked clears the marked bit in the markbits, atomically.
+func (m markBits) clearMarked() {
+	// Might be racing with other updates, so use atomic update always.
+	// We used to be clever here and use a non-atomic update in certain
+	// cases, but it's not worth the risk.
+	atomic.And8(m.bytep, ^m.mask)
+}
+
+// clearMarkedNonAtomic clears the marked bit non-atomically.
+func (m markBits) clearMarkedNonAtomic() {
+	*m.bytep ^= m.mask
+}
+
+// markBitsForSpan returns the markBits for the span base address base.
+func markBitsForSpan(base uintptr) (mbits markBits) {
+	if base < mheap_.arena_start || base >= mheap_.arena_used {
+		throw("markBitsForSpan: base out of range")
+	}
+	mbits = markBitsForAddr(base)
+	if mbits.mask != 1 {
+		throw("markBitsForSpan: unaligned start")
+	}
+	return mbits
+}
+
+// advance advances the markBits to the next object in the span.
+func (m *markBits) advance() {
+	if m.mask == 1<<7 {
+		m.bytep = (*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(m.bytep)) + 1))
+		m.mask = 1
+	} else {
+		m.mask = m.mask << 1
+	}
+	m.index++
+}
+
+// heapBitsForAddr returns the heapBits for the address addr.
+// The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used).
+//
+// nosplit because it is used during write barriers and must not be preempted.
+//go:nosplit
+func heapBitsForAddr(addr uintptr) heapBits {
+	// 2 bits per word, 4 pairs per byte, and a mask is hard coded.
+	off := (addr - mheap_.arena_start) / sys.PtrSize
+	return heapBits{(*uint8)(unsafe.Pointer(mheap_.bitmap - off/4 - 1)), uint32(off & 3)}
+}
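
heapBitsForAddr reduces to index arithmetic: each bitmap byte describes four pointer-sized words, and bytes are laid out downward from the arena start. A numbers-only sketch of that mapping, assuming 8-byte words (no real memory is touched):

package main

import "fmt"

const ptrSize = 8 // assumed 64-bit words

// heapBitsIndex returns how many bytes below the bitmap base the entry
// for addr lives, and which 2-bit pair (0..3) inside that byte it is.
// arenaStart stands in for mheap_.arena_start.
func heapBitsIndex(addr, arenaStart uintptr) (byteBelow uintptr, shift uint32) {
	off := (addr - arenaStart) / ptrSize // word index within the arena
	return off/4 + 1, uint32(off & 3)    // four word entries per bitmap byte
}

func main() {
	const arenaStart = 0x10000000
	for _, addr := range []uintptr{arenaStart, arenaStart + 8, arenaStart + 32} {
		b, s := heapBitsIndex(addr, arenaStart)
		fmt.Printf("addr %#x -> bitmap[-%d], pair %d\n", addr, b, s)
	}
}
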
+
+// heapBitsForSpan returns the heapBits for the span base address base.
+func heapBitsForSpan(base uintptr) (hbits heapBits) {
+	if base < mheap_.arena_start || base >= mheap_.arena_used {
+		throw("heapBitsForSpan: base out of range")
+	}
+	return heapBitsForAddr(base)
+}
+
+// heapBitsForObject returns the base address for the heap object
+// containing the address p, the heapBits for base,
+// the object's span, and of the index of the object in s.
+// If p does not point into a heap object,
+// return base == 0
+// otherwise return the base of the object.
+//
+// For gccgo, the forStack parameter is true if the value came from the stack.
+// The stack is collected conservatively and may contain invalid pointers.
+//
+// refBase and refOff optionally give the base address of the object
+// in which the pointer p was found and the byte offset at which it
+// was found. These are used for error reporting.
+func heapBitsForObject(p, refBase, refOff uintptr, forStack bool) (base uintptr, hbits heapBits, s *mspan, objIndex uintptr) {
+	arenaStart := mheap_.arena_start
+	if p < arenaStart || p >= mheap_.arena_used {
+		return
+	}
+	off := p - arenaStart
+	idx := off >> _PageShift
+	// p points into the heap, but possibly to the middle of an object.
+	// Consult the span table to find the block beginning.
+	s = mheap_.spans[idx]
+	if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse {
+		if s == nil || s.state == _MSpanStack || forStack {
+			// If s is nil, the virtual address has never been part of the heap.
+			// This pointer may be to some mmap'd region, so we allow it.
+			// Pointers into stacks are also ok, the runtime manages these explicitly.
+			return
+		}
+
+		// The following ensures that we are rigorous about what data
+		// structures hold valid pointers.
+		if debug.invalidptr != 0 {
+			// Typically this indicates an incorrect use
+			// of unsafe or cgo to store a bad pointer in
+			// the Go heap. It may also indicate a runtime
+			// bug.
+			//
+			// TODO(austin): We could be more aggressive
+			// and detect pointers to unallocated objects
+			// in allocated spans.
+			printlock()
+			print("runtime: pointer ", hex(p))
+			if s.state != mSpanInUse {
+				print(" to unallocated span")
+			} else {
+				print(" to unused region of span")
+			}
+			print(" idx=", hex(idx), " span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", s.state, "\n")
+			if refBase != 0 {
+				print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
+				gcDumpObject("object", refBase, refOff)
+			}
+			throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
+		}
+		return
+	}
+
+	if forStack {
+		// A span can be entered in mheap_.spans, and be set
+		// to mSpanInUse, before it is fully initialized.
+		// All we need in practice is allocBits and gcmarkBits,
+		// so make sure they are set.
+		if s.allocBits == nil || s.gcmarkBits == nil {
+			return
+		}
+	}
+
+	// If this span holds object of a power of 2 size, just mask off the bits to
+	// the interior of the object. Otherwise use the size to get the base.
+	if s.baseMask != 0 {
+		// optimize for power of 2 sized objects.
+		base = s.base()
+		base = base + (p-base)&uintptr(s.baseMask)
+		objIndex = (base - s.base()) >> s.divShift
+		// base = p & s.baseMask is faster for small spans,
+		// but doesn't work for large spans.
+		// Overall, it's faster to use the more general computation above.
+	} else {
+		base = s.base()
+		if p-base >= s.elemsize {
+			// n := (p - base) / s.elemsize, using division by multiplication
+			objIndex = uintptr(p-base) >> s.divShift * uintptr(s.divMul) >> s.divShift2
+			base += objIndex * s.elemsize
+		}
+	}
+	// Now that we know the actual base, compute heapBits to return to caller.
+	hbits = heapBitsForAddr(base)
+	return
+}
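
For size classes that are not powers of two, the base computation above is an integer division in disguise: find the object index from the byte offset, then scale back up by the element size. A plain-division sketch of that mapping (the runtime replaces the division with the precomputed divMul/divShift trick):

package main

import "fmt"

// objectBase maps an interior pointer p inside a span to the base of
// the object containing it, given the span base and element size.
func objectBase(p, spanBase, elemSize uintptr) (base, objIndex uintptr) {
	objIndex = (p - spanBase) / elemSize
	return spanBase + objIndex*elemSize, objIndex
}

func main() {
	const spanBase = 0x20000000
	const elemSize = 48 // a non-power-of-two size class
	p := uintptr(spanBase + 2*elemSize + 17) // points into the third object
	base, idx := objectBase(p, spanBase, elemSize)
	fmt.Printf("p=%#x -> object %d at %#x\n", p, idx, base)
}
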
+
+// prefetch the bits.
+func (h heapBits) prefetch() {
+	prefetchnta(uintptr(unsafe.Pointer((h.bitp))))
+}
+
+// next returns the heapBits describing the next pointer-sized word in memory.
+// That is, if h describes address p, h.next() describes p+ptrSize.
+// Note that next does not modify h. The caller must record the result.
+//
+// nosplit because it is used during write barriers and must not be preempted.
+//go:nosplit
+func (h heapBits) next() heapBits {
+	if h.shift < 3*heapBitsShift {
+		return heapBits{h.bitp, h.shift + heapBitsShift}
+	}
+	return heapBits{subtract1(h.bitp), 0}
+}
+
+// forward returns the heapBits describing n pointer-sized words ahead of h in memory.
+// That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
+// h.forward(1) is equivalent to h.next(), just slower.
+// Note that forward does not modify h. The caller must record the result.
+func (h heapBits) forward(n uintptr) heapBits {
+	n += uintptr(h.shift) / heapBitsShift
+	return heapBits{subtractb(h.bitp, n/4), uint32(n%4) * heapBitsShift}
+}
+
+// bits returns the heap bits for the current word.
+// The caller can test morePointers and isPointer by &-ing with bitScan and bitPointer.
+// The result includes in its higher bits the bits for subsequent words
+// described by the same bitmap byte.
+func (h heapBits) bits() uint32 {
+	// The (shift & 31) eliminates a test and conditional branch
+	// from the generated code.
+	return uint32(*h.bitp) >> (h.shift & 31)
+}
+
+// morePointers reports whether the bitmap keeps describing this object past
+// this word; if it returns false, this word and all remaining words in the
+// object are scalars (the dead encoding).
+// h must not describe the second word of the object.
+func (h heapBits) morePointers() bool {
+	return h.bits()&bitScan != 0
+}
+
+// isPointer reports whether the heap bits describe a pointer word.
+//
+// nosplit because it is used during write barriers and must not be preempted.
+//go:nosplit
+func (h heapBits) isPointer() bool {
+	return h.bits()&bitPointer != 0
+}
+
+// hasPointers reports whether the given object has any pointers.
+// It must be told how large the object at h is for efficiency.
+// h must describe the initial word of the object.
+func (h heapBits) hasPointers(size uintptr) bool {
+	if size == sys.PtrSize { // 1-word objects are always pointers
+		return true
+	}
+	return (*h.bitp>>h.shift)&bitScan != 0
+}
+
+// isCheckmarked reports whether the heap bits have the checkmarked bit set.
+// It must be told how large the object at h is, because the encoding of the
+// checkmark bit varies by size.
+// h must describe the initial word of the object.
+func (h heapBits) isCheckmarked(size uintptr) bool {
+	if size == sys.PtrSize {
+		return (*h.bitp>>h.shift)&bitPointer != 0
+	}
+	// All multiword objects are 2-word aligned,
+	// so we know that the initial word's 2-bit pair
+	// and the second word's 2-bit pair are in the
+	// same heap bitmap byte, *h.bitp.
+	return (*h.bitp>>(heapBitsShift+h.shift))&bitScan != 0
+}
+
+// setCheckmarked sets the checkmarked bit.
+// It must be told how large the object at h is, because the encoding of the
+// checkmark bit varies by size.
+// h must describe the initial word of the object.
+func (h heapBits) setCheckmarked(size uintptr) {
+	if size == sys.PtrSize {
+		atomic.Or8(h.bitp, bitPointer<<h.shift)
+		return
+	}
+	atomic.Or8(h.bitp, bitScan<<(heapBitsShift+h.shift))
+}
+
+// bulkBarrierPreWrite executes writebarrierptr_prewrite1
+// for every pointer slot in the memory range [src, src+size),
+// using pointer/scalar information from [dst, dst+size).
+// This executes the write barriers necessary before a memmove.
+// src, dst, and size must be pointer-aligned.
+// The range [dst, dst+size) must lie within a single object.
+//
+// As a special case, src == 0 indicates that this is being used for a
+// memclr. bulkBarrierPreWrite will pass 0 for the src of each write
+// barrier.
+//
+// Callers should call bulkBarrierPreWrite immediately before
+// calling memmove(dst, src, size). This function is marked nosplit
+// to avoid being preempted; the GC must not stop the goroutine
+// between the memmove and the execution of the barriers.
+// The caller is also responsible for cgo pointer checks if this
+// may be writing Go pointers into non-Go memory.
+//
+// The pointer bitmap is not maintained for allocations containing
+// no pointers at all; any caller of bulkBarrierPreWrite must first
+// make sure the underlying allocation contains pointers, usually
+// by checking typ.kind&kindNoPointers.
+//
+//go:nosplit
+func bulkBarrierPreWrite(dst, src, size uintptr) {
+	if (dst|src|size)&(sys.PtrSize-1) != 0 {
+		throw("bulkBarrierPreWrite: unaligned arguments")
+	}
+	if !writeBarrier.needed {
+		return
+	}
+	if !inheap(dst) {
+		// If dst is a global, use the data or BSS bitmaps to
+		// execute write barriers.
+		roots := gcRoots
+		for roots != nil {
+			for i := 0; i < roots.count; i++ {
+				pr := roots.roots[i]
+				addr := uintptr(pr.decl)
+				if addr <= dst && dst < addr+pr.size {
+					if dst < addr+pr.ptrdata {
+						bulkBarrierBitmap(dst, src, size, dst-addr, pr.gcdata)
+					}
+					return
+				}
+			}
+			roots = roots.next
+		}
+		return
+	}
+
+	h := heapBitsForAddr(dst)
+	if src == 0 {
+		for i := uintptr(0); i < size; i += sys.PtrSize {
+			if h.isPointer() {
+				dstx := (*uintptr)(unsafe.Pointer(dst + i))
+				writebarrierptr_prewrite1(dstx, 0)
+			}
+			h = h.next()
+		}
+	} else {
+		for i := uintptr(0); i < size; i += sys.PtrSize {
+			if h.isPointer() {
+				dstx := (*uintptr)(unsafe.Pointer(dst + i))
+				srcx := (*uintptr)(unsafe.Pointer(src + i))
+				writebarrierptr_prewrite1(dstx, *srcx)
+			}
+			h = h.next()
+		}
+	}
+}
+
+// bulkBarrierBitmap executes write barriers for copying from [src,
+// src+size) to [dst, dst+size) using a 1-bit pointer bitmap. src is
+// assumed to start maskOffset bytes into the data covered by the
+// bitmap in bits (which may not be a multiple of 8).
+//
+// This is used by bulkBarrierPreWrite for writes to data and BSS.
+//
+//go:nosplit
+func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) {
+	word := maskOffset / sys.PtrSize
+	bits = addb(bits, word/8)
+	mask := uint8(1) << (word % 8)
+
+	for i := uintptr(0); i < size; i += sys.PtrSize {
+		if mask == 0 {
+			bits = addb(bits, 1)
+			if *bits == 0 {
+				// Skip 8 words.
+				i += 7 * sys.PtrSize
+				continue
+			}
+			mask = 1
+		}
+		if *bits&mask != 0 {
+			dstx := (*uintptr)(unsafe.Pointer(dst + i))
+			if src == 0 {
+				writebarrierptr_prewrite1(dstx, 0)
+			} else {
+				srcx := (*uintptr)(unsafe.Pointer(src + i))
+				writebarrierptr_prewrite1(dstx, *srcx)
+			}
+		}
+		mask <<= 1
+	}
+}
+
+// typeBitsBulkBarrier executes writebarrierptr_prewrite for every
+// pointer that would be copied from [src, src+size) to [dst,
+// dst+size) by a memmove using the type bitmap to locate those
+// pointer slots.
+//
+// The type typ must correspond exactly to [src, src+size) and [dst, dst+size).
+// dst, src, and size must be pointer-aligned.
+// The type typ must have a plain bitmap, not a GC program.
+// The only use of this function is in channel sends, and the
+// 64 kB channel element limit takes care of this for us.
+//
+// Must not be preempted because it typically runs right before memmove,
+// and the GC must observe them as an atomic action.
+//
+//go:nosplit
+func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) {
+	if typ == nil {
+		throw("runtime: typeBitsBulkBarrier without type")
+	}
+	if typ.size != size {
+		println("runtime: typeBitsBulkBarrier with type ", *typ.string, " of size ", typ.size, " but memory size", size)
+		throw("runtime: invalid typeBitsBulkBarrier")
+	}
+	if typ.kind&kindGCProg != 0 {
+		println("runtime: typeBitsBulkBarrier with type ", *typ.string, " with GC prog")
+		throw("runtime: invalid typeBitsBulkBarrier")
+	}
+	if !writeBarrier.needed {
+		return
+	}
+	ptrmask := typ.gcdata
+	var bits uint32
+	for i := uintptr(0); i < typ.ptrdata; i += sys.PtrSize {
+		if i&(sys.PtrSize*8-1) == 0 {
+			bits = uint32(*ptrmask)
+			ptrmask = addb(ptrmask, 1)
+		} else {
+			bits = bits >> 1
+		}
+		if bits&1 != 0 {
+			dstx := (*uintptr)(unsafe.Pointer(dst + i))
+			srcx := (*uintptr)(unsafe.Pointer(src + i))
+			writebarrierptr_prewrite(dstx, *srcx)
+		}
+	}
+}
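
The ptrmask walk above consumes one bit per pointer-sized word and loads a fresh mask byte every eight words. A self-contained sketch that decodes an illustrative 1-bit mask and reports which word offsets hold pointers (not a real runtime type descriptor):

package main

import "fmt"

const ptrSize = 8 // assumed 64-bit words

// pointerOffsets decodes a 1-bit ptrmask covering ptrdata bytes and
// returns the byte offsets of the words it marks as pointers.
func pointerOffsets(ptrmask []byte, ptrdata uintptr) []uintptr {
	var offsets []uintptr
	var bits uint32
	for i := uintptr(0); i < ptrdata; i += ptrSize {
		if i%(ptrSize*8) == 0 {
			bits = uint32(ptrmask[i/(ptrSize*8)]) // next mask byte, 8 words per byte
		} else {
			bits >>= 1
		}
		if bits&1 != 0 {
			offsets = append(offsets, i)
		}
	}
	return offsets
}

func main() {
	// Mask 0b00000101: words 0 and 2 hold pointers, the rest do not.
	fmt.Println(pointerOffsets([]byte{0b00000101}, 4*ptrSize)) // [0 16]
}
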
+
+// The methods operating on spans all require that h has been returned
+// by heapBitsForSpan and that size, n, total are the span layout description
+// returned by the mspan's layout method.
+// If total > size*n, it means that there is extra leftover memory in the span,
+// usually due to rounding.
+//
+// TODO(rsc): Perhaps introduce a different heapBitsSpan type.
+
+// initSpan initializes the heap bitmap for a span.
+// It clears all checkmark bits.
+// If this is a span of pointer-sized objects, it initializes all
+// words to pointer/scan.
+// Otherwise, it initializes all words to scalar/dead.
+func (h heapBits) initSpan(s *mspan) {
+	size, n, total := s.layout()
+
+	// Init the markbit structures
+	s.freeindex = 0
+	s.allocCache = ^uint64(0) // all 1s indicating all free.
+	s.nelems = n
+	s.allocBits = nil
+	s.gcmarkBits = nil
+	s.gcmarkBits = newMarkBits(s.nelems)
+	s.allocBits = newAllocBits(s.nelems)
+
+	// Clear bits corresponding to objects.
+	if total%heapBitmapScale != 0 {
+		throw("initSpan: unaligned length")
+	}
+	nbyte := total / heapBitmapScale
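+	// Each heap bitmap byte holds the 2-bit entries for 4 heap words, so
+	// heapBitmapScale is 4*sys.PtrSize bytes of heap per bitmap byte.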
+	if sys.PtrSize == 8 && size == sys.PtrSize {
+		end := h.bitp
+		bitp := subtractb(end, nbyte-1)
+		for {
+			*bitp = bitPointerAll | bitScanAll
+			if bitp == end {
+				break
+			}
+			bitp = add1(bitp)
+		}
+		return
+	}
+	memclrNoHeapPointers(unsafe.Pointer(subtractb(h.bitp, nbyte-1)), nbyte)
+}
+
+// initCheckmarkSpan initializes a span for being checkmarked.
+// It clears the checkmark bits, which are set to 1 in normal operation.
+func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
+	// On 32-bit systems sys.PtrSize == 8 is a compile-time constant false, so the compiler eliminates this code entirely.
+	if sys.PtrSize == 8 && size == sys.PtrSize {
+		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
+		// Only possible on 64-bit system, since minimum size is 8.
+		// Must clear type bit (checkmark bit) of every word.
+		// The type bit is the lower of every two-bit pair.
+		bitp := h.bitp
+		for i := uintptr(0); i < n; i += 4 {
+			*bitp &^= bitPointerAll
+			bitp = subtract1(bitp)
+		}
+		return
+	}
+	for i := uintptr(0); i < n; i++ {
+		*h.bitp &^= bitScan << (heapBitsShift + h.shift)
+		h = h.forward(size / sys.PtrSize)
+	}
+}
+
+// clearCheckmarkSpan undoes all the checkmarking in a span.
+// The actual checkmark bits are ignored, so the only work to do
+// is to fix the pointer bits. (Pointer bits are ignored by scanobject
+// but consulted by typedmemmove.)
+func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
+	// On 32-bit systems sys.PtrSize == 8 is a compile-time constant false, so the compiler eliminates this code entirely.
+	if sys.PtrSize == 8 && size == sys.PtrSize {
+		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
+		// Only possible on 64-bit system, since minimum size is 8.
+		// Must clear type bit (checkmark bit) of every word.
+		// The type bit is the lower of every two-bit pair.
+		bitp := h.bitp
+		for i := uintptr(0); i < n; i += 4 {
+			*bitp |= bitPointerAll
+			bitp = subtract1(bitp)
+		}
+	}
+}
+
+// oneBitCount is indexed by byte and produces the
+// number of 1 bits in that byte. For example 128 has 1 bit set
+// and oneBitCount[128] holds 1.
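+// Likewise 0xa5 (10100101) has four bits set, so oneBitCount[0xa5] == 4.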
+var oneBitCount = [256]uint8{
+	0, 1, 1, 2, 1, 2, 2, 3,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	5, 6, 6, 7, 6, 7, 7, 8}
+
+// countFree runs through the mark bits in a span and counts the number of free objects
+// in the span.
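+// An object is free when its gcmarkBits bit is 0, so the result is
+// nelems minus the number of set mark bits.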
+// TODO:(rlh) Use popcount intrinsic.
+func (s *mspan) countFree() int {
+	count := 0
+	maxIndex := s.nelems / 8
+	for i := uintptr(0); i < maxIndex; i++ {
+		mrkBits := *addb(s.gcmarkBits, i)
+		count += int(oneBitCount[mrkBits])
+	}
+	if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 {
+		mrkBits := *addb(s.gcmarkBits, maxIndex)
+		mask := uint8((1 << bitsInLastByte) - 1)
+		bits := mrkBits & mask
+		count += int(oneBitCount[bits])
+	}
+	return int(s.nelems) - count
+}
+
+// heapBitsSetType records that the new allocation [x, x+size)
+// holds in [x, x+dataSize) one or more values of type typ.
+// (The number of values is given by dataSize / typ.size.)
+// If dataSize < size, the fragment [x+dataSize, x+size) is
+// recorded as non-pointer data.
+// It is known that the type has pointers somewhere;
+// malloc does not call heapBitsSetType when there are no pointers,
+// because all free objects are marked as noscan during
+// heapBitsSweepSpan.
+//
+// There can only be one allocation from a given span active at a time,
+// and the bitmap for a span always falls on byte boundaries,
+// so there are no write-write races for access to the heap bitmap.
+// Hence, heapBitsSetType can access the bitmap without atomics.
+//
+// There can be read-write races between heapBitsSetType and things
+// that read the heap bitmap like scanobject. However, since
+// heapBitsSetType is only used for objects that have not yet been
+// made reachable, readers will ignore bits being modified by this
+// function. This does mean this function cannot transiently modify
+// bits that belong to neighboring objects. Also, on weakly-ordered
+// machines, callers must execute a store/store (publication) barrier
+// between calling this function and making the object reachable.
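+//
+// For example (hypothetical sizes), storing three values of a 16-byte type
+// in a 64-byte allocation gives dataSize == 48 and size == 64, and the
+// fragment [x+48, x+64) is recorded as non-pointer data.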
+func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
+	const doubleCheck = false // slow but helpful; enable to test modifications to this code
+
+	// dataSize is always size rounded up to the next malloc size class,
+	// except in the case of allocating a defer block, in which case
+	// size is sizeof(_defer{}) (at least 6 words) and dataSize may be
+	// arbitrarily larger.
+	//
+	// The checks for size == sys.PtrSize and size == 2*sys.PtrSize can therefore
+	// assume that dataSize == size without checking it explicitly.
+
+	if sys.PtrSize == 8 && size == sys.PtrSize {
+		// It's one word and it has pointers, so it must be a pointer.
+		// Since all allocated one-word objects are pointers
+		// (non-pointers are aggregated into tinySize allocations),
+		// initSpan sets the pointer bits for us. Nothing to do here.
+		if doubleCheck {
+			h := heapBitsForAddr(x)
+			if !h.isPointer() {
+				throw("heapBitsSetType: pointer bit missing")
+			}
+			if !h.morePointers() {
+				throw("heapBitsSetType: scan bit missing")
+			}
+		}
+		return
+	}
+
+	h := heapBitsForAddr(x)
+	ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below)
+
+	// Heap bitmap bits for 2-word object are only 4 bits,
+	// so also shared with objects next to it.
+	// This is called out as a special case primarily for 32-bit systems,
+	// so that on 32-bit systems the code below can assume all objects
+	// are 4-word aligned (because they're all 16-byte aligned).
+	if size == 2*sys.PtrSize {
+		if typ.size == sys.PtrSize {
+			// We're allocating a block big enough to hold two pointers.
+			// On 64-bit, that means the actual object must be two pointers,
+			// or else we'd have used the one-pointer-sized block.
+			// On 32-bit, however, this is the 8-byte block, the smallest one.
+			// So it could be that we're allocating one pointer and this was
+			// just the smallest block available. Distinguish by checking dataSize.
+			// (In general the number of instances of typ being allocated is
+			// dataSize/typ.size.)
+			if sys.PtrSize == 4 && dataSize == sys.PtrSize {
+				// 1 pointer object. On 32-bit machines clear the bit for the
+				// unused second word.
+				*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
+				*h.bitp |= (bitPointer | bitScan) << h.shift
+			} else {
+				// 2-element slice of pointer.
+				*h.bitp |= (bitPointer | bitScan | bitPointer<<heapBitsShift) << h.shift
+			}
+			return
+		}
+		// Otherwise typ.size must be 2*sys.PtrSize,
+		// and typ.kind&kindGCProg == 0.
+		if doubleCheck {
+			if typ.size != 2*sys.PtrSize || typ.kind&kindGCProg != 0 {
+				print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, " gcprog=", typ.kind&kindGCProg != 0, "\n")
+				throw("heapBitsSetType")
+			}
+		}
+		b := uint32(*ptrmask)
+		hb := (b & 3) | bitScan
+		// bitPointer == 1, bitScan is 1 << 4, heapBitsShift is 1.
+		// 110011 is shifted left by h.shift and complemented.
+		// This clears out the bits that are about to be
+		// ored into *h.bitp in the next instructions.
+		*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
+		*h.bitp |= uint8(hb << h.shift)
+		return
+	}
+
+	// Copy from 1-bit ptrmask into 2-bit bitmap.
+	// The basic approach is to use a single uintptr as a bit buffer,
+	// alternating between reloading the buffer and writing bitmap bytes.
+	// In general, one load can supply two bitmap byte writes.
+	// This is a lot of lines of code, but it compiles into relatively few
+	// machine instructions.
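+	//
+	// The work is split into four phases, labeled below: Phase 1 handles
+	// the leading byte or half-byte, Phase 2 writes whole bytes, Phase 3
+	// writes the final byte and zeroes the remainder of the object's
+	// bitmap, and Phase 4 optionally double-checks the result.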
+
+	var (
+		// Ptrmask input.
+		p     *byte   // last ptrmask byte read
+		b     uintptr // ptrmask bits already loaded
+		nb    uintptr // number of bits in b at next read
+		endp  *byte   // final ptrmask byte to read (then repeat)
+		endnb uintptr // number of valid bits in *endp
+		pbits uintptr // alternate source of bits
+
+		// Heap bitmap output.
+		w     uintptr // words processed
+		nw    uintptr // number of words to process
+		hbitp *byte   // next heap bitmap byte to write
+		hb    uintptr // bits being prepared for *hbitp
+	)
+
+	hbitp = h.bitp
+
+	// Handle GC program. Delayed until this part of the code
+	// so that we can use the same double-checking mechanism
+	// as the 1-bit case. Nothing above could have encountered
+	// GC programs: the cases were all too small.
+	if typ.kind&kindGCProg != 0 {
+		heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4))
+		if doubleCheck {
+			// Double-check the heap bits written by GC program
+			// by running the GC program to create a 1-bit pointer mask
+			// and then jumping to the double-check code below.
+			// This doesn't catch bugs shared between the 1-bit and 4-bit
+			// GC program execution, but it does catch mistakes specific
+			// to just one of those and bugs in heapBitsSetTypeGCProg's
+			// implementation of arrays.
+			lock(&debugPtrmask.lock)
+			if debugPtrmask.data == nil {
+				debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys))
+			}
+			ptrmask = debugPtrmask.data
+			runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1)
+			goto Phase4
+		}
+		return
+	}
+
+	// Note about sizes:
+	//
+	// typ.size is the number of words in the object,
+	// and typ.ptrdata is the number of words in the prefix
+	// of the object that contains pointers. That is, the final
+	// typ.size - typ.ptrdata words contain no pointers.
+	// This allows optimization of a common pattern where
+	// an object has a small header followed by a large scalar
+	// buffer. If we know the pointers are over, we don't have
+	// to scan the buffer's heap bitmap at all.
+	// The 1-bit ptrmasks are sized to contain only bits for
+	// the typ.ptrdata prefix, zero padded out to a full byte
+	// of bitmap. This code sets nw (below) so that heap bitmap
+	// bits are only written for the typ.ptrdata prefix; if there is
+	// more room in the allocated object, the next heap bitmap
+	// entry is a 00, indicating that there are no more pointers
+	// to scan. So only the ptrmask for the ptrdata bytes is needed.
+	//
+	// Replicated copies are not as nice: if there is an array of
+	// objects with scalar tails, all but the last tail do have to
+	// be initialized, because there is no way to say "skip forward".
+	// However, because of the possibility of a repeated type with
+	// size not a multiple of 4 pointers (one heap bitmap byte),
+	// the code already must handle the last ptrmask byte specially
+	// by treating it as containing only the bits for endnb pointers,
+	// where endnb <= 4. We represent large scalar tails that must
+	// be expanded in the replication by setting endnb larger than 4.
+	// This will have the effect of reading many bits out of b,
+	// but once the real bits are shifted out, b will supply as many
+	// zero bits as we try to read, which is exactly what we need.
+
+	p = ptrmask
+	if typ.size < dataSize {
+		// Filling in bits for an array of typ.
+		// Set up for repetition of ptrmask during main loop.
+		// Note that ptrmask describes only a prefix of the array's
+		// bitmap: just the bits for one element's typ.ptrdata words.
+		// The repetition and the scalar tails are handled below.
+		const maxBits = sys.PtrSize*8 - 7
+		if typ.ptrdata/sys.PtrSize <= maxBits {
+			// Entire ptrmask fits in uintptr with room for a byte fragment.
+			// Load into pbits and never read from ptrmask again.
+			// This is especially important when the ptrmask has
+			// fewer than 8 bits in it; otherwise the reload in the middle
+			// of the Phase 2 loop would itself need to loop to gather
+			// at least 8 bits.
+
+			// Accumulate ptrmask into b.
+			// ptrmask is sized to describe only typ.ptrdata, but we record
+			// it as describing typ.size bytes, since all the high bits are zero.
+			nb = typ.ptrdata / sys.PtrSize
+			for i := uintptr(0); i < nb; i += 8 {
+				b |= uintptr(*p) << i
+				p = add1(p)
+			}
+			nb = typ.size / sys.PtrSize
+
+			// Replicate ptrmask to fill entire pbits uintptr.
+			// Doubling and truncating is fewer steps than
+			// iterating by nb each time. (nb could be 1.)
+			// Since we loaded typ.ptrdata/sys.PtrSize bits
+			// but are pretending to have typ.size/sys.PtrSize,
+			// there might be no replication necessary/possible.
+			pbits = b
+			endnb = nb
+			if nb+nb <= maxBits {
+				for endnb <= sys.PtrSize*8 {
+					pbits |= pbits << endnb
+					endnb += endnb
+				}
+				// Truncate to a multiple of original ptrmask.
+				endnb = maxBits / nb * nb
+				pbits &= 1<<endnb - 1
+				b = pbits
+				nb = endnb
+			}
+
+			// Clear p and endp as sentinel for using pbits.
+			// Checked during Phase 2 loop.
+			p = nil
+			endp = nil
+		} else {
+			// Ptrmask is larger. Read it multiple times.
+			n := (typ.ptrdata/sys.PtrSize+7)/8 - 1
+			endp = addb(ptrmask, n)
+			endnb = typ.size/sys.PtrSize - n*8
+		}
+	}
+	if p != nil {
+		b = uintptr(*p)
+		p = add1(p)
+		nb = 8
+	}
+
+	if typ.size == dataSize {
+		// Single entry: can stop once we reach the non-pointer data.
+		nw = typ.ptrdata / sys.PtrSize
+	} else {
+		// Repeated instances of typ in an array.
+		// Have to process first N-1 entries in full, but can stop
+		// once we reach the non-pointer data in the final entry.
+		nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / sys.PtrSize
+	}
+	if nw == 0 {
+		// No pointers! Caller was supposed to check.
+		println("runtime: invalid type ", *typ.string)
+		throw("heapBitsSetType: called with non-pointer type")
+		return
+	}
+	if nw < 2 {
+		// Must write at least 2 words, because the "no scan"
+		// encoding doesn't take effect until the third word.
+		nw = 2
+	}
+
+	// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==4).
+	// The leading byte is special because it contains the bits for word 1,
+	// which does not have the scan bit set.
+	// The leading half-byte is special because it's a half a byte,
+	// so we have to be careful with the bits already there.
+	switch {
+	default:
+		throw("heapBitsSetType: unexpected shift")
+
+	case h.shift == 0:
+		// Ptrmask and heap bitmap are aligned.
+		// Handle first byte of bitmap specially.
+		//
+		// The first byte we write out covers the first four
+		// words of the object. The scan/dead bit on the first
+		// word must be set to scan since there are pointers
+		// somewhere in the object. The scan/dead bit on the
+		// second word is the checkmark, so we don't set it.
+		// In all following words, we set the scan/dead
+		// appropriately to indicate that the object continues
+		// to the next 2-bit entry in the bitmap.
+		//
+		// TODO: It doesn't matter if we set the checkmark, so
+		// maybe this case isn't needed any more.
+		hb = b & bitPointerAll
+		hb |= bitScan | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
+		if w += 4; w >= nw {
+			goto Phase3
+		}
+		*hbitp = uint8(hb)
+		hbitp = subtract1(hbitp)
+		b >>= 4
+		nb -= 4
+
+	case sys.PtrSize == 8 && h.shift == 2:
+		// Ptrmask and heap bitmap are misaligned.
+		// The bits for the first two words are in a byte shared
+		// with another object, so we must be careful with the bits
+		// already there.
+		// We took care of 1-word and 2-word objects above,
+		// so this is at least a 6-word object.
+		hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
+		// This is not noscan, so set the scan bit in the
+		// first word.
+		hb |= bitScan << (2 * heapBitsShift)
+		b >>= 2
+		nb -= 2
+		// Note: no bitScan for second word because that's
+		// the checkmark.
+		*hbitp &^= uint8((bitPointer | bitScan | (bitPointer << heapBitsShift)) << (2 * heapBitsShift))
+		*hbitp |= uint8(hb)
+		hbitp = subtract1(hbitp)
+		if w += 2; w >= nw {
+			// We know that there is more data, because we handled 2-word objects above.
+			// This must be at least a 6-word object. If we're out of pointer words,
+			// mark no scan in next bitmap byte and finish.
+			hb = 0
+			w += 4
+			goto Phase3
+		}
+	}
+
+	// Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
+	// The loop computes the bits for that last write but does not execute the write;
+	// it leaves the bits in hb for processing by phase 3.
+	// To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
+	// use in the first half of the loop right now, and then we only adjust nb explicitly
+	// if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop.
+	nb -= 4
+	for {
+		// Emit bitmap byte.
+		// b has at least nb+4 bits, with one exception:
+		// if w+4 >= nw, then b has only nw-w bits,
+		// but we'll stop at the break and then truncate
+		// appropriately in Phase 3.
+		hb = b & bitPointerAll
+		hb |= bitScanAll
+		if w += 4; w >= nw {
+			break
+		}
+		*hbitp = uint8(hb)
+		hbitp = subtract1(hbitp)
+		b >>= 4
+
+		// Load more bits. b has nb right now.
+		if p != endp {
+			// Fast path: keep reading from ptrmask.
+			// nb unmodified: we just loaded 8 bits,
+			// and the next iteration will consume 8 bits,
+			// leaving us with the same nb the next time we're here.
+			if nb < 8 {
+				b |= uintptr(*p) << nb
+				p = add1(p)
+			} else {
+				// Reduce the number of bits in b.
+				// This is important if we skipped
+				// over a scalar tail, since nb could
+				// be larger than the bit width of b.
+				nb -= 8
+			}
+		} else if p == nil {
+			// Almost as fast path: track bit count and refill from pbits.
+			// For short repetitions.
+			if nb < 8 {
+				b |= pbits << nb
+				nb += endnb
+			}
+			nb -= 8 // for next iteration
+		} else {
+			// Slow path: reached end of ptrmask.
+			// Process final partial byte and rewind to start.
+			b |= uintptr(*p) << nb
+			nb += endnb
+			if nb < 8 {
+				b |= uintptr(*ptrmask) << nb
+				p = add1(ptrmask)
+			} else {
+				nb -= 8
+				p = ptrmask
+			}
+		}
+
+		// Emit bitmap byte.
+		hb = b & bitPointerAll
+		hb |= bitScanAll
+		if w += 4; w >= nw {
+			break
+		}
+		*hbitp = uint8(hb)
+		hbitp = subtract1(hbitp)
+		b >>= 4
+	}
+
+Phase3:
+	// Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries.
+	if w > nw {
+		// Counting the 4 entries in hb not yet written to memory,
+		// there are more entries than possible pointer slots.
+		// Discard the excess entries (can't be more than 3).
+		mask := uintptr(1)<<(4-(w-nw)) - 1
+		hb &= mask | mask<<4 // apply mask to both pointer bits and scan bits
+	}
+
+	// Change nw from counting possibly-pointer words to total words in allocation.
+	nw = size / sys.PtrSize
+
+	// Write whole bitmap bytes.
+	// The first is hb, the rest are zero.
+	if w <= nw {
+		*hbitp = uint8(hb)
+		hbitp = subtract1(hbitp)
+		hb = 0 // for possible final half-byte below
+		for w += 4; w <= nw; w += 4 {
+			*hbitp = 0
+			hbitp = subtract1(hbitp)
+		}
+	}
+
+	// Write final partial bitmap byte if any.
+	// We know w > nw, or else we'd still be in the loop above.
+	// It can be bigger only due to the 4 entries in hb that it counts.
+	// If w == nw+4 then there's nothing left to do: we wrote all nw entries
+	// and can discard the 4 sitting in hb.
+	// But if w == nw+2, we need to write first two in hb.
+	// The byte is shared with the next object, so be careful with
+	// existing bits.
+	if w == nw+2 {
+		*hbitp = *hbitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | uint8(hb)
+	}
+
+Phase4:
+	// Phase 4: all done, but perhaps double check.
+	if doubleCheck {
+		end := heapBitsForAddr(x + size)
+		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
+			println("ended at wrong bitmap byte for", *typ.string, "x", dataSize/typ.size)
+			print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
+			print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
+			h0 := heapBitsForAddr(x)
+			print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
+			print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n")
+			throw("bad heapBitsSetType")
+		}
+
+		// Double-check that bits to be written were written correctly.
+		// Does not check that other bits were not written, unfortunately.
+		h := heapBitsForAddr(x)
+		nptr := typ.ptrdata / sys.PtrSize
+		ndata := typ.size / sys.PtrSize
+		count := dataSize / typ.size
+		totalptr := ((count-1)*typ.size + typ.ptrdata) / sys.PtrSize
+		for i := uintptr(0); i < size/sys.PtrSize; i++ {
+			j := i % ndata
+			var have, want uint8
+			have = (*h.bitp >> h.shift) & (bitPointer | bitScan)
+			if i >= totalptr {
+				want = 0 // deadmarker
+				if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 {
+					want = bitScan
+				}
+			} else {
+				if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
+					want |= bitPointer
+				}
+				if i != 1 {
+					want |= bitScan
+				} else {
+					have &^= bitScan
+				}
+			}
+			if have != want {
+				println("mismatch writing bits for", *typ.string, "x", dataSize/typ.size)
+				print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
+				print("kindGCProg=", typ.kind&kindGCProg != 0, "\n")
+				print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
+				h0 := heapBitsForAddr(x)
+				print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
+				print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n")
+				print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
+				println("at word", i, "offset", i*sys.PtrSize, "have", have, "want", want)
+				if typ.kind&kindGCProg != 0 {
+					println("GC program:")
+					dumpGCProg(addb(typ.gcdata, 4))
+				}
+				throw("bad heapBitsSetType")
+			}
+			h = h.next()
+		}
+		if ptrmask == debugPtrmask.data {
+			unlock(&debugPtrmask.lock)
+		}
+	}
+}
+
+// heapBitsSetTypeNoScan marks x as noscan by setting the first word
+// of x in the heap bitmap to scalar/dead.
+func heapBitsSetTypeNoScan(x uintptr) {
+	h := heapBitsForAddr(uintptr(x))
+	*h.bitp &^= (bitPointer | bitScan) << h.shift
+}
+
+var debugPtrmask struct {
+	lock mutex
+	data *byte
+}
+
+// heapBitsSetTypeGCProg implements heapBitsSetType using a GC program.
+// progSize is the size of the memory described by the program.
+// elemSize is the size of the element that the GC program describes (a prefix of).
+// dataSize is the total size of the intended data, a multiple of elemSize.
+// allocSize is the total size of the allocated memory.
+//
+// GC programs are only used for large allocations.
+// heapBitsSetType requires that allocSize is a multiple of 4 words,
+// so that the relevant bitmap bytes are not shared with surrounding
+// objects.
+func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) {
+	if sys.PtrSize == 8 && allocSize%(4*sys.PtrSize) != 0 {
+		// Alignment will be wrong.
+		throw("heapBitsSetTypeGCProg: small allocation")
+	}
+	var totalBits uintptr
+	if elemSize == dataSize {
+		totalBits = runGCProg(prog, nil, h.bitp, 2)
+		if totalBits*sys.PtrSize != progSize {
+			println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize)
+			throw("heapBitsSetTypeGCProg: unexpected bit count")
+		}
+	} else {
+		count := dataSize / elemSize
+
+		// Piece together program trailer to run after prog that does:
+		//	literal(0)
+		//	repeat(1, elemSize-progSize-1) // zeros to fill element size
+		//	repeat(elemSize, count-1) // repeat that element for count
+		// This zero-pads the data remaining in the first element and then
+		// repeats that first element to fill the array.
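+		// For example (hypothetical sizes), with elemSize/ptrSize == 8,
+		// progSize/ptrSize == 3, and count == 4, the trailer below is
+		// 01 00 81 04 80 08 03 00: one literal 0 bit, that bit repeated
+		// 4 more times to pad the element, the previous 8 bits repeated
+		// 3 times, then stop.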
+		var trailer [40]byte // 3 varints (max 10 each) + some bytes
+		i := 0
+		if n := elemSize/sys.PtrSize - progSize/sys.PtrSize; n > 0 {
+			// literal(0)
+			trailer[i] = 0x01
+			i++
+			trailer[i] = 0
+			i++
+			if n > 1 {
+				// repeat(1, n-1)
+				trailer[i] = 0x81
+				i++
+				n--
+				for ; n >= 0x80; n >>= 7 {
+					trailer[i] = byte(n | 0x80)
+					i++
+				}
+				trailer[i] = byte(n)
+				i++
+			}
+		}
+		// repeat(elemSize/ptrSize, count-1)
+		trailer[i] = 0x80
+		i++
+		n := elemSize / sys.PtrSize
+		for ; n >= 0x80; n >>= 7 {
+			trailer[i] = byte(n | 0x80)
+			i++
+		}
+		trailer[i] = byte(n)
+		i++
+		n = count - 1
+		for ; n >= 0x80; n >>= 7 {
+			trailer[i] = byte(n | 0x80)
+			i++
+		}
+		trailer[i] = byte(n)
+		i++
+		trailer[i] = 0
+		i++
+
+		runGCProg(prog, &trailer[0], h.bitp, 2)
+
+		// Even though we filled in the full array just now,
+		// record that we only filled in up to the ptrdata of the
+		// last element. This will cause the code below to
+		// memclr the dead section of the final array element,
+		// so that scanobject can stop early in the final element.
+		totalBits = (elemSize*(count-1) + progSize) / sys.PtrSize
+	}
+	endProg := unsafe.Pointer(subtractb(h.bitp, (totalBits+3)/4))
+	endAlloc := unsafe.Pointer(subtractb(h.bitp, allocSize/heapBitmapScale))
+	memclrNoHeapPointers(add(endAlloc, 1), uintptr(endProg)-uintptr(endAlloc))
+}
+
+// progToPointerMask returns the 1-bit pointer mask output by the GC program prog.
+// size is the size of the region described by prog, in bytes.
+// The resulting bitvector will have no more than size/sys.PtrSize bits.
+func progToPointerMask(prog *byte, size uintptr) bitvector {
+	n := (size/sys.PtrSize + 7) / 8
+	x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1]
+	x[len(x)-1] = 0xa1 // overflow check sentinel
+	n = runGCProg(prog, nil, &x[0], 1)
+	if x[len(x)-1] != 0xa1 {
+		throw("progToPointerMask: overflow")
+	}
+	return bitvector{int32(n), &x[0]}
+}
+
+// Packed GC pointer bitmaps, aka GC programs.
+//
+// For large types containing arrays, the type information has a
+// natural repetition that can be encoded to save space in the
+// binary and in the memory representation of the type information.
+//
+// The encoding is a simple Lempel-Ziv style bytecode machine
+// with the following instructions:
+//
+//	00000000: stop
+//	0nnnnnnn: emit n bits copied from the next (n+7)/8 bytes
+//	10000000 n c: repeat the previous n bits c times; n, c are varints
+//	1nnnnnnn c: repeat the previous n bits c times; c is a varint
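+//
+// For example, a (hypothetical) block of four consecutive pointer words
+// could be encoded as 01 01 81 03 00: emit one literal 1 bit, repeat the
+// previous 1 bit 3 times, then stop, yielding the pointer bits 1111.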
+
+// runGCProg executes the GC program prog, and then trailer if non-nil,
+// writing to dst with entries of the given size.
+// If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst.
+// If size == 2, dst is the 2-bit heap bitmap, and writes move backward
+// starting at dst (because the heap bitmap does). In this case, the caller guarantees
+// that only whole bytes in dst need to be written.
+//
+// runGCProg returns the number of 1- or 2-bit entries written to memory.
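+//
+// In this file, size == 2 is used by heapBitsSetTypeGCProg and size == 1
+// by progToPointerMask and the doubleCheck path of heapBitsSetType.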
+func runGCProg(prog, trailer, dst *byte, size int) uintptr {
+	dstStart := dst
+
+	// Bits waiting to be written to memory.
+	var bits uintptr
+	var nbits uintptr
+
+	p := prog
+Run:
+	for {
+		// Flush accumulated full bytes.
+		// The rest of the loop assumes that nbits <= 7.
+		for ; nbits >= 8; nbits -= 8 {
+			if size == 1 {
+				*dst = uint8(bits)
+				dst = add1(dst)
+				bits >>= 8
+			} else {
+				v := bits&bitPointerAll | bitScanAll
+				*dst = uint8(v)
+				dst = subtract1(dst)
+				bits >>= 4
+				v = bits&bitPointerAll | bitScanAll
+				*dst = uint8(v)
+				dst = subtract1(dst)
+				bits >>= 4
+			}
+		}
+
+		// Process one instruction.
+		inst := uintptr(*p)
+		p = add1(p)
+		n := inst & 0x7F
+		if inst&0x80 == 0 {
+			// Literal bits; n == 0 means end of program.
+			if n == 0 {
+				// Program is over; continue in trailer if present.
+				if trailer != nil {
+					//println("trailer")
+					p = trailer
+					trailer = nil
+					continue
+				}
+				//println("done")
+				break Run
+			}
+			//println("lit", n, dst)
+			nbyte := n / 8
+			for i := uintptr(0); i < nbyte; i++ {
+				bits |= uintptr(*p) << nbits
+				p = add1(p)
+				if size == 1 {
+					*dst = uint8(bits)
+					dst = add1(dst)
+					bits >>= 8
+				} else {
+					v := bits&0xf | bitScanAll
+					*dst = uint8(v)
+					dst = subtract1(dst)
+					bits >>= 4
+					v = bits&0xf | bitScanAll
+					*dst = uint8(v)
+					dst = subtract1(dst)
+					bits >>= 4
+				}
+			}
+			if n %= 8; n > 0 {
+				bits |= uintptr(*p) << nbits
+				p = add1(p)
+				nbits += n
+			}
+			continue Run
+		}
+
+		// Repeat. If n == 0, it is encoded in a varint in the next bytes.
+		if n == 0 {
+			for off := uint(0); ; off += 7 {
+				x := uintptr(*p)
+				p = add1(p)
+				n |= (x & 0x7F) << off
+				if x&0x80 == 0 {
+					break
+				}
+			}
+		}
+
+		// Count is encoded in a varint in the next bytes.
+		c := uintptr(0)
+		for off := uint(0); ; off += 7 {
+			x := uintptr(*p)
+			p = add1(p)
+			c |= (x & 0x7F) << off
+			if x&0x80 == 0 {
+				break
+			}
+		}
+		c *= n // now total number of bits to copy
+
+		// If the number of bits being repeated is small, load them
+		// into a register and use that register for the entire loop
+		// instead of repeatedly reading from memory.
+		// Handling fewer than 8 bits here makes the general loop simpler.
+		// The cutoff is sys.PtrSize*8 - 7 to guarantee that when we add
+		// the pattern to a bit buffer holding at most 7 bits (a partial byte)
+		// it will not overflow.
+		src := dst
+		const maxBits = sys.PtrSize*8 - 7
+		if n <= maxBits {
+			// Start with bits in output buffer.
+			pattern := bits
+			npattern := nbits
+
+			// If we need more bits, fetch them from memory.
+			if size == 1 {
+				src = subtract1(src)
+				for npattern < n {
+					pattern <<= 8
+					pattern |= uintptr(*src)
+					src = subtract1(src)
+					npattern += 8
+				}
+			} else {
+				src = add1(src)
+				for npattern < n {
+					pattern <<= 4
+					pattern |= uintptr(*src) & 0xf
+					src = add1(src)
+					npattern += 4
+				}
+			}
+
+			// We started with the whole bit output buffer,
+			// and then we loaded bits from whole bytes.
+			// Either way, we might now have too many instead of too few.
+			// Discard the extra.
+			if npattern > n {
+				pattern >>= npattern - n
+				npattern = n
+			}
+
+			// Replicate pattern to at most maxBits.
+			if npattern == 1 {
+				// One bit being repeated.
+				// If the bit is 1, make the pattern all 1s.
+				// If the bit is 0, the pattern is already all 0s,
+				// but we can claim that the number of bits
+				// in the word is equal to the number we need (c),
+				// because right shift of bits will zero fill.
+				if pattern == 1 {
+					pattern = 1<<maxBits - 1
+					npattern = maxBits
+				} else {
+					npattern = c
+				}
+			} else {
+				b := pattern
+				nb := npattern
+				if nb+nb <= maxBits {
+					// Double pattern until the whole uintptr is filled.
+					for nb <= sys.PtrSize*8 {
+						b |= b << nb
+						nb += nb
+					}
+					// Trim away incomplete copy of original pattern in high bits.
+					// TODO(rsc): Replace with table lookup or loop on systems without divide?
+					nb = maxBits / npattern * npattern
+					b &= 1<<nb - 1
+					pattern = b
+					npattern = nb
+				}
+			}
+
+			// Add pattern to bit buffer and flush bit buffer, c/npattern times.
+			// Since pattern contains >8 bits, there will be full bytes to flush
+			// on each iteration.
+			for ; c >= npattern; c -= npattern {
+				bits |= pattern << nbits
+				nbits += npattern
+				if size == 1 {
+					for nbits >= 8 {
+						*dst = uint8(bits)
+						dst = add1(dst)
+						bits >>= 8
+						nbits -= 8
+					}
+				} else {
+					for nbits >= 4 {
+						*dst = uint8(bits&0xf | bitScanAll)
+						dst = subtract1(dst)
+						bits >>= 4
+						nbits -= 4
+					}
+				}
+			}
+
+			// Add final fragment to bit buffer.
+			if c > 0 {
+				pattern &= 1<<c - 1
+				bits |= pattern << nbits
+				nbits += c
+			}
+			continue Run
+		}
+
+		// Repeat; n too large to fit in a register.
+		// Since nbits <= 7, we know the first few bytes of repeated data
+		// are already written to memory.
+		off := n - nbits // n > nbits because n > maxBits and nbits <= 7
+		if size == 1 {
+			// Leading src fragment.
+			src = subtractb(src, (off+7)/8)
+			if frag := off & 7; frag != 0 {
+				bits |= uintptr(*src) >> (8 - frag) << nbits
+				src = add1(src)
+				nbits += frag
+				c -= frag
+			}
+			// Main loop: load one byte, write another.
+			// The bits are rotating through the bit buffer.
+			for i := c / 8; i > 0; i-- {
+				bits |= uintptr(*src) << nbits
+				src = add1(src)
+				*dst = uint8(bits)
+				dst = add1(dst)
+				bits >>= 8
+			}
+			// Final src fragment.
+			if c %= 8; c > 0 {
+				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
+				nbits += c
+			}
+		} else {
+			// Leading src fragment.
+			src = addb(src, (off+3)/4)
+			if frag := off & 3; frag != 0 {
+				bits |= (uintptr(*src) & 0xf) >> (4 - frag) << nbits
+				src = subtract1(src)
+				nbits += frag
+				c -= frag
+			}
+			// Main loop: load one byte, write another.
+			// The bits are rotating through the bit buffer.
+			for i := c / 4; i > 0; i-- {
+				bits |= (uintptr(*src) & 0xf) << nbits
+				src = subtract1(src)
+				*dst = uint8(bits&0xf | bitScanAll)
+				dst = subtract1(dst)
+				bits >>= 4
+			}
+			// Final src fragment.
+			if c %= 4; c > 0 {
+				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
+				nbits += c
+			}
+		}
+	}
+
+	// Write any final bits out, using full-byte writes, even for the final byte.
+	var totalBits uintptr
+	if size == 1 {
+		totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits
+		nbits += -nbits & 7
+		for ; nbits > 0; nbits -= 8 {
+			*dst = uint8(bits)
+			dst = add1(dst)
+			bits >>= 8
+		}
+	} else {
+		totalBits = (uintptr(unsafe.Pointer(dstStart))-uintptr(unsafe.Pointer(dst)))*4 + nbits
+		nbits += -nbits & 3
+		for ; nbits > 0; nbits -= 4 {
+			v := bits&0xf | bitScanAll
+			*dst = uint8(v)
+			dst = subtract1(dst)
+			bits >>= 4
+		}
+	}
+	return totalBits
+}
+
+func dumpGCProg(p *byte) {
+	nptr := 0
+	for {
+		x := *p
+		p = add1(p)
+		if x == 0 {
+			print("\t", nptr, " end\n")
+			break
+		}
+		if x&0x80 == 0 {
+			print("\t", nptr, " lit ", x, ":")
+			n := int(x+7) / 8
+			for i := 0; i < n; i++ {
+				print(" ", hex(*p))
+				p = add1(p)
+			}
+			print("\n")
+			nptr += int(x)
+		} else {
+			nbit := int(x &^ 0x80)
+			if nbit == 0 {
+				for nb := uint(0); ; nb += 7 {
+					x := *p
+					p = add1(p)
+					nbit |= int(x&0x7f) << nb
+					if x&0x80 == 0 {
+						break
+					}
+				}
+			}
+			count := 0
+			for nb := uint(0); ; nb += 7 {
+				x := *p
+				p = add1(p)
+				count |= int(x&0x7f) << nb
+				if x&0x80 == 0 {
+					break
+				}
+			}
+			print("\t", nptr, " repeat ", nbit, " × ", count, "\n")
+			nptr += nbit * count
+		}
+	}
+}
+
+// Testing.
+
+// gcbits returns the GC type info for x, for testing.
+// The result is the bitmap entries (0 or 1), one entry per byte.
+//go:linkname reflect_gcbits reflect.gcbits
+func reflect_gcbits(x interface{}) []byte {
+	ret := getgcmask(x)
+	typ := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem
+	nptr := typ.ptrdata / sys.PtrSize
+	for uintptr(len(ret)) > nptr && ret[len(ret)-1] == 0 {
+		ret = ret[:len(ret)-1]
+	}
+	return ret
+}
+
+// Returns GC type info for object p for testing.
+func getgcmask(ep interface{}) (mask []byte) {
+	e := *efaceOf(&ep)
+	p := e.data
+	t := e._type
+	// data or bss
+	roots := gcRoots
+	for roots != nil {
+		for i := 0; i < roots.count; i++ {
+			pr := roots.roots[i]
+			addr := uintptr(pr.decl)
+			if addr <= uintptr(p) && uintptr(p) < addr+pr.size {
+				n := (*ptrtype)(unsafe.Pointer(t)).elem.size
+				mask = make([]byte, n/sys.PtrSize)
+				copy(mask, (*[1 << 29]uint8)(unsafe.Pointer(pr.gcdata))[:pr.ptrdata])
+				return
+			}
+		}
+		roots = roots.next
+	}
+
+	// heap
+	var n uintptr
+	var base uintptr
+	if mlookup(uintptr(p), &base, &n, nil) != 0 {
+		mask = make([]byte, n/sys.PtrSize)
+		for i := uintptr(0); i < n; i += sys.PtrSize {
+			hbits := heapBitsForAddr(base + i)
+			if hbits.isPointer() {
+				mask[i/sys.PtrSize] = 1
+			}
+			if i != 1*sys.PtrSize && !hbits.morePointers() {
+				mask = mask[:i/sys.PtrSize]
+				break
+			}
+		}
+		return
+	}
+
+	// otherwise, not something the GC knows about.
+	// possibly read-only data, like malloc(0).
+	// must not have pointers
+	// For gccgo, may live on the stack, which is collected conservatively.
+	return
+}
diff --git a/libgo/go/runtime/mcache.go b/libgo/go/runtime/mcache.go
index b65dd37..92dabef 100644
--- a/libgo/go/runtime/mcache.go
+++ b/libgo/go/runtime/mcache.go
@@ -4,16 +4,8 @@
 
 package runtime
 
-// This is a temporary mcache.go for gccgo.
-// At some point it will be replaced by the one in the gc runtime package.
-
 import "unsafe"
 
-type mcachelist struct {
-	list  *mlink
-	nlist uint32
-}
-
 // Per-thread (in Go, per-P) cache for small objects.
 // No locking needed because it is per-thread (per-P).
 //
@@ -24,8 +16,8 @@
 type mcache struct {
 	// The following members are accessed on every malloc,
 	// so they are grouped here for better caching.
-	next_sample      int32   // trigger heap sample after allocating this many bytes
-	local_cachealloc uintptr // bytes allocated (or freed) from cache since last lock of heap
+	next_sample int32   // trigger heap sample after allocating this many bytes
+	local_scan  uintptr // bytes of scannable heap allocated
 
 	// Allocator cache for tiny objects w/o pointers.
 	// See "Tiny allocator" comment in malloc.go.
@@ -36,12 +28,12 @@
 	// tiny is a heap pointer. Since mcache is in non-GC'd memory,
 	// we handle it by clearing it in releaseAll during mark
 	// termination.
-	tiny     unsafe.Pointer
-	tinysize uintptr
+	tiny             uintptr
+	tinyoffset       uintptr
+	local_tinyallocs uintptr // number of tiny allocs not counted in other stats
 
 	// The rest is not accessed on every malloc.
-	alloc [_NumSizeClasses]*mspan     // spans to allocate from
-	free  [_NumSizeClasses]mcachelist // lists of explicitly freed objects
+	alloc [_NumSizeClasses]*mspan // spans to allocate from
 
 	// Local allocator stats, flushed during GC.
 	local_nlookup    uintptr                  // number of pointer lookups
@@ -50,46 +42,98 @@
 	local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
 }
 
-type mtypes struct {
-	compression byte
-	data        uintptr
+// A gclink is a node in a linked list of blocks, like mlink,
+// but it is opaque to the garbage collector.
+// The GC does not trace the pointers during collection,
+// and the compiler does not emit write barriers for assignments
+// of gclinkptr values. Code should store references to gclinks
+// as gclinkptr, not as *gclink.
+type gclink struct {
+	next gclinkptr
 }
 
-type special struct {
-	next   *special
-	offset uint16
-	kind   byte
+// A gclinkptr is a pointer to a gclink, but it is opaque
+// to the garbage collector.
+type gclinkptr uintptr
+
+// ptr returns the *gclink form of p.
+// The result should be used for accessing fields, not stored
+// in other data structures.
+func (p gclinkptr) ptr() *gclink {
+	return (*gclink)(unsafe.Pointer(p))
 }
 
-type mspan struct {
-	next     *mspan // next span in list, or nil if none
-	prev     *mspan // previous span's next field, or list head's first field if none
-	start    uintptr
-	npages   uintptr // number of pages in span
-	freelist *mlink
+// dummy MSpan that contains no free objects.
+var emptymspan mspan
 
-	// sweep generation:
-	// if sweepgen == h->sweepgen - 2, the span needs sweeping
-	// if sweepgen == h->sweepgen - 1, the span is currently being swept
-	// if sweepgen == h->sweepgen, the span is swept and ready to use
-	// h->sweepgen is incremented by 2 after every GC
-
-	sweepgen    uint32
-	ref         uint16
-	sizeclass   uint8   // size class
-	incache     bool    // being used by an mcache
-	state       uint8   // mspaninuse etc
-	needzero    uint8   // needs to be zeroed before allocation
-	elemsize    uintptr // computed from sizeclass or from npages
-	unusedsince int64   // first time spotted by gc in mspanfree state
-	npreleased  uintptr // number of pages released to the os
-	limit       uintptr // end of data in span
-	types       mtypes
-	speciallock mutex    // guards specials list
-	specials    *special // linked list of special records sorted by offset.
-	freebuf     *mlink
+func allocmcache() *mcache {
+	lock(&mheap_.lock)
+	c := (*mcache)(mheap_.cachealloc.alloc())
+	unlock(&mheap_.lock)
+	for i := 0; i < _NumSizeClasses; i++ {
+		c.alloc[i] = &emptymspan
+	}
+	c.next_sample = nextSample()
+	return c
 }
 
-type mlink struct {
-	next *mlink
+func freemcache(c *mcache) {
+	systemstack(func() {
+		c.releaseAll()
+
+		// NOTE(rsc,rlh): If gcworkbuffree comes back, we need to coordinate
+		// with the stealing of gcworkbufs during garbage collection to avoid
+		// a race where the workbuf is double-freed.
+		// gcworkbuffree(c.gcworkbuf)
+
+		lock(&mheap_.lock)
+		purgecachedstats(c)
+		mheap_.cachealloc.free(unsafe.Pointer(c))
+		unlock(&mheap_.lock)
+	})
+}
+
+// Gets a span that has a free object in it and assigns it
+// to be the cached span for the given sizeclass. Returns this span.
+func (c *mcache) refill(sizeclass int32) *mspan {
+	_g_ := getg()
+
+	_g_.m.locks++
+	// Return the current cached span to the central lists.
+	s := c.alloc[sizeclass]
+
+	if uintptr(s.allocCount) != s.nelems {
+		throw("refill of span with free space remaining")
+	}
+
+	if s != &emptymspan {
+		s.incache = false
+	}
+
+	// Get a new cached span from the central lists.
+	s = mheap_.central[sizeclass].mcentral.cacheSpan()
+	if s == nil {
+		throw("out of memory")
+	}
+
+	if uintptr(s.allocCount) == s.nelems {
+		throw("span has no free space")
+	}
+
+	c.alloc[sizeclass] = s
+	_g_.m.locks--
+	return s
+}
+
+func (c *mcache) releaseAll() {
+	for i := 0; i < _NumSizeClasses; i++ {
+		s := c.alloc[i]
+		if s != &emptymspan {
+			mheap_.central[i].mcentral.uncacheSpan(s)
+			c.alloc[i] = &emptymspan
+		}
+	}
+	// Clear tinyalloc pool.
+	c.tiny = 0
+	c.tinyoffset = 0
 }
diff --git a/libgo/go/runtime/mcentral.go b/libgo/go/runtime/mcentral.go
new file mode 100644
index 0000000..ddcf81e
--- /dev/null
+++ b/libgo/go/runtime/mcentral.go
@@ -0,0 +1,222 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Central free lists.
+//
+// See malloc.go for an overview.
+//
+// The MCentral doesn't actually contain the list of free objects; the MSpan does.
+// Each MCentral is two lists of MSpans: those with free objects (c->nonempty)
+// and those that are completely allocated (c->empty).
+
+package runtime
+
+import "runtime/internal/atomic"
+
+// Central list of free objects of a given size.
+//
+//go:notinheap
+type mcentral struct {
+	lock      mutex
+	sizeclass int32
+	nonempty  mSpanList // list of spans with a free object, i.e. a nonempty free list
+	empty     mSpanList // list of spans with no free objects (or cached in an mcache)
+}
+
+// Initialize a single central free list.
+func (c *mcentral) init(sizeclass int32) {
+	c.sizeclass = sizeclass
+	c.nonempty.init()
+	c.empty.init()
+}
+
+// Allocate a span to use in an MCache.
+func (c *mcentral) cacheSpan() *mspan {
+	// Deduct credit for this span allocation and sweep if necessary.
+	spanBytes := uintptr(class_to_allocnpages[c.sizeclass]) * _PageSize
+	deductSweepCredit(spanBytes, 0)
+
+	lock(&c.lock)
+	sg := mheap_.sweepgen
+retry:
+	var s *mspan
+	for s = c.nonempty.first; s != nil; s = s.next {
+		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+			c.nonempty.remove(s)
+			c.empty.insertBack(s)
+			unlock(&c.lock)
+			s.sweep(true)
+			goto havespan
+		}
+		if s.sweepgen == sg-1 {
+			// the span is being swept by background sweeper, skip
+			continue
+		}
+		// we have a nonempty span that does not require sweeping, allocate from it
+		c.nonempty.remove(s)
+		c.empty.insertBack(s)
+		unlock(&c.lock)
+		goto havespan
+	}
+
+	for s = c.empty.first; s != nil; s = s.next {
+		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+			// we have an empty span that requires sweeping,
+			// sweep it and see if we can free some space in it
+			c.empty.remove(s)
+			// swept spans are at the end of the list
+			c.empty.insertBack(s)
+			unlock(&c.lock)
+			s.sweep(true)
+			freeIndex := s.nextFreeIndex()
+			if freeIndex != s.nelems {
+				s.freeindex = freeIndex
+				goto havespan
+			}
+			lock(&c.lock)
+			// the span is still empty after sweep
+			// it is already in the empty list, so just retry
+			goto retry
+		}
+		if s.sweepgen == sg-1 {
+			// the span is being swept by background sweeper, skip
+			continue
+		}
+		// already swept empty span,
+		// all subsequent ones must also be either swept or in process of sweeping
+		break
+	}
+	unlock(&c.lock)
+
+	// Replenish central list if empty.
+	s = c.grow()
+	if s == nil {
+		return nil
+	}
+	lock(&c.lock)
+	c.empty.insertBack(s)
+	unlock(&c.lock)
+
+	// At this point s is a non-empty span, queued at the end of the empty list,
+	// c is unlocked.
+havespan:
+	cap := int32((s.npages << _PageShift) / s.elemsize)
+	n := cap - int32(s.allocCount)
+	if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems {
+		throw("span has no free objects")
+	}
+	usedBytes := uintptr(s.allocCount) * s.elemsize
+	if usedBytes > 0 {
+		reimburseSweepCredit(usedBytes)
+	}
+	atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes))
+	if trace.enabled {
+		// heap_live changed.
+		traceHeapAlloc()
+	}
+	if gcBlackenEnabled != 0 {
+		// heap_live changed.
+		gcController.revise()
+	}
+	s.incache = true
+	freeByteBase := s.freeindex &^ (64 - 1)
+	whichByte := freeByteBase / 8
+	// Init alloc bits cache.
+	s.refillAllocCache(whichByte)
+
+	// Adjust the allocCache so that s.freeindex corresponds to the low bit in
+	// s.allocCache.
+	s.allocCache >>= s.freeindex % 64
+
+	return s
+}
+
+// Return span from an MCache.
+func (c *mcentral) uncacheSpan(s *mspan) {
+	lock(&c.lock)
+
+	s.incache = false
+
+	if s.allocCount == 0 {
+		throw("uncaching span but s.allocCount == 0")
+	}
+
+	cap := int32((s.npages << _PageShift) / s.elemsize)
+	n := cap - int32(s.allocCount)
+	if n > 0 {
+		c.empty.remove(s)
+		c.nonempty.insert(s)
+		// mCentral_CacheSpan conservatively counted
+		// unallocated slots in heap_live. Undo this.
+		atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
+	}
+	unlock(&c.lock)
+}
+
+// freeSpan updates c and s after sweeping s.
+// It sets s's sweepgen to the latest generation,
+// and, based on the number of free objects in s,
+// moves s to the appropriate list of c or returns it
+// to the heap.
+// freeSpan returns true if s was returned to the heap.
+// If preserve=true, it does not move s (the caller
+// must take care of it).
+func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool {
+	if s.incache {
+		throw("freeSpan given cached span")
+	}
+	s.needzero = 1
+
+	if preserve {
+		// preserve is set only when called from MCentral_CacheSpan above,
+		// the span must be in the empty list.
+		if !s.inList() {
+			throw("can't preserve unlinked span")
+		}
+		atomic.Store(&s.sweepgen, mheap_.sweepgen)
+		return false
+	}
+
+	lock(&c.lock)
+
+	// Move to nonempty if necessary.
+	if wasempty {
+		c.empty.remove(s)
+		c.nonempty.insert(s)
+	}
+
+	// delay updating sweepgen until here. This is the signal that
+	// the span may be used in an MCache, so it must come after the
+	// linked list operations above (actually, just after the
+	// lock of c above.)
+	atomic.Store(&s.sweepgen, mheap_.sweepgen)
+
+	if s.allocCount != 0 {
+		unlock(&c.lock)
+		return false
+	}
+
+	c.nonempty.remove(s)
+	unlock(&c.lock)
+	mheap_.freeSpan(s, 0)
+	return true
+}
+
+// grow allocates a new empty span from the heap and initializes it for c's size class.
+func (c *mcentral) grow() *mspan {
+	npages := uintptr(class_to_allocnpages[c.sizeclass])
+	size := uintptr(class_to_size[c.sizeclass])
+	n := (npages << _PageShift) / size
+
+	s := mheap_.alloc(npages, c.sizeclass, false, true)
+	if s == nil {
+		return nil
+	}
+
+	p := s.base()
+	s.limit = p + size*n
+
+	heapBitsForSpan(s.base()).initSpan(s)
+	return s
+}
diff --git a/libgo/go/runtime/mem_gccgo.go b/libgo/go/runtime/mem_gccgo.go
new file mode 100644
index 0000000..161ff26
--- /dev/null
+++ b/libgo/go/runtime/mem_gccgo.go
@@ -0,0 +1,280 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// The gccgo version of mem_*.go.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// Functions called by C code.
+//go:linkname sysAlloc runtime.sysAlloc
+
+//extern mmap
+func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uintptr) unsafe.Pointer
+
+//extern munmap
+func munmap(addr unsafe.Pointer, length uintptr) int32
+
+//extern mincore
+func mincore(addr unsafe.Pointer, n uintptr, dst *byte) int32
+
+//extern madvise
+func madvise(addr unsafe.Pointer, n uintptr, flags int32) int32
+
+var mmapFD = int32(-1)
+
+var devZero = []byte("/dev/zero\x00")
+
+func init() {
+	if _MAP_ANON == 0 {
+		mmapFD = open(&devZero[0], 0 /* O_RDONLY */, 0)
+		if mmapFD < 0 {
+			println("open /dev/zero: errno=", errno())
+			exit(2)
+		}
+	}
+}
+
+// NOTE: vec must be just 1 byte long here.
+// Mincore returns ENOMEM if any of the pages are unmapped,
+// but we want to know that all of the pages are unmapped.
+// To make these the same, we can only ask about one page
+// at a time. See golang.org/issue/7476.
+var addrspace_vec [1]byte
+
+func addrspace_free(v unsafe.Pointer, n uintptr) bool {
+	for off := uintptr(0); off < n; off += physPageSize {
+		// Use a length of 1 byte, which the kernel will round
+		// up to one physical page regardless of the true
+		// physical page size.
+		errval := 0
+		if mincore(unsafe.Pointer(uintptr(v)+off), 1, &addrspace_vec[0]) < 0 {
+			errval = errno()
+		}
+		if errval == _ENOSYS {
+			// mincore is not available on this system.
+			// Assume the address is available.
+			return true
+		}
+		if errval == _EINVAL {
+			// Address is not a multiple of the physical
+			// page size. Shouldn't happen, but just ignore it.
+			continue
+		}
+		// ENOMEM means unmapped, which is what we want.
+		// Anything else we assume means the pages are mapped.
+		if errval != _ENOMEM {
+			return false
+		}
+	}
+	return true
+}
+
+func mmap_fixed(v unsafe.Pointer, n uintptr, prot, flags, fd int32, offset uintptr) unsafe.Pointer {
+	p := mmap(v, n, prot, flags, fd, offset)
+	// On some systems, mmap ignores v without
+	// MAP_FIXED, so retry if the address space is free.
+	if p != v && addrspace_free(v, n) {
+		if uintptr(p) != _MAP_FAILED {
+			munmap(p, n)
+		}
+		p = mmap(v, n, prot, flags|_MAP_FIXED, fd, offset)
+	}
+	return p
+}
+
+// Don't split the stack as this method may be invoked without a valid G, which
+// prevents us from allocating more stack.
+//go:nosplit
+func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+	p := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0)
+	if uintptr(p) == _MAP_FAILED {
+		errval := errno()
+		if errval == _EACCES {
+			print("runtime: mmap: access denied\n")
+			exit(2)
+		}
+		if errval == _EAGAIN {
+			print("runtime: mmap: too much locked memory (check 'ulimit -l').\n")
+			exit(2)
+		}
+		return nil
+	}
+	mSysStatInc(sysStat, n)
+	return p
+}
+
+func sysUnused(v unsafe.Pointer, n uintptr) {
+	// By default, Linux's "transparent huge page" support will
+	// merge pages into a huge page if there's even a single
+	// present regular page, undoing the effects of the DONTNEED
+	// below. On amd64, that means khugepaged can turn a single
+	// 4KB page to 2MB, bloating the process's RSS by as much as
+	// 512X. (See issue #8832 and Linux kernel bug
+	// https://bugzilla.kernel.org/show_bug.cgi?id=93111)
+	//
+	// To work around this, we explicitly disable transparent huge
+	// pages when we release pages of the heap. However, we have
+	// to do this carefully because changing this flag tends to
+	// split the VMA (memory mapping) containing v in to three
+	// VMAs in order to track the different values of the
+	// MADV_NOHUGEPAGE flag in the different regions. There's a
+	// default limit of 65530 VMAs per address space (sysctl
+	// vm.max_map_count), so we must be careful not to create too
+	// many VMAs (see issue #12233).
+	//
+	// Since huge pages are huge, there's little use in adjusting
+	// the MADV_NOHUGEPAGE flag on a fine granularity, so we avoid
+	// exploding the number of VMAs by only adjusting the
+	// MADV_NOHUGEPAGE flag on a large granularity. This still
+	// gets most of the benefit of huge pages while keeping the
+	// number of VMAs under control. With hugePageSize = 2MB, even
+	// a pessimal heap can reach 128GB before running out of VMAs.
+	if sys.HugePageSize != 0 && _MADV_NOHUGEPAGE != 0 {
+		var s uintptr = sys.HugePageSize // division by constant 0 is a compile-time error :(
+
+		// If it's a large allocation, we want to leave huge
+		// pages enabled. Hence, we only adjust the huge page
+		// flag on the huge pages containing v and v+n-1, and
+		// only if those aren't aligned.
+		var head, tail uintptr
+		if uintptr(v)%s != 0 {
+			// Compute huge page containing v.
+			head = uintptr(v) &^ (s - 1)
+		}
+		if (uintptr(v)+n)%s != 0 {
+			// Compute huge page containing v+n-1.
+			tail = (uintptr(v) + n - 1) &^ (s - 1)
+		}
+
+		// Note that madvise will return EINVAL if the flag is
+		// already set, which is quite likely. We ignore
+		// errors.
+		if head != 0 && head+sys.HugePageSize == tail {
+			// head and tail are different but adjacent,
+			// so do this in one call.
+			madvise(unsafe.Pointer(head), 2*sys.HugePageSize, _MADV_NOHUGEPAGE)
+		} else {
+			// Advise the huge pages containing v and v+n-1.
+			if head != 0 {
+				madvise(unsafe.Pointer(head), sys.HugePageSize, _MADV_NOHUGEPAGE)
+			}
+			if tail != 0 && tail != head {
+				madvise(unsafe.Pointer(tail), sys.HugePageSize, _MADV_NOHUGEPAGE)
+			}
+		}
+	}
+
+	if uintptr(v)&(physPageSize-1) != 0 || n&(physPageSize-1) != 0 {
+		// madvise will round this to any physical page
+		// *covered* by this range, so an unaligned madvise
+		// will release more memory than intended.
+		throw("unaligned sysUnused")
+	}
+
+	if _MADV_DONTNEED != 0 {
+		madvise(v, n, _MADV_DONTNEED)
+	} else if _MADV_FREE != 0 {
+		madvise(v, n, _MADV_FREE)
+	}
+}
+
+func sysUsed(v unsafe.Pointer, n uintptr) {
+	if sys.HugePageSize != 0 && _MADV_HUGEPAGE != 0 {
+		// Partially undo the NOHUGEPAGE marks from sysUnused
+		// for whole huge pages between v and v+n. This may
+		// leave huge pages off at the end points v and v+n
+		// even though allocations may cover these entire huge
+		// pages. We could detect this and undo NOHUGEPAGE on
+		// the end points as well, but it's probably not worth
+		// the cost because when neighboring allocations are
+		// freed sysUnused will just set NOHUGEPAGE again.
+		var s uintptr = sys.HugePageSize
+
+		// Round v up to a huge page boundary.
+		beg := (uintptr(v) + (s - 1)) &^ (s - 1)
+		// Round v+n down to a huge page boundary.
+		end := (uintptr(v) + n) &^ (s - 1)
+
+		if beg < end {
+			madvise(unsafe.Pointer(beg), end-beg, _MADV_HUGEPAGE)
+		}
+	}
+}
+
+// Don't split the stack as this function may be invoked without a valid G,
+// which prevents us from allocating more stack.
+//go:nosplit
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
+	mSysStatDec(sysStat, n)
+	munmap(v, n)
+}
+
+func sysFault(v unsafe.Pointer, n uintptr) {
+	mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE|_MAP_FIXED, mmapFD, 0)
+}
+
+func sysReserve(v unsafe.Pointer, n uintptr, reserved *bool) unsafe.Pointer {
+	// On 64-bit, people with ulimit -v set complain if we reserve too
+	// much address space. Instead, assume that the reservation is okay
+	// if we can reserve at least 64K and check the assumption in SysMap.
+	// Only user-mode Linux (UML) rejects these requests.
+	if sys.PtrSize == 8 && uint64(n) > 1<<32 {
+		p := mmap_fixed(v, 64<<10, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0)
+		if p != v {
+			if uintptr(p) != _MAP_FAILED {
+				munmap(p, 64<<10)
+			}
+			return nil
+		}
+		munmap(p, 64<<10)
+		*reserved = false
+		return v
+	}
+
+	p := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0)
+	if uintptr(p) == _MAP_FAILED {
+		return nil
+	}
+	*reserved = true
+	return p
+}
+
+func sysMap(v unsafe.Pointer, n uintptr, reserved bool, sysStat *uint64) {
+	mSysStatInc(sysStat, n)
+
+	// On 64-bit, we don't actually have v reserved, so tread carefully.
+	if !reserved {
+		flags := int32(_MAP_ANON | _MAP_PRIVATE)
+		if GOOS == "dragonfly" {
+			// TODO(jsing): For some reason DragonFly seems to return
+			// memory at a different address than we requested, even when
+			// there should be no reason for it to do so. This can be
+			// avoided by using MAP_FIXED, but I'm not sure we should need
+			// to do this - we do not on other platforms.
+			flags |= _MAP_FIXED
+		}
+		p := mmap_fixed(v, n, _PROT_READ|_PROT_WRITE, flags, mmapFD, 0)
+		if uintptr(p) == _MAP_FAILED && errno() == _ENOMEM {
+			throw("runtime: out of memory")
+		}
+		if p != v {
+			print("runtime: address space conflict: map(", v, ") = ", p, "\n")
+			throw("runtime: address space conflict")
+		}
+		return
+	}
+
+	p := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, mmapFD, 0)
+	if uintptr(p) == _MAP_FAILED && errno() == _ENOMEM {
+		throw("runtime: out of memory")
+	}
+	if p != v {
+		throw("runtime: cannot map pages in arena address space")
+	}
+}
diff --git a/libgo/go/runtime/mfinal.go b/libgo/go/runtime/mfinal.go
index 34bef2a..f0123b3 100644
--- a/libgo/go/runtime/mfinal.go
+++ b/libgo/go/runtime/mfinal.go
@@ -12,14 +12,6 @@
 	"unsafe"
 )
 
-// Functions still in C.
-func addfinalizer(p unsafe.Pointer, f *funcval, ft *functype, ot *ptrtype) bool
-func removefinalizer(p unsafe.Pointer)
-
-// Temporary for calling from C code.
-//go:linkname queuefinalizer runtime.queuefinalizer
-//go:linkname iterate_finq runtime.iterate_finq
-
 // finblock is allocated from non-GC'd memory, so any heap pointers
 // must be specially handled.
 //
@@ -65,9 +57,6 @@
 				// pointer mask to use while scanning.
 				// Since all the values in finalizer are
 				// pointers, just turn all bits on.
-				//
-				// Note for gccgo: this is not used yet,
-				// but will be used soon with the new GC.
 				for i := range finptrmask {
 					finptrmask[i] = 0xff
 				}
@@ -287,7 +276,18 @@
 			return
 		}
 
-		throw("runtime.SetFinalizer: pointer not in allocated block")
+		// Global initializers might be linker-allocated.
+		//	var Foo = &Object{}
+		//	func main() {
+		//		runtime.SetFinalizer(Foo, nil)
+		//	}
+		// The relevant segments are: noptrdata, data, bss, noptrbss.
+		// We cannot assume they are in any order or even contiguous,
+		// due to external linking.
+		//
+		// For gccgo we have no reliable way to detect them,
+		// so we just return.
+		return
 	}
 
 	if e.data != base {
@@ -355,18 +355,44 @@
 	})
 }
 
-//extern runtime_mlookup
-func runtime_mlookup(unsafe.Pointer, **byte, *uintptr, **mspan) int32
-
 // Look up pointer v in heap. Return the span containing the object,
 // the start of the object, and the size of the object. If the object
 // does not exist, return nil, nil, 0.
 func findObject(v unsafe.Pointer) (s *mspan, x unsafe.Pointer, n uintptr) {
-	var base *byte
-	if runtime_mlookup(v, &base, &n, &s) == 0 {
-		return nil, nil, 0
+	c := gomcache()
+	c.local_nlookup++
+	if sys.PtrSize == 4 && c.local_nlookup >= 1<<30 {
+		// purge cache stats to prevent overflow
+		lock(&mheap_.lock)
+		purgecachedstats(c)
+		unlock(&mheap_.lock)
 	}
-	return s, unsafe.Pointer(base), n
+
+	// find span
+	arena_start := mheap_.arena_start
+	arena_used := mheap_.arena_used
+	if uintptr(v) < arena_start || uintptr(v) >= arena_used {
+		return
+	}
+	p := uintptr(v) >> pageShift
+	q := p - arena_start>>pageShift
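+	// p is the absolute page number of v; q is that page's index
+	// into mheap_.spans, which maps each page of the arena
+	// (starting at arena_start) to the span containing it.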
+	s = mheap_.spans[q]
+	if s == nil {
+		return
+	}
+	x = unsafe.Pointer(s.base())
+
+	if uintptr(v) < uintptr(x) || uintptr(v) >= uintptr(unsafe.Pointer(s.limit)) || s.state != mSpanInUse {
+		s = nil
+		x = nil
+		return
+	}
+
+	n = s.elemsize
+	if s.sizeclass != 0 {
+		x = add(x, (uintptr(v)-uintptr(x))/n*n)
+	}
+	return
 }
 
 // Mark KeepAlive as noinline so that the current compiler will ensure
diff --git a/libgo/go/runtime/mfixalloc.go b/libgo/go/runtime/mfixalloc.go
new file mode 100644
index 0000000..fe4b0fc
--- /dev/null
+++ b/libgo/go/runtime/mfixalloc.go
@@ -0,0 +1,99 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Fixed-size object allocator. Returned memory is zeroed by default
+// (see the zero flag below).
+//
+// See malloc.go for overview.
+
+package runtime
+
+import "unsafe"
+
+// FixAlloc is a simple free-list allocator for fixed size objects.
+// Malloc uses a FixAlloc wrapped around sysAlloc to manage its
+// MCache and MSpan objects.
+//
+// Memory returned by fixalloc.alloc is zeroed by default, but the
+// caller may take responsibility for zeroing allocations by setting
+// the zero flag to false. This is only safe if the memory never
+// contains heap pointers.
+//
+// The caller is responsible for locking around FixAlloc calls.
+// Callers can keep state in the object but the first word is
+// smashed by freeing and reallocating.
+//
+// Consider marking fixalloc'd types go:notinheap.
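+//
+// A rough usage sketch (illustrative only; recordspan and the stat
+// pointer are stand-ins for whatever the caller provides):
+//
+//	var spanalloc fixalloc
+//	spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
+//	s := (*mspan)(spanalloc.alloc())
+//	// ... use s ...
+//	spanalloc.free(unsafe.Pointer(s))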
+type fixalloc struct {
+	size   uintptr
+	first  func(arg, p unsafe.Pointer) // called first time p is returned
+	arg    unsafe.Pointer
+	list   *mlink
+	chunk  unsafe.Pointer
+	nchunk uint32
+	inuse  uintptr // in-use bytes now
+	stat   *uint64
+	zero   bool // zero allocations
+}
+
+// A generic linked list of blocks.  (Typically the block is bigger than sizeof(MLink).)
+// Since assignments to mlink.next will result in a write barrier being performed
+// this cannot be used by some of the internal GC structures. For example when
+// the sweeper is placing an unmarked object on the free list it does not want the
+// write barrier to be called since that could result in the object being reachable.
+//
+//go:notinheap
+type mlink struct {
+	next *mlink
+}
+
+// Initialize f to allocate objects of the given size,
+// using the allocator to obtain chunks of memory.
+func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *uint64) {
+	f.size = size
+	f.first = first
+	f.arg = arg
+	f.list = nil
+	f.chunk = nil
+	f.nchunk = 0
+	f.inuse = 0
+	f.stat = stat
+	f.zero = true
+}
+
+func (f *fixalloc) alloc() unsafe.Pointer {
+	if f.size == 0 {
+		print("runtime: use of FixAlloc_Alloc before FixAlloc_Init\n")
+		throw("runtime: internal error")
+	}
+
+	if f.list != nil {
+		v := unsafe.Pointer(f.list)
+		f.list = f.list.next
+		f.inuse += f.size
+		if f.zero {
+			memclrNoHeapPointers(v, f.size)
+		}
+		return v
+	}
+	if uintptr(f.nchunk) < f.size {
+		f.chunk = persistentalloc(_FixAllocChunk, 0, f.stat)
+		f.nchunk = _FixAllocChunk
+	}
+
+	v := f.chunk
+	if f.first != nil {
+		f.first(f.arg, v)
+	}
+	f.chunk = add(f.chunk, f.size)
+	f.nchunk -= uint32(f.size)
+	f.inuse += f.size
+	return v
+}
+
+func (f *fixalloc) free(p unsafe.Pointer) {
+	f.inuse -= f.size
+	v := (*mlink)(p)
+	v.next = f.list
+	f.list = v
+}
diff --git a/libgo/go/runtime/mgc.go b/libgo/go/runtime/mgc.go
new file mode 100644
index 0000000..abec9d3
--- /dev/null
+++ b/libgo/go/runtime/mgc.go
@@ -0,0 +1,1963 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Garbage collector (GC).
+//
+// The GC runs concurrently with mutator threads, is type accurate (aka precise), and allows multiple
+// GC threads to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is
+// non-generational and non-compacting. Allocation is done using size segregated per P allocation
+// areas to minimize fragmentation while eliminating locks in the common case.
+//
+// The algorithm decomposes into several steps.
+// This is a high level description of the algorithm being used. For an overview of GC a good
+// place to start is Richard Jones' gchandbook.org.
+//
+// The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see
+// Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978.
+// On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978),
+// 966-975.
+// For journal quality proofs that these steps are complete, correct, and terminate see
+// Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
+// Concurrency and Computation: Practice and Experience 15(3-5), 2003.
+//
+// 1. GC performs sweep termination.
+//
+//    a. Stop the world. This causes all Ps to reach a GC safe-point.
+//
+//    b. Sweep any unswept spans. There will only be unswept spans if
+//    this GC cycle was forced before the expected time.
+//
+// 2. GC performs the "mark 1" sub-phase. In this sub-phase, Ps are
+// allowed to locally cache parts of the work queue.
+//
+//    a. Prepare for the mark phase by setting gcphase to _GCmark
+//    (from _GCoff), enabling the write barrier, enabling mutator
+//    assists, and enqueueing root mark jobs. No objects may be
+//    scanned until all Ps have enabled the write barrier, which is
+//    accomplished using STW.
+//
+//    b. Start the world. From this point, GC work is done by mark
+//    workers started by the scheduler and by assists performed as
+//    part of allocation. The write barrier shades both the
+//    overwritten pointer and the new pointer value for any pointer
+//    writes (see mbarrier.go for details). Newly allocated objects
+//    are immediately marked black.
+//
+//    c. GC performs root marking jobs. This includes scanning all
+//    stacks, shading all globals, and shading any heap pointers in
+//    off-heap runtime data structures. Scanning a stack stops a
+//    goroutine, shades any pointers found on its stack, and then
+//    resumes the goroutine.
+//
+//    d. GC drains the work queue of grey objects, scanning each grey
+//    object to black and shading all pointers found in the object
+//    (which in turn may add those pointers to the work queue).
+//
+// 3. Once the global work queue is empty (but local work queue caches
+// may still contain work), GC performs the "mark 2" sub-phase.
+//
+//    a. GC stops all workers, disables local work queue caches,
+//    flushes each P's local work queue cache to the global work queue
+//    cache, and reenables workers.
+//
+//    b. GC again drains the work queue, as in 2d above.
+//
+// 4. Once the work queue is empty, GC performs mark termination.
+//
+//    a. Stop the world.
+//
+//    b. Set gcphase to _GCmarktermination, and disable workers and
+//    assists.
+//
+//    c. Drain any remaining work from the work queue (typically there
+//    will be none).
+//
+//    d. Perform other housekeeping like flushing mcaches.
+//
+// 5. GC performs the sweep phase.
+//
+//    a. Prepare for the sweep phase by setting gcphase to _GCoff,
+//    setting up sweep state and disabling the write barrier.
+//
+//    b. Start the world. From this point on, newly allocated objects
+//    are white, and allocating sweeps spans before use if necessary.
+//
+//    c. GC does concurrent sweeping in the background and in response
+//    to allocation. See description below.
+//
+// 6. When sufficient allocation has taken place, replay the sequence
+// starting with 1 above. See discussion of GC rate below.
+
+// Concurrent sweep.
+//
+// The sweep phase proceeds concurrently with normal program execution.
+// The heap is swept span-by-span both lazily (when a goroutine needs another span)
+// and concurrently in a background goroutine (this helps programs that are not CPU bound).
+// At the end of STW mark termination all spans are marked as "needs sweeping".
+//
+// The background sweeper goroutine simply sweeps spans one-by-one.
+//
+// To avoid requesting more OS memory while there are unswept spans, when a
+// goroutine needs another span, it first attempts to reclaim that much memory
+// by sweeping. When a goroutine needs to allocate a new small-object span, it
+// sweeps small-object spans for the same object size until it frees at least
+// one object. When a goroutine needs to allocate a large-object span from the heap,
+// it sweeps spans until it frees at least that many pages into the heap. There is
+// one case where this may not suffice: if a goroutine sweeps and frees two
+// nonadjacent one-page spans to the heap, it will allocate a new two-page
+// span, but there can still be other one-page unswept spans which could be
+// combined into a two-page span.
+//
+// It's critical to ensure that no operations proceed on unswept spans (that would corrupt
+// mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
+// so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
+// When a goroutine explicitly frees an object or sets a finalizer, it ensures that
+// the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
+// The finalizer goroutine is kicked off only when all spans are swept.
+// When the next GC starts, it sweeps all not-yet-swept spans (if any).
+
+// GC rate.
+// Next GC is after we've allocated an extra amount of memory proportional to
+// the amount already in use. The proportion is controlled by GOGC environment variable
+// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
+// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
+// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
+// (and also the amount of extra memory used).
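+// Equivalently, the goal for the next cycle is roughly
+//	heap_marked * (1 + GOGC/100)
+// so with GOGC=50 a 4M live heap yields a goal near 6M.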
+
+// Oblets
+//
+// In order to prevent long pauses while scanning large objects and to
+// improve parallelism, the garbage collector breaks up scan jobs for
+// objects larger than maxObletBytes into "oblets" of at most
+// maxObletBytes. When scanning encounters the beginning of a large
+// object, it scans only the first oblet and enqueues the remaining
+// oblets as new scan jobs.
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+const (
+	_DebugGC         = 0
+	_ConcurrentSweep = true
+	_FinBlockSize    = 4 * 1024
+
+	// sweepMinHeapDistance is a lower bound on the heap distance
+	// (in bytes) reserved for concurrent sweeping between GC
+	// cycles. This will be scaled by gcpercent/100.
+	sweepMinHeapDistance = 1024 * 1024
+)
+
+// heapminimum is the minimum heap size at which to trigger GC.
+// For small heaps, this overrides the usual GOGC*live set rule.
+//
+// When there is a very small live set but a lot of allocation, simply
+// collecting when the heap reaches GOGC*live results in many GC
+// cycles and high total per-GC overhead. This minimum amortizes this
+// per-GC overhead while keeping the heap reasonably small.
+//
+// During initialization this is set to 4MB*GOGC/100. In the case of
+// GOGC==0, this will set heapminimum to 0, resulting in constant
+// collection even when the heap size is small, which is useful for
+// debugging.
+var heapminimum uint64 = defaultHeapMinimum
+
+// defaultHeapMinimum is the value of heapminimum for GOGC==100.
+const defaultHeapMinimum = 4 << 20
+
+// Initialized from $GOGC.  GOGC=off means no GC.
+var gcpercent int32
+
+func gcinit() {
+	if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
+		throw("size of Workbuf is suboptimal")
+	}
+
+	_ = setGCPercent(readgogc())
+	memstats.gc_trigger = heapminimum
+	// Compute the goal heap size based on the trigger:
+	//   trigger = marked * (1 + triggerRatio)
+	//   marked = trigger / (1 + triggerRatio)
+	//   goal = marked * (1 + GOGC/100)
+	//        = trigger / (1 + triggerRatio) * (1 + GOGC/100)
+	memstats.next_gc = uint64(float64(memstats.gc_trigger) / (1 + gcController.triggerRatio) * (1 + float64(gcpercent)/100))
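+	// With the defaults (heapminimum = 4MB, GOGC = 100, initial
+	// triggerRatio = 7/8) this works out to a first-cycle goal of
+	// about 4MB / 1.875 * 2, i.e. roughly 4.3MB.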
+	if gcpercent < 0 {
+		memstats.next_gc = ^uint64(0)
+	}
+	work.startSema = 1
+	work.markDoneSema = 1
+}
+
+func readgogc() int32 {
+	p := gogetenv("GOGC")
+	if p == "off" {
+		return -1
+	}
+	if n, ok := atoi32(p); ok {
+		return n
+	}
+	return 100
+}
+
+// gcenable is called after the bulk of the runtime initialization,
+// just before we're about to start letting user code run.
+// It kicks off the background sweeper goroutine and enables GC.
+func gcenable() {
+	c := make(chan int, 1)
+	go bgsweep(c)
+	<-c
+	memstats.enablegc = true // now that runtime is initialized, GC is okay
+}
+
+//go:linkname setGCPercent runtime_debug.setGCPercent
+func setGCPercent(in int32) (out int32) {
+	lock(&mheap_.lock)
+	out = gcpercent
+	if in < 0 {
+		in = -1
+	}
+	gcpercent = in
+	heapminimum = defaultHeapMinimum * uint64(gcpercent) / 100
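+	// E.g. GOGC=50 lowers heapminimum to 2MB and GOGC=200 raises it to 8MB.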
+	if gcController.triggerRatio > float64(gcpercent)/100 {
+		gcController.triggerRatio = float64(gcpercent) / 100
+	}
+	// This is either in gcinit or followed by a STW GC, both of
+	// which will reset other stats like memstats.gc_trigger and
+	// memstats.next_gc to appropriate values.
+	unlock(&mheap_.lock)
+	return out
+}
+
+// Garbage collector phase.
+// Indicates to write barrier and synchronization task to perform.
+var gcphase uint32
+
+// The compiler knows about this variable.
+// If you change it, you must change the compiler too.
+var writeBarrier struct {
+	enabled bool    // compiler emits a check of this before calling write barrier
+	pad     [3]byte // compiler uses 32-bit load for "enabled" field
+	needed  bool    // whether we need a write barrier for current GC phase
+	cgo     bool    // whether we need a write barrier for a cgo check
+	alignme uint64  // guarantee alignment so that compiler can use a 32 or 64-bit load
+}
+
+// gcBlackenEnabled is 1 if mutator assists and background mark
+// workers are allowed to blacken objects. This must only be set when
+// gcphase == _GCmark.
+var gcBlackenEnabled uint32
+
+// gcBlackenPromptly indicates that optimizations that may
+// hide work from the global work queue should be disabled.
+//
+// If gcBlackenPromptly is true, per-P gcWork caches should
+// be flushed immediately and new objects should be allocated black.
+//
+// There is a tension between allocating objects white and
+// allocating them black. If white and the objects die before being
+// marked they can be collected during this GC cycle. On the other
+// hand allocating them black will reduce _GCmarktermination latency
+// since more work is done in the mark phase. This tension is resolved
+// by allocating white until the mark phase is approaching its end and
+// then allocating black for the remainder of the mark phase.
+var gcBlackenPromptly bool
+
+const (
+	_GCoff             = iota // GC not running; sweeping in background, write barrier disabled
+	_GCmark                   // GC marking roots and workbufs: allocate black, write barrier ENABLED
+	_GCmarktermination        // GC mark termination: allocate black, P's help GC, write barrier ENABLED
+)
+
+//go:nosplit
+func setGCPhase(x uint32) {
+	atomic.Store(&gcphase, x)
+	writeBarrier.needed = gcphase == _GCmark || gcphase == _GCmarktermination
+	writeBarrier.enabled = writeBarrier.needed || writeBarrier.cgo
+}
+
+// gcMarkWorkerMode represents the mode that a concurrent mark worker
+// should operate in.
+//
+// Concurrent marking happens through four different mechanisms. One
+// is mutator assists, which happen in response to allocations and are
+// not scheduled. The other three are variations in the per-P mark
+// workers and are distinguished by gcMarkWorkerMode.
+type gcMarkWorkerMode int
+
+const (
+	// gcMarkWorkerDedicatedMode indicates that the P of a mark
+	// worker is dedicated to running that mark worker. The mark
+	// worker should run without preemption.
+	gcMarkWorkerDedicatedMode gcMarkWorkerMode = iota
+
+	// gcMarkWorkerFractionalMode indicates that a P is currently
+	// running the "fractional" mark worker. The fractional worker
+	// is necessary when GOMAXPROCS*gcGoalUtilization is not an
+	// integer. The fractional worker should run until it is
+	// preempted and will be scheduled to pick up the fractional
+	// part of GOMAXPROCS*gcGoalUtilization.
+	gcMarkWorkerFractionalMode
+
+	// gcMarkWorkerIdleMode indicates that a P is running the mark
+	// worker because it has nothing else to do. The idle worker
+	// should run until it is preempted and account its time
+	// against gcController.idleMarkTime.
+	gcMarkWorkerIdleMode
+)
+
+// gcMarkWorkerModeStrings are the string labels of gcMarkWorkerModes
+// to use in execution traces.
+var gcMarkWorkerModeStrings = [...]string{
+	"GC (dedicated)",
+	"GC (fractional)",
+	"GC (idle)",
+}
+
+// gcController implements the GC pacing controller that determines
+// when to trigger concurrent garbage collection and how much marking
+// work to do in mutator assists and background marking.
+//
+// It uses a feedback control algorithm to adjust the memstats.gc_trigger
+// trigger based on the heap growth and GC CPU utilization each cycle.
+// This algorithm optimizes for heap growth to match GOGC and for CPU
+// utilization between assist and background marking to be 25% of
+// GOMAXPROCS. The high-level design of this algorithm is documented
+// at https://golang.org/s/go15gcpacing.
+var gcController = gcControllerState{
+	// Initial trigger ratio guess.
+	triggerRatio: 7 / 8.0,
+}
+
+type gcControllerState struct {
+	// scanWork is the total scan work performed this cycle. This
+	// is updated atomically during the cycle. Updates occur in
+	// bounded batches, since it is both written and read
+	// throughout the cycle. At the end of the cycle, this is how
+	// much of the retained heap is scannable.
+	//
+	// Currently this is the bytes of heap scanned. For most uses,
+	// this is an opaque unit of work, but for estimation the
+	// definition is important.
+	scanWork int64
+
+	// bgScanCredit is the scan work credit accumulated by the
+	// concurrent background scan. This credit is accumulated by
+	// the background scan and stolen by mutator assists. This is
+	// updated atomically. Updates occur in bounded batches, since
+	// it is both written and read throughout the cycle.
+	bgScanCredit int64
+
+	// assistTime is the nanoseconds spent in mutator assists
+	// during this cycle. This is updated atomically. Updates
+	// occur in bounded batches, since it is both written and read
+	// throughout the cycle.
+	assistTime int64
+
+	// dedicatedMarkTime is the nanoseconds spent in dedicated
+	// mark workers during this cycle. This is updated atomically
+	// at the end of the concurrent mark phase.
+	dedicatedMarkTime int64
+
+	// fractionalMarkTime is the nanoseconds spent in the
+	// fractional mark worker during this cycle. This is updated
+	// atomically throughout the cycle and will be up-to-date if
+	// the fractional mark worker is not currently running.
+	fractionalMarkTime int64
+
+	// idleMarkTime is the nanoseconds spent in idle marking
+	// during this cycle. This is updated atomically throughout
+	// the cycle.
+	idleMarkTime int64
+
+	// markStartTime is the absolute start time in nanoseconds
+	// that assists and background mark workers started.
+	markStartTime int64
+
+	// dedicatedMarkWorkersNeeded is the number of dedicated mark
+	// workers that need to be started. This is computed at the
+	// beginning of each cycle and decremented atomically as
+	// dedicated mark workers get started.
+	dedicatedMarkWorkersNeeded int64
+
+	// assistWorkPerByte is the ratio of scan work to allocated
+	// bytes that should be performed by mutator assists. This is
+	// computed at the beginning of each cycle and updated every
+	// time heap_scan is updated.
+	assistWorkPerByte float64
+
+	// assistBytesPerWork is 1/assistWorkPerByte.
+	assistBytesPerWork float64
+
+	// fractionalUtilizationGoal is the fraction of wall clock
+	// time that should be spent in the fractional mark worker.
+	// For example, if the overall mark utilization goal is 25%
+	// and GOMAXPROCS is 6, one P will be a dedicated mark worker
+	// and this will be set to 0.5 so that 50% of the time some P
+	// is in a fractional mark worker. This is computed at the
+	// beginning of each cycle.
+	fractionalUtilizationGoal float64
+
+	// triggerRatio is the heap growth ratio at which the garbage
+	// collection cycle should start. E.g., if this is 0.6, then
+	// GC should start when the live heap has reached 1.6 times
+	// the heap size marked by the previous cycle. This should be
+	// ≤ GOGC/100 so the trigger heap size is less than the goal
+	// heap size. This is updated at the end of each cycle.
+	triggerRatio float64
+
+	_ [sys.CacheLineSize]byte
+
+	// fractionalMarkWorkersNeeded is the number of fractional
+	// mark workers that need to be started. This is either 0 or
+	// 1. This is potentially updated atomically at every
+	// scheduling point (hence it gets its own cache line).
+	fractionalMarkWorkersNeeded int64
+
+	_ [sys.CacheLineSize]byte
+}
+
+// startCycle resets the GC controller's state and computes estimates
+// for a new GC cycle. The caller must hold worldsema.
+func (c *gcControllerState) startCycle() {
+	c.scanWork = 0
+	c.bgScanCredit = 0
+	c.assistTime = 0
+	c.dedicatedMarkTime = 0
+	c.fractionalMarkTime = 0
+	c.idleMarkTime = 0
+
+	// If this is the first GC cycle or we're operating on a very
+	// small heap, fake heap_marked so it looks like gc_trigger is
+	// the appropriate growth from heap_marked, even though the
+	// real heap_marked may not have a meaningful value (on the
+	// first cycle) or may be much smaller (resulting in a large
+	// error response).
+	if memstats.gc_trigger <= heapminimum {
+		memstats.heap_marked = uint64(float64(memstats.gc_trigger) / (1 + c.triggerRatio))
+	}
+
+	// Re-compute the heap goal for this cycle in case something
+	// changed. This is the same calculation we use elsewhere.
+	memstats.next_gc = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
+	if gcpercent < 0 {
+		memstats.next_gc = ^uint64(0)
+	}
+
+	// Ensure that the heap goal is at least a little larger than
+	// the current live heap size. This may not be the case if GC
+	// start is delayed or if the allocation that pushed heap_live
+	// over gc_trigger is large or if the trigger is really close to
+	// GOGC. Assist is proportional to this distance, so enforce a
+	// minimum distance, even if it means going over the GOGC goal
+	// by a tiny bit.
+	if memstats.next_gc < memstats.heap_live+1024*1024 {
+		memstats.next_gc = memstats.heap_live + 1024*1024
+	}
+
+	// Compute the total mark utilization goal and divide it among
+	// dedicated and fractional workers.
+	totalUtilizationGoal := float64(gomaxprocs) * gcGoalUtilization
+	c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal)
+	c.fractionalUtilizationGoal = totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded)
+	if c.fractionalUtilizationGoal > 0 {
+		c.fractionalMarkWorkersNeeded = 1
+	} else {
+		c.fractionalMarkWorkersNeeded = 0
+	}
+
+	// Clear per-P state
+	for _, p := range &allp {
+		if p == nil {
+			break
+		}
+		p.gcAssistTime = 0
+	}
+
+	// Compute initial values for controls that are updated
+	// throughout the cycle.
+	c.revise()
+
+	if debug.gcpacertrace > 0 {
+		print("pacer: assist ratio=", c.assistWorkPerByte,
+			" (scan ", memstats.heap_scan>>20, " MB in ",
+			work.initialHeapLive>>20, "->",
+			memstats.next_gc>>20, " MB)",
+			" workers=", c.dedicatedMarkWorkersNeeded,
+			"+", c.fractionalMarkWorkersNeeded, "\n")
+	}
+}
+
+// revise updates the assist ratio during the GC cycle to account for
+// improved estimates. This should be called either under STW or
+// whenever memstats.heap_scan or memstats.heap_live is updated (with
+// mheap_.lock held).
+//
+// It should only be called when gcBlackenEnabled != 0 (because this
+// is when assists are enabled and the necessary statistics are
+// available).
+//
+// TODO: Consider removing the periodic controller update altogether.
+// Since we switched to allocating black, in theory we shouldn't have
+// to change the assist ratio. However, this is still a useful hook
+// that we've found many uses for when experimenting.
+func (c *gcControllerState) revise() {
+	// Compute the expected scan work remaining.
+	//
+	// Note that we currently count allocations during GC as both
+	// scannable heap (heap_scan) and scan work completed
+	// (scanWork), so this difference won't be changed by
+	// allocations during GC.
+	//
+	// This particular estimate is a strict upper bound on the
+	// possible remaining scan work for the current heap.
+	// You might consider dividing this by 2 (or by
+	// (100+GOGC)/100) to counter this over-estimation, but
+	// benchmarks show that this has almost no effect on mean
+	// mutator utilization, heap size, or assist time and it
+	// introduces the danger of under-estimating and letting the
+	// mutator outpace the garbage collector.
+	scanWorkExpected := int64(memstats.heap_scan) - c.scanWork
+	if scanWorkExpected < 1000 {
+		// We set a somewhat arbitrary lower bound on
+		// remaining scan work since if we aim a little high,
+		// we can miss by a little.
+		//
+		// We *do* need to enforce that this is at least 1,
+		// since marking is racy and double-scanning objects
+		// may legitimately make the expected scan work
+		// negative.
+		scanWorkExpected = 1000
+	}
+
+	// Compute the heap distance remaining.
+	heapDistance := int64(memstats.next_gc) - int64(memstats.heap_live)
+	if heapDistance <= 0 {
+		// This shouldn't happen, but if it does, avoid
+		// dividing by zero or setting the assist negative.
+		heapDistance = 1
+	}
+
+	// Compute the mutator assist ratio so by the time the mutator
+	// allocates the remaining heap bytes up to next_gc, it will
+	// have done (or stolen) the remaining amount of scan work.
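+	//
+	// For example, if about 10MB of scannable heap remains and the
+	// mutator may allocate another 5MB before reaching next_gc, then
+	// assistWorkPerByte is 2: each allocated byte must be paid for
+	// with two bytes of scan work.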
+	c.assistWorkPerByte = float64(scanWorkExpected) / float64(heapDistance)
+	c.assistBytesPerWork = float64(heapDistance) / float64(scanWorkExpected)
+}
+
+// endCycle updates the GC controller state at the end of the
+// concurrent part of the GC cycle.
+func (c *gcControllerState) endCycle() {
+	h_t := c.triggerRatio // For debugging
+
+	// Proportional response gain for the trigger controller. Must
+	// be in [0, 1]. Lower values smooth out transient effects but
+	// take longer to respond to phase changes. Higher values
+	// react to phase changes quickly, but are more affected by
+	// transient changes. Values near 1 may be unstable.
+	const triggerGain = 0.5
+
+	// Compute next cycle trigger ratio. First, this computes the
+	// "error" for this cycle; that is, how far off the trigger
+	// was from what it should have been, accounting for both heap
+	// growth and GC CPU utilization. We compute the actual heap
+	// growth during this cycle and scale that by how far off from
+	// the goal CPU utilization we were (to estimate the heap
+	// growth if we had the desired CPU utilization). The
+	// difference between this estimate and the GOGC-based goal
+	// heap growth is the error.
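+	//
+	// For example, with GOGC=100 (goal growth 1.0), a trigger ratio
+	// of 0.7, actual growth of 0.8, and 30% of CPU spent in GC
+	// (goal 25%), the error is 1.0 - 0.7 - (0.30/0.25)*(0.8-0.7) =
+	// 0.18, so the damped trigger ratio for the next cycle becomes
+	// 0.7 + 0.5*0.18 = 0.79.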
+	goalGrowthRatio := float64(gcpercent) / 100
+	actualGrowthRatio := float64(memstats.heap_live)/float64(memstats.heap_marked) - 1
+	assistDuration := nanotime() - c.markStartTime
+
+	// Assume background mark hit its utilization goal.
+	utilization := gcGoalUtilization
+	// Add assist utilization; avoid divide by zero.
+	if assistDuration > 0 {
+		utilization += float64(c.assistTime) / float64(assistDuration*int64(gomaxprocs))
+	}
+
+	triggerError := goalGrowthRatio - c.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-c.triggerRatio)
+
+	// Finally, we adjust the trigger for next time by this error,
+	// damped by the proportional gain.
+	c.triggerRatio += triggerGain * triggerError
+	if c.triggerRatio < 0 {
+		// This can happen if the mutator is allocating very
+		// quickly or the GC is scanning very slowly.
+		c.triggerRatio = 0
+	} else if c.triggerRatio > goalGrowthRatio*0.95 {
+		// Ensure there's always a little margin so that the
+		// mutator assist ratio isn't infinity.
+		c.triggerRatio = goalGrowthRatio * 0.95
+	}
+
+	if debug.gcpacertrace > 0 {
+		// Print controller state in terms of the design
+		// document.
+		H_m_prev := memstats.heap_marked
+		H_T := memstats.gc_trigger
+		h_a := actualGrowthRatio
+		H_a := memstats.heap_live
+		h_g := goalGrowthRatio
+		H_g := int64(float64(H_m_prev) * (1 + h_g))
+		u_a := utilization
+		u_g := gcGoalUtilization
+		W_a := c.scanWork
+		print("pacer: H_m_prev=", H_m_prev,
+			" h_t=", h_t, " H_T=", H_T,
+			" h_a=", h_a, " H_a=", H_a,
+			" h_g=", h_g, " H_g=", H_g,
+			" u_a=", u_a, " u_g=", u_g,
+			" W_a=", W_a,
+			" goalΔ=", goalGrowthRatio-h_t,
+			" actualΔ=", h_a-h_t,
+			" u_a/u_g=", u_a/u_g,
+			"\n")
+	}
+}
+
+// enlistWorker encourages another dedicated mark worker to start on
+// another P if there are spare worker slots. It is used by putfull
+// when more work is made available.
+//
+//go:nowritebarrier
+func (c *gcControllerState) enlistWorker() {
+	// If there are idle Ps, wake one so it will run an idle worker.
+	// NOTE: This is suspected of causing deadlocks. See golang.org/issue/19112.
+	//
+	//	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
+	//		wakep()
+	//		return
+	//	}
+
+	// There are no idle Ps. If we need more dedicated workers,
+	// try to preempt a running P so it will switch to a worker.
+	if c.dedicatedMarkWorkersNeeded <= 0 {
+		return
+	}
+	// Pick a random other P to preempt.
+	if gomaxprocs <= 1 {
+		return
+	}
+	gp := getg()
+	if gp == nil || gp.m == nil || gp.m.p == 0 {
+		return
+	}
+	myID := gp.m.p.ptr().id
+	for tries := 0; tries < 5; tries++ {
+		id := int32(fastrand() % uint32(gomaxprocs-1))
+		if id >= myID {
+			id++
+		}
+		p := allp[id]
+		if p.status != _Prunning {
+			continue
+		}
+		if preemptone(p) {
+			return
+		}
+	}
+}
+
+// findRunnableGCWorker returns the background mark worker for _p_ if it
+// should be run. This must only be called when gcBlackenEnabled != 0.
+func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
+	if gcBlackenEnabled == 0 {
+		throw("gcControllerState.findRunnable: blackening not enabled")
+	}
+	if _p_.gcBgMarkWorker == 0 {
+		// The mark worker associated with this P is blocked
+		// performing a mark transition. We can't run it
+		// because it may be on some other run or wait queue.
+		return nil
+	}
+
+	if !gcMarkWorkAvailable(_p_) {
+		// No work to be done right now. This can happen at
+		// the end of the mark phase when there are still
+		// assists tapering off. Don't bother running a worker
+		// now because it'll just return immediately.
+		return nil
+	}
+
+	decIfPositive := func(ptr *int64) bool {
+		if *ptr > 0 {
+			if atomic.Xaddint64(ptr, -1) >= 0 {
+				return true
+			}
+			// We lost a race
+			atomic.Xaddint64(ptr, +1)
+		}
+		return false
+	}
+
+	if decIfPositive(&c.dedicatedMarkWorkersNeeded) {
+		// This P is now dedicated to marking until the end of
+		// the concurrent mark phase.
+		_p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode
+		// TODO(austin): This P isn't going to run anything
+		// else for a while, so kick everything out of its run
+		// queue.
+	} else {
+		if !decIfPositive(&c.fractionalMarkWorkersNeeded) {
+			// No more workers are needed right now.
+			return nil
+		}
+
+		// This P has picked the token for the fractional worker.
+		// Is the GC currently under or at the utilization goal?
+		// If so, do more work.
+		//
+		// We used to check whether doing one time slice of work
+		// would remain under the utilization goal, but that has the
+		// effect of delaying work until the mutator has run for
+		// enough time slices to pay for the work. During those time
+		// slices, write barriers are enabled, so the mutator is running slower.
+		// Now instead we do the work whenever we're under or at the
+		// utilization goal and pay for it by letting the mutator run later.
+		// This doesn't change the overall utilization averages, but it
+		// front loads the GC work so that the GC finishes earlier and
+		// write barriers can be turned off sooner, effectively giving
+		// the mutator a faster machine.
+		//
+		// The old, slower behavior can be restored by setting
+		//	gcForcePreemptNS = forcePreemptNS.
+		const gcForcePreemptNS = 0
+
+		// TODO(austin): We could fast path this and basically
+		// eliminate contention on c.fractionalMarkWorkersNeeded by
+		// precomputing the minimum time at which it's worth
+		// next scheduling the fractional worker. Then Ps
+		// don't have to fight in the window where we've
+		// passed that deadline and no one has started the
+		// worker yet.
+		//
+		// TODO(austin): Shorter preemption interval for mark
+		// worker to improve fairness and give this
+		// finer-grained control over schedule?
+		now := nanotime() - gcController.markStartTime
+		then := now + gcForcePreemptNS
+		timeUsed := c.fractionalMarkTime + gcForcePreemptNS
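+		// For example, with fractionalUtilizationGoal = 0.5, if 100ms
+		// have elapsed since markStartTime and the fractional worker
+		// has accumulated 40ms, then 40/100 = 0.4 <= 0.5 and the
+		// worker runs again.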
+		if then > 0 && float64(timeUsed)/float64(then) > c.fractionalUtilizationGoal {
+			// Nope, we'd overshoot the utilization goal
+			atomic.Xaddint64(&c.fractionalMarkWorkersNeeded, +1)
+			return nil
+		}
+		_p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode
+	}
+
+	// Run the background mark worker
+	gp := _p_.gcBgMarkWorker.ptr()
+	casgstatus(gp, _Gwaiting, _Grunnable)
+	if trace.enabled {
+		traceGoUnpark(gp, 0)
+	}
+	return gp
+}
+
+// gcGoalUtilization is the goal CPU utilization for background
+// marking as a fraction of GOMAXPROCS.
+const gcGoalUtilization = 0.25
+
+// gcCreditSlack is the amount of scan work credit that can
+// accumulate locally before updating gcController.scanWork and,
+// optionally, gcController.bgScanCredit. Lower values give a more
+// accurate assist ratio and make it more likely that assists will
+// successfully steal background credit. Higher values reduce memory
+// contention.
+const gcCreditSlack = 2000
+
+// gcAssistTimeSlack is the nanoseconds of mutator assist time that
+// can accumulate on a P before updating gcController.assistTime.
+const gcAssistTimeSlack = 5000
+
+// gcOverAssistWork determines how many extra units of scan work a GC
+// assist does when an assist happens. This amortizes the cost of an
+// assist by pre-paying for this many bytes of future allocations.
+const gcOverAssistWork = 64 << 10
+
+var work struct {
+	full  uint64                   // lock-free list of full blocks workbuf
+	empty uint64                   // lock-free list of empty blocks workbuf
+	pad0  [sys.CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
+
+	// bytesMarked is the number of bytes marked this cycle. This
+	// includes bytes blackened in scanned objects, noscan objects
+	// that go straight to black, and permagrey objects scanned by
+	// markroot during the concurrent scan phase. This is updated
+	// atomically during the cycle. Updates may be batched
+	// arbitrarily, since the value is only read at the end of the
+	// cycle.
+	//
+	// Because of benign races during marking, this number may not
+	// be the exact number of marked bytes, but it should be very
+	// close.
+	//
+	// Put this field here because it needs 64-bit atomic access
+	// (and thus 8-byte alignment even on 32-bit architectures).
+	bytesMarked uint64
+
+	markrootNext uint32 // next markroot job
+	markrootJobs uint32 // number of markroot jobs
+
+	nproc   uint32
+	tstart  int64
+	nwait   uint32
+	ndone   uint32
+	alldone note
+
+	// helperDrainBlock indicates that GC mark termination helpers
+	// should pass gcDrainBlock to gcDrain to block in the
+	// getfull() barrier. Otherwise, they should pass gcDrainNoBlock.
+	//
+	// TODO: This is a temporary fallback to support
+	// debug.gcrescanstacks > 0 and to work around some known
+	// races. Remove this when we remove the debug option and fix
+	// the races.
+	helperDrainBlock bool
+
+	// Number of roots of various root types. Set by gcMarkRootPrepare.
+	nFlushCacheRoots                                  int
+	nDataRoots, nSpanRoots, nStackRoots, nRescanRoots int
+
+	// markrootDone indicates that roots have been marked at least
+	// once during the current GC cycle. This is checked by root
+	// marking operations that have to happen only during the
+	// first root marking pass, whether that's during the
+	// concurrent mark phase in current GC or mark termination in
+	// STW GC.
+	markrootDone bool
+
+	// Each type of GC state transition is protected by a lock.
+	// Since multiple threads can simultaneously detect the state
+	// transition condition, any thread that detects a transition
+	// condition must acquire the appropriate transition lock,
+	// re-check the transition condition and return if it no
+	// longer holds or perform the transition if it does.
+	// Likewise, any transition must invalidate the transition
+	// condition before releasing the lock. This ensures that each
+	// transition is performed by exactly one thread and threads
+	// that need the transition to happen block until it has
+	// happened.
+	//
+	// startSema protects the transition from "off" to mark or
+	// mark termination.
+	startSema uint32
+	// markDoneSema protects transitions from mark 1 to mark 2 and
+	// from mark 2 to mark termination.
+	markDoneSema uint32
+
+	// Background mark completion signaling
+	bgMarkReady note   // signal background mark worker has started
+	bgMarkDone  uint32 // cas to 1 when at a background mark completion point
+
+	// mode is the concurrency mode of the current GC cycle.
+	mode gcMode
+
+	// totaltime is the CPU nanoseconds spent in GC since the
+	// program started if debug.gctrace > 0.
+	totaltime int64
+
+	// initialHeapLive is the value of memstats.heap_live at the
+	// beginning of this GC cycle.
+	initialHeapLive uint64
+
+	// assistQueue is a queue of assists that are blocked because
+	// there was neither enough credit to steal nor enough work to
+	// do.
+	assistQueue struct {
+		lock       mutex
+		head, tail guintptr
+	}
+
+	// rescan is a list of G's that need to be rescanned during
+	// mark termination. A G adds itself to this list when it
+	// first invalidates its stack scan.
+	rescan struct {
+		lock mutex
+		list []guintptr
+	}
+
+	// Timing/utilization stats for this cycle.
+	stwprocs, maxprocs                 int32
+	tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
+
+	pauseNS    int64 // total STW time this cycle
+	pauseStart int64 // nanotime() of last STW
+
+	// debug.gctrace heap sizes for this cycle.
+	heap0, heap1, heap2, heapGoal uint64
+}
+
+// GC runs a garbage collection and blocks the caller until the
+// garbage collection is complete. It may also block the entire
+// program.
+func GC() {
+	gcStart(gcForceBlockMode, false)
+}
+
+// gcMode indicates how concurrent a GC cycle should be.
+type gcMode int
+
+const (
+	gcBackgroundMode gcMode = iota // concurrent GC and sweep
+	gcForceMode                    // stop-the-world GC now, concurrent sweep
+	gcForceBlockMode               // stop-the-world GC now and STW sweep (forced by user)
+)
+
+// gcShouldStart returns true if the exit condition for the _GCoff
+// phase has been met. The exit condition should be tested when
+// allocating.
+//
+// If forceTrigger is true, it ignores the current heap size, but
+// checks all other conditions. In general this should be false.
+func gcShouldStart(forceTrigger bool) bool {
+	return gcphase == _GCoff && (forceTrigger || memstats.heap_live >= memstats.gc_trigger) && memstats.enablegc && panicking == 0 && gcpercent >= 0
+}
+
+// gcStart transitions the GC from _GCoff to _GCmark (if mode ==
+// gcBackgroundMode) or _GCmarktermination (if mode !=
+// gcBackgroundMode) by performing sweep termination and GC
+// initialization.
+//
+// This may return without performing this transition in some cases,
+// such as when called on a system stack or with locks held.
+func gcStart(mode gcMode, forceTrigger bool) {
+	// Since this is called from malloc and malloc is called in
+	// the guts of a number of libraries that might be holding
+	// locks, don't attempt to start GC in non-preemptible or
+	// potentially unstable situations.
+	mp := acquirem()
+	if gp := getg(); gp == mp.g0 || mp.locks > 1 || mp.preemptoff != "" {
+		releasem(mp)
+		return
+	}
+	releasem(mp)
+	mp = nil
+
+	// Pick up the remaining unswept/not being swept spans concurrently
+	//
+	// This shouldn't happen if we're being invoked in background
+	// mode since proportional sweep should have just finished
+	// sweeping everything, but rounding errors, etc, may leave a
+	// few spans unswept. In forced mode, this is necessary since
+	// GC can be forced at any point in the sweeping cycle.
+	//
+	// We check the transition condition continuously here in case
+	// this G gets delayed into the next GC cycle.
+	for (mode != gcBackgroundMode || gcShouldStart(forceTrigger)) && gosweepone() != ^uintptr(0) {
+		sweep.nbgsweep++
+	}
+
+	// Perform GC initialization and the sweep termination
+	// transition.
+	//
+	// If this is a forced GC, don't acquire the transition lock
+	// or re-check the transition condition because we
+	// specifically *don't* want to share the transition with
+	// another thread.
+	useStartSema := mode == gcBackgroundMode
+	if useStartSema {
+		semacquire(&work.startSema, 0)
+		// Re-check transition condition under transition lock.
+		if !gcShouldStart(forceTrigger) {
+			semrelease(&work.startSema)
+			return
+		}
+	}
+
+	// For stats, check if this GC was forced by the user.
+	forced := mode != gcBackgroundMode
+
+	// In gcstoptheworld debug mode, upgrade the mode accordingly.
+	// We do this after re-checking the transition condition so
+	// that multiple goroutines that detect the heap trigger don't
+	// start multiple STW GCs.
+	if mode == gcBackgroundMode {
+		if debug.gcstoptheworld == 1 {
+			mode = gcForceMode
+		} else if debug.gcstoptheworld == 2 {
+			mode = gcForceBlockMode
+		}
+	}
+
+	// Ok, we're doing it!  Stop everybody else
+	semacquire(&worldsema, 0)
+
+	if trace.enabled {
+		traceGCStart()
+	}
+
+	if mode == gcBackgroundMode {
+		gcBgMarkStartWorkers()
+	}
+
+	gcResetMarkState()
+
+	now := nanotime()
+	work.stwprocs, work.maxprocs = gcprocs(), gomaxprocs
+	work.tSweepTerm = now
+	work.heap0 = memstats.heap_live
+	work.pauseNS = 0
+	work.mode = mode
+
+	work.pauseStart = now
+	systemstack(stopTheWorldWithSema)
+	// Finish sweep before we start concurrent scan.
+	systemstack(func() {
+		finishsweep_m()
+	})
+	// clearpools before we start the GC. If we wait, the memory will not be
+	// reclaimed until the next GC cycle.
+	clearpools()
+
+	if mode == gcBackgroundMode { // Do as much work concurrently as possible
+		gcController.startCycle()
+		work.heapGoal = memstats.next_gc
+
+		// Enter concurrent mark phase and enable
+		// write barriers.
+		//
+		// Because the world is stopped, all Ps will
+		// observe that write barriers are enabled by
+		// the time we start the world and begin
+		// scanning.
+		//
+		// It's necessary to enable write barriers
+		// during the scan phase for several reasons:
+		//
+		// They must be enabled for writes to higher
+		// stack frames before we scan stacks and
+		// install stack barriers because this is how
+		// we track writes to inactive stack frames.
+		// (Alternatively, we could not install stack
+		// barriers over frame boundaries with
+		// up-pointers).
+		//
+		// They must be enabled before assists are
+		// enabled because they must be enabled before
+		// any non-leaf heap objects are marked. Since
+		// allocations are blocked until assists can
+		// happen, we want to enable assists as early as
+		// possible.
+		setGCPhase(_GCmark)
+
+		gcBgMarkPrepare() // Must happen before assist enable.
+		gcMarkRootPrepare()
+
+		// Mark all active tinyalloc blocks. Since we're
+		// allocating from these, they need to be black like
+		// other allocations. The alternative is to blacken
+		// the tiny block on every allocation from it, which
+		// would slow down the tiny allocator.
+		gcMarkTinyAllocs()
+
+		// At this point all Ps have enabled the write
+		// barrier, thus maintaining the no white to
+		// black invariant. Enable mutator assists to
+		// put back-pressure on fast allocating
+		// mutators.
+		atomic.Store(&gcBlackenEnabled, 1)
+
+		// Assists and workers can start the moment we start
+		// the world.
+		gcController.markStartTime = now
+
+		// Concurrent mark.
+		systemstack(startTheWorldWithSema)
+		now = nanotime()
+		work.pauseNS += now - work.pauseStart
+		work.tMark = now
+	} else {
+		t := nanotime()
+		work.tMark, work.tMarkTerm = t, t
+		work.heapGoal = work.heap0
+
+		if forced {
+			memstats.numforcedgc++
+		}
+
+		// Perform mark termination. This will restart the world.
+		gcMarkTermination()
+	}
+
+	if useStartSema {
+		semrelease(&work.startSema)
+	}
+}
+
+// gcMarkDone transitions the GC from mark 1 to mark 2 and from mark 2
+// to mark termination.
+//
+// This should be called when all mark work has been drained. In mark
+// 1, this includes all root marking jobs, global work buffers, and
+// active work buffers in assists and background workers; however,
+// work may still be cached in per-P work buffers. In mark 2, per-P
+// caches are disabled.
+//
+// The calling context must be preemptible.
+//
+// Note that it is explicitly okay to have write barriers in this
+// function because completion of concurrent mark is best-effort
+// anyway. Any work created by write barriers here will be cleaned up
+// by mark termination.
+func gcMarkDone() {
+top:
+	semacquire(&work.markDoneSema, 0)
+
+	// Re-check transition condition under transition lock.
+	if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) {
+		semrelease(&work.markDoneSema)
+		return
+	}
+
+	// Disallow starting new workers so that any remaining workers
+	// in the current mark phase will drain out.
+	//
+	// TODO(austin): Should dedicated workers keep an eye on this
+	// and exit gcDrain promptly?
+	atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, -0xffffffff)
+	atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, -0xffffffff)
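+	// Subtracting this large constant drives both counts far below
+	// zero, so decIfPositive in findRunnableGCWorker always fails;
+	// the same constant is added back once mark 2 workers may start.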
+
+	if !gcBlackenPromptly {
+		// Transition from mark 1 to mark 2.
+		//
+		// The global work list is empty, but there can still be work
+		// sitting in the per-P work caches.
+		// Flush and disable work caches.
+
+		// Disallow caching workbufs and indicate that we're in mark 2.
+		gcBlackenPromptly = true
+
+		// Prevent completion of mark 2 until we've flushed
+		// cached workbufs.
+		atomic.Xadd(&work.nwait, -1)
+
+		// GC is set up for mark 2. Let Gs blocked on the
+		// transition lock go while we flush caches.
+		semrelease(&work.markDoneSema)
+
+		systemstack(func() {
+			// Flush all currently cached workbufs and
+			// ensure all Ps see gcBlackenPromptly. This
+			// also blocks until any remaining mark 1
+			// workers have exited their loop so we can
+			// start new mark 2 workers.
+			forEachP(func(_p_ *p) {
+				_p_.gcw.dispose()
+			})
+		})
+
+		// Check that roots are marked. We should be able to
+		// do this before the forEachP, but based on issue
+		// #16083 there may be a (harmless) race where we can
+		// enter mark 2 while some workers are still scanning
+		// stacks. The forEachP ensures these scans are done.
+		//
+		// TODO(austin): Figure out the race and fix this
+		// properly.
+		gcMarkRootCheck()
+
+		// Now we can start up mark 2 workers.
+		atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 0xffffffff)
+		atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 0xffffffff)
+
+		incnwait := atomic.Xadd(&work.nwait, +1)
+		if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
+			// This loop will make progress because
+			// gcBlackenPromptly is now true, so it won't
+			// take this same "if" branch.
+			goto top
+		}
+	} else {
+		// Transition to mark termination.
+		now := nanotime()
+		work.tMarkTerm = now
+		work.pauseStart = now
+		getg().m.preemptoff = "gcing"
+		systemstack(stopTheWorldWithSema)
+		// The gcphase is _GCmark, it will transition to _GCmarktermination
+		// below. The important thing is that the wb remains active until
+		// all marking is complete. This includes writes made by the GC.
+
+		// Record that one root marking pass has completed.
+		work.markrootDone = true
+
+		// Disable assists and background workers. We must do
+		// this before waking blocked assists.
+		atomic.Store(&gcBlackenEnabled, 0)
+
+		// Wake all blocked assists. These will run when we
+		// start the world again.
+		gcWakeAllAssists()
+
+		// Likewise, release the transition lock. Blocked
+		// workers and assists will run when we start the
+		// world again.
+		semrelease(&work.markDoneSema)
+
+		// endCycle depends on all gcWork cache stats being
+		// flushed. This is ensured by mark 2.
+		gcController.endCycle()
+
+		// Perform mark termination. This will restart the world.
+		gcMarkTermination()
+	}
+}
+
+func gcMarkTermination() {
+	// World is stopped.
+	// Start marktermination which includes enabling the write barrier.
+	atomic.Store(&gcBlackenEnabled, 0)
+	gcBlackenPromptly = false
+	setGCPhase(_GCmarktermination)
+
+	work.heap1 = memstats.heap_live
+	startTime := nanotime()
+
+	mp := acquirem()
+	mp.preemptoff = "gcing"
+	_g_ := getg()
+	_g_.m.traceback = 2
+	gp := _g_.m.curg
+	casgstatus(gp, _Grunning, _Gwaiting)
+	gp.waitreason = "garbage collection"
+
+	// Run gc on the g0 stack. We do this so that the g stack
+	// we're currently running on will no longer change. Cuts
+	// the root set down a bit (g0 stacks are not scanned, and
+	// we don't need to scan gc's internal state).  We also
+	// need to switch to g0 so we can shrink the stack.
+	systemstack(func() {
+		gcMark(startTime)
+		// Must return immediately.
+		// The outer function's stack may have moved
+		// during gcMark (it shrinks stacks, including the
+		// outer function's stack), so we must not refer
+		// to any of its variables. Return back to the
+		// non-system stack to pick up the new addresses
+		// before continuing.
+	})
+
+	systemstack(func() {
+		work.heap2 = work.bytesMarked
+		if debug.gccheckmark > 0 {
+			// Run a full stop-the-world mark using checkmark bits,
+			// to check that we didn't forget to mark anything during
+			// the concurrent mark process.
+			gcResetMarkState()
+			initCheckmarks()
+			gcMark(startTime)
+			clearCheckmarks()
+		}
+
+		// marking is complete so we can turn the write barrier off
+		setGCPhase(_GCoff)
+		gcSweep(work.mode)
+
+		if debug.gctrace > 1 {
+			startTime = nanotime()
+			// The g stacks have been scanned so
+			// they have gcscanvalid==true and gcworkdone==true.
+			// Reset these so that all stacks will be rescanned.
+			gcResetMarkState()
+			finishsweep_m()
+
+			// Still in STW but gcphase is _GCoff, reset to _GCmarktermination
+			// At this point all objects will be found during the gcMark which
+			// does a complete STW mark and object scan.
+			setGCPhase(_GCmarktermination)
+			gcMark(startTime)
+			setGCPhase(_GCoff) // marking is done, turn off wb.
+			gcSweep(work.mode)
+		}
+	})
+
+	_g_.m.traceback = 0
+	casgstatus(gp, _Gwaiting, _Grunning)
+
+	if trace.enabled {
+		traceGCDone()
+	}
+
+	// all done
+	mp.preemptoff = ""
+
+	if gcphase != _GCoff {
+		throw("gc done but gcphase != _GCoff")
+	}
+
+	// Update timing memstats
+	now, unixNow := nanotime(), unixnanotime()
+	work.pauseNS += now - work.pauseStart
+	work.tEnd = now
+	atomic.Store64(&memstats.last_gc, uint64(unixNow)) // must be Unix time to make sense to user
+	memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
+	memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
+	memstats.pause_total_ns += uint64(work.pauseNS)
+
+	// Update work.totaltime.
+	sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
+	// We report idle marking time below, but omit it from the
+	// overall utilization here since it's "free".
+	markCpu := gcController.assistTime + gcController.dedicatedMarkTime + gcController.fractionalMarkTime
+	markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm)
+	cycleCpu := sweepTermCpu + markCpu + markTermCpu
+	work.totaltime += cycleCpu
+
+	// Compute overall GC CPU utilization.
+	totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
+	memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu)
+
+	memstats.numgc++
+
+	// Reset sweep state.
+	sweep.nbgsweep = 0
+	sweep.npausesweep = 0
+
+	systemstack(startTheWorldWithSema)
+
+	// Update heap profile stats if gcSweep didn't do it. This is
+	// relatively expensive, so we don't want to do it while the
+	// world is stopped, but it needs to happen ASAP after
+	// starting the world to prevent too many allocations from the
+	// next cycle leaking in. It must happen before releasing
+	// worldsema since there are applications that do a
+	// runtime.GC() to update the heap profile and then
+	// immediately collect the profile.
+	if _ConcurrentSweep && work.mode != gcForceBlockMode {
+		mProf_GC()
+	}
+
+	// Print gctrace before dropping worldsema. As soon as we drop
+	// worldsema another cycle could start and smash the stats
+	// we're trying to print.
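+	// For reference, the line printed below looks roughly like this
+	// (illustrative numbers):
+	//   gc 7 @3.068s 1%: 0.13+1.9+0.18 ms clock, 0.52+0.26/1.6/3.2+0.72 ms cpu, 4->4->2 MB, 5 MB goal, 4 P
+	// where the clock part is sweep termination, concurrent mark, and mark
+	// termination, and the cpu part is sweep termination, assist,
+	// dedicated+fractional mark, idle mark, and mark termination.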
+	if debug.gctrace > 0 {
+		util := int(memstats.gc_cpu_fraction * 100)
+
+		var sbuf [24]byte
+		printlock()
+		print("gc ", memstats.numgc,
+			" @", string(itoaDiv(sbuf[:], uint64(work.tSweepTerm-runtimeInitTime)/1e6, 3)), "s ",
+			util, "%: ")
+		prev := work.tSweepTerm
+		for i, ns := range []int64{work.tMark, work.tMarkTerm, work.tEnd} {
+			if i != 0 {
+				print("+")
+			}
+			print(string(fmtNSAsMS(sbuf[:], uint64(ns-prev))))
+			prev = ns
+		}
+		print(" ms clock, ")
+		for i, ns := range []int64{sweepTermCpu, gcController.assistTime, gcController.dedicatedMarkTime + gcController.fractionalMarkTime, gcController.idleMarkTime, markTermCpu} {
+			if i == 2 || i == 3 {
+				// Separate mark time components with /.
+				print("/")
+			} else if i != 0 {
+				print("+")
+			}
+			print(string(fmtNSAsMS(sbuf[:], uint64(ns))))
+		}
+		print(" ms cpu, ",
+			work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
+			work.heapGoal>>20, " MB goal, ",
+			work.maxprocs, " P")
+		if work.mode != gcBackgroundMode {
+			print(" (forced)")
+		}
+		print("\n")
+		printunlock()
+	}
+
+	semrelease(&worldsema)
+	// Careful: another GC cycle may start now.
+
+	releasem(mp)
+	mp = nil
+
+	// now that gc is done, kick off finalizer thread if needed
+	if !concurrentSweep {
+		// give the queued finalizers, if any, a chance to run
+		Gosched()
+	}
+}
+
+// gcBgMarkStartWorkers prepares background mark worker goroutines.
+// These goroutines will not run until the mark phase, but they must
+// be started while the work is not stopped and from a regular G
+// stack. The caller must hold worldsema.
+func gcBgMarkStartWorkers() {
+	// Background marking is performed by per-P G's. Ensure that
+	// each P has a background GC G.
+	for _, p := range &allp {
+		if p == nil || p.status == _Pdead {
+			break
+		}
+		if p.gcBgMarkWorker == 0 {
+			go gcBgMarkWorker(p)
+			notetsleepg(&work.bgMarkReady, -1)
+			noteclear(&work.bgMarkReady)
+		}
+	}
+}
+
+// gcBgMarkPrepare sets up state for background marking.
+// Mutator assists must not yet be enabled.
+func gcBgMarkPrepare() {
+	// Background marking will stop when the work queues are empty
+	// and there are no more workers (note that, since this is
+	// concurrent, this may be a transient state, but mark
+	// termination will clean it up). Between background workers
+	// and assists, we don't really know how many workers there
+	// will be, so we pretend to have an arbitrarily large number
+	// of workers, almost all of which are "waiting". While a
+	// worker is working it decrements nwait. If nproc == nwait,
+	// there are no workers.
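+	// For example, a single running worker leaves nwait at nproc-1; only
+	// when the last worker increments nwait back up to nproc and finds no
+	// work available does it call gcMarkDone.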
+	work.nproc = ^uint32(0)
+	work.nwait = ^uint32(0)
+}
+
+func gcBgMarkWorker(_p_ *p) {
+	gp := getg()
+
+	type parkInfo struct {
+		m      muintptr // Release this m on park.
+		attach puintptr // If non-nil, attach to this p on park.
+	}
+	// We pass park to a gopark unlock function, so it can't be on
+	// the stack (see gopark). Prevent deadlock from recursively
+	// starting GC by disabling preemption.
+	gp.m.preemptoff = "GC worker init"
+	park := new(parkInfo)
+	gp.m.preemptoff = ""
+
+	park.m.set(acquirem())
+	park.attach.set(_p_)
+	// Inform gcBgMarkStartWorkers that this worker is ready.
+	// After this point, the background mark worker is scheduled
+	// cooperatively by gcController.findRunnable. Hence, it must
+	// never be preempted, as this would put it into _Grunnable
+	// and put it on a run queue. Instead, when the preempt flag
+	// is set, this puts itself into _Gwaiting to be woken up by
+	// gcController.findRunnable at the appropriate time.
+	notewakeup(&work.bgMarkReady)
+
+	for {
+		// Go to sleep until woken by gcController.findRunnable.
+		// We can't releasem yet since even the call to gopark
+		// may be preempted.
+		gopark(func(g *g, parkp unsafe.Pointer) bool {
+			park := (*parkInfo)(parkp)
+
+			// The worker G is no longer running, so it's
+			// now safe to allow preemption.
+			releasem(park.m.ptr())
+
+			// If the worker isn't attached to its P,
+			// attach now. During initialization and after
+			// a phase change, the worker may have been
+			// running on a different P. As soon as we
+			// attach, the owner P may schedule the
+			// worker, so this must be done after the G is
+			// stopped.
+			if park.attach != 0 {
+				p := park.attach.ptr()
+				park.attach.set(nil)
+				// cas the worker because we may be
+				// racing with a new worker starting
+				// on this P.
+				if !p.gcBgMarkWorker.cas(0, guintptr(unsafe.Pointer(g))) {
+					// The P got a new worker.
+					// Exit this worker.
+					return false
+				}
+			}
+			return true
+		}, unsafe.Pointer(park), "GC worker (idle)", traceEvGoBlock, 0)
+
+		// Loop until the P dies and disassociates this
+		// worker (the P may later be reused, in which case
+		// it will get a new worker) or we failed to associate.
+		if _p_.gcBgMarkWorker.ptr() != gp {
+			break
+		}
+
+		// Disable preemption so we can use the gcw. If the
+		// scheduler wants to preempt us, we'll stop draining,
+		// dispose the gcw, and then preempt.
+		park.m.set(acquirem())
+
+		if gcBlackenEnabled == 0 {
+			throw("gcBgMarkWorker: blackening not enabled")
+		}
+
+		startTime := nanotime()
+
+		decnwait := atomic.Xadd(&work.nwait, -1)
+		if decnwait == work.nproc {
+			println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
+			throw("work.nwait was > work.nproc")
+		}
+
+		systemstack(func() {
+			// Mark our goroutine preemptible so its stack
+			// can be scanned. This lets two mark workers
+			// scan each other (otherwise, they would
+			// deadlock). We must not modify anything on
+			// the G stack. However, stack shrinking is
+			// disabled for mark workers, so it is safe to
+			// read from the G stack.
+			casgstatus(gp, _Grunning, _Gwaiting)
+			switch _p_.gcMarkWorkerMode {
+			default:
+				throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
+			case gcMarkWorkerDedicatedMode:
+				gcDrain(&_p_.gcw, gcDrainNoBlock|gcDrainFlushBgCredit)
+			case gcMarkWorkerFractionalMode:
+				gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
+			case gcMarkWorkerIdleMode:
+				gcDrain(&_p_.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
+			}
+			casgstatus(gp, _Gwaiting, _Grunning)
+		})
+
+		// If we are nearing the end of mark, dispose
+		// of the cache promptly. We must do this
+		// before signaling that we're no longer
+		// working so that other workers can't observe
+		// no workers and no work while we have this
+		// cached, and before we compute done.
+		if gcBlackenPromptly {
+			_p_.gcw.dispose()
+		}
+
+		// Account for time.
+		duration := nanotime() - startTime
+		switch _p_.gcMarkWorkerMode {
+		case gcMarkWorkerDedicatedMode:
+			atomic.Xaddint64(&gcController.dedicatedMarkTime, duration)
+			atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
+		case gcMarkWorkerFractionalMode:
+			atomic.Xaddint64(&gcController.fractionalMarkTime, duration)
+			atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 1)
+		case gcMarkWorkerIdleMode:
+			atomic.Xaddint64(&gcController.idleMarkTime, duration)
+		}
+
+		// Was this the last worker and did we run out
+		// of work?
+		incnwait := atomic.Xadd(&work.nwait, +1)
+		if incnwait > work.nproc {
+			println("runtime: p.gcMarkWorkerMode=", _p_.gcMarkWorkerMode,
+				"work.nwait=", incnwait, "work.nproc=", work.nproc)
+			throw("work.nwait > work.nproc")
+		}
+
+		// If this worker reached a background mark completion
+		// point, signal the main GC goroutine.
+		if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
+			// Make this G preemptible and disassociate it
+			// as the worker for this P so
+			// findRunnableGCWorker doesn't try to
+			// schedule it.
+			_p_.gcBgMarkWorker.set(nil)
+			releasem(park.m.ptr())
+
+			gcMarkDone()
+
+			// Disable preemption and prepare to reattach
+			// to the P.
+			//
+			// We may be running on a different P at this
+			// point, so we can't reattach until this G is
+			// parked.
+			park.m.set(acquirem())
+			park.attach.set(_p_)
+		}
+	}
+}
+
+// gcMarkWorkAvailable returns true if executing a mark worker
+// on p is potentially useful. p may be nil, in which case it only
+// checks the global sources of work.
+func gcMarkWorkAvailable(p *p) bool {
+	if p != nil && !p.gcw.empty() {
+		return true
+	}
+	if atomic.Load64(&work.full) != 0 {
+		return true // global work available
+	}
+	if work.markrootNext < work.markrootJobs {
+		return true // root scan work available
+	}
+	return false
+}
+
+// gcMark runs the mark (or, for concurrent GC, mark termination).
+// All gcWork caches must be empty.
+// STW is in effect at this point.
+//TODO go:nowritebarrier
+func gcMark(start_time int64) {
+	if debug.allocfreetrace > 0 {
+		tracegc()
+	}
+
+	if gcphase != _GCmarktermination {
+		throw("in gcMark expecting to see gcphase as _GCmarktermination")
+	}
+	work.tstart = start_time
+
+	// Queue root marking jobs.
+	gcMarkRootPrepare()
+
+	work.nwait = 0
+	work.ndone = 0
+	work.nproc = uint32(gcprocs())
+
+	if debug.gcrescanstacks == 0 && work.full == 0 && work.nDataRoots+work.nSpanRoots+work.nStackRoots+work.nRescanRoots == 0 {
+		// There's no work on the work queue and no root jobs
+		// that can produce work, so don't bother entering the
+		// getfull() barrier.
+		//
+		// With the hybrid barrier enabled, this will be the
+		// situation the vast majority of the time after
+		// concurrent mark. However, we still need a fallback
+		// for STW GC and because there are some known races
+		// that occasionally leave work around for mark
+		// termination.
+		//
+		// We're still hedging our bets here: if we do
+		// accidentally produce some work, we'll still process
+		// it, just not necessarily in parallel.
+		//
+		// TODO(austin): When we eliminate
+		// debug.gcrescanstacks: fix the races, and remove
+		// work draining from mark termination so we don't
+		// need the fallback path.
+		work.helperDrainBlock = false
+	} else {
+		work.helperDrainBlock = true
+	}
+
+	if trace.enabled {
+		traceGCScanStart()
+	}
+
+	if work.nproc > 1 {
+		noteclear(&work.alldone)
+		helpgc(int32(work.nproc))
+	}
+
+	gchelperstart()
+
+	gcw := &getg().m.p.ptr().gcw
+	if work.helperDrainBlock {
+		gcDrain(gcw, gcDrainBlock)
+	} else {
+		gcDrain(gcw, gcDrainNoBlock)
+	}
+	gcw.dispose()
+
+	if debug.gccheckmark > 0 {
+		// This is expensive when there's a large number of
+		// Gs, so only do it if checkmark is also enabled.
+		gcMarkRootCheck()
+	}
+	if work.full != 0 {
+		throw("work.full != 0")
+	}
+
+	if work.nproc > 1 {
+		notesleep(&work.alldone)
+	}
+
+	// Record that at least one root marking pass has completed.
+	work.markrootDone = true
+
+	// Double-check that all gcWork caches are empty. This should
+	// be ensured by mark 2 before we enter mark termination.
+	for i := 0; i < int(gomaxprocs); i++ {
+		gcw := &allp[i].gcw
+		if !gcw.empty() {
+			throw("P has cached GC work at end of mark termination")
+		}
+		if gcw.scanWork != 0 || gcw.bytesMarked != 0 {
+			throw("P has unflushed stats at end of mark termination")
+		}
+	}
+
+	if trace.enabled {
+		traceGCScanDone()
+	}
+
+	cachestats()
+
+	// Update the marked heap stat.
+	memstats.heap_marked = work.bytesMarked
+
+	// Trigger the next GC cycle when the allocated heap has grown
+	// by triggerRatio over the marked heap size. Assume that
+	// we're in steady state, so the marked heap size is the
+	// same now as it was at the beginning of the GC cycle.
+	memstats.gc_trigger = uint64(float64(memstats.heap_marked) * (1 + gcController.triggerRatio))
+	if memstats.gc_trigger < heapminimum {
+		memstats.gc_trigger = heapminimum
+	}
+	if int64(memstats.gc_trigger) < 0 {
+		print("next_gc=", memstats.next_gc, " bytesMarked=", work.bytesMarked, " heap_live=", memstats.heap_live, " initialHeapLive=", work.initialHeapLive, "\n")
+		throw("gc_trigger underflow")
+	}
+
+	// Update other GC heap size stats. This must happen after
+	// cachestats (which flushes local statistics to these) and
+	// flushallmcaches (which modifies heap_live).
+	memstats.heap_live = work.bytesMarked
+	memstats.heap_scan = uint64(gcController.scanWork)
+
+	minTrigger := memstats.heap_live + sweepMinHeapDistance*uint64(gcpercent)/100
+	if memstats.gc_trigger < minTrigger {
+		// The allocated heap is already past the trigger.
+		// This can happen if the triggerRatio is very low and
+		// the marked heap is less than the live heap size.
+		//
+		// Concurrent sweep happens in the heap growth from
+		// heap_live to gc_trigger, so bump gc_trigger up to ensure
+		// that concurrent sweep has some heap growth in which
+		// to perform sweeping before we start the next GC
+		// cycle.
+		memstats.gc_trigger = minTrigger
+	}
+
+	// The next GC cycle should finish before the allocated heap
+	// has grown by GOGC/100.
+	memstats.next_gc = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
+	if gcpercent < 0 {
+		memstats.next_gc = ^uint64(0)
+	}
+	if memstats.next_gc < memstats.gc_trigger {
+		memstats.next_gc = memstats.gc_trigger
+	}
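+	// Illustrative numbers: with heap_marked = 100 MB, triggerRatio = 0.7
+	// and GOGC = 100, gc_trigger works out to roughly 170 MB and next_gc to
+	// 200 MB, so the next cycle starts after ~70 MB of new allocation and
+	// aims to finish before the heap doubles. triggerRatio is adapted each
+	// cycle by gcController, so these are not fixed values.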
+
+	if trace.enabled {
+		traceHeapAlloc()
+		traceNextGC()
+	}
+}
+
+func gcSweep(mode gcMode) {
+	if gcphase != _GCoff {
+		throw("gcSweep being done but phase is not GCoff")
+	}
+
+	lock(&mheap_.lock)
+	mheap_.sweepgen += 2
+	mheap_.sweepdone = 0
+	if mheap_.sweepSpans[mheap_.sweepgen/2%2].index != 0 {
+		// We should have drained this list during the last
+		// sweep phase. We certainly need to start this phase
+		// with an empty swept list.
+		throw("non-empty swept list")
+	}
+	unlock(&mheap_.lock)
+
+	if !_ConcurrentSweep || mode == gcForceBlockMode {
+		// Special case synchronous sweep.
+		// Record that no proportional sweeping has to happen.
+		lock(&mheap_.lock)
+		mheap_.sweepPagesPerByte = 0
+		mheap_.pagesSwept = 0
+		unlock(&mheap_.lock)
+		// Sweep all spans eagerly.
+		for sweepone() != ^uintptr(0) {
+			sweep.npausesweep++
+		}
+		// Do an additional mProf_GC, because all 'free' events are now real as well.
+		mProf_GC()
+		mProf_GC()
+		return
+	}
+
+	// Concurrent sweep needs to sweep all of the in-use pages by
+	// the time the allocated heap reaches the GC trigger. Compute
+	// the ratio of in-use pages to sweep per byte allocated.
+	heapDistance := int64(memstats.gc_trigger) - int64(memstats.heap_live)
+	// Add a little margin so rounding errors and concurrent
+	// sweep are less likely to leave pages unswept when GC starts.
+	heapDistance -= 1024 * 1024
+	if heapDistance < _PageSize {
+		// Avoid setting the sweep ratio extremely high
+		heapDistance = _PageSize
+	}
+	lock(&mheap_.lock)
+	mheap_.sweepPagesPerByte = float64(mheap_.pagesInUse) / float64(heapDistance)
+	mheap_.pagesSwept = 0
+	mheap_.spanBytesAlloc = 0
+	unlock(&mheap_.lock)
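+	// For example (illustrative numbers), 8192 in-use pages and a 32 MB
+	// heapDistance give a ratio of about 0.00024, i.e. the proportional
+	// sweeper must retire roughly one page per 4 KB the mutator allocates.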
+
+	// Background sweep.
+	lock(&sweep.lock)
+	if sweep.parked {
+		sweep.parked = false
+		ready(sweep.g, 0, true)
+	}
+	unlock(&sweep.lock)
+}
+
+// gcResetMarkState resets global state prior to marking (concurrent
+// or STW) and resets the stack scan state of all Gs.
+//
+// This is safe to do without the world stopped because any Gs created
+// during or after this will start out in the reset state.
+func gcResetMarkState() {
+	// This may be called during a concurrent phase, so make sure
+	// allgs doesn't change.
+	if !(gcphase == _GCoff || gcphase == _GCmarktermination) {
+		// Accessing gcRescan is unsafe.
+		throw("bad GC phase")
+	}
+	lock(&allglock)
+	for _, gp := range allgs {
+		gp.gcscandone = false  // set to true in gcphasework
+		gp.gcscanvalid = false // stack has not been scanned
+		gp.gcRescan = -1
+		gp.gcAssistBytes = 0
+	}
+	unlock(&allglock)
+
+	// Clear rescan list.
+	work.rescan.list = work.rescan.list[:0]
+
+	work.bytesMarked = 0
+	work.initialHeapLive = memstats.heap_live
+	work.markrootDone = false
+}
+
+// Hooks for other packages
+
+var poolcleanup func()
+
+//go:linkname sync_runtime_registerPoolCleanup sync.runtime_registerPoolCleanup
+func sync_runtime_registerPoolCleanup(f func()) {
+	poolcleanup = f
+}
+
+func clearpools() {
+	// clear sync.Pools
+	if poolcleanup != nil {
+		poolcleanup()
+	}
+
+	// Clear central sudog cache.
+	// Leave per-P caches alone, they have strictly bounded size.
+	// Disconnect cached list before dropping it on the floor,
+	// so that a dangling ref to one entry does not pin all of them.
+	lock(&sched.sudoglock)
+	var sg, sgnext *sudog
+	for sg = sched.sudogcache; sg != nil; sg = sgnext {
+		sgnext = sg.next
+		sg.next = nil
+	}
+	sched.sudogcache = nil
+	unlock(&sched.sudoglock)
+
+	// Clear central defer pools.
+	// Leave per-P pools alone, they have strictly bounded size.
+	lock(&sched.deferlock)
+	// disconnect cached list before dropping it on the floor,
+	// so that a dangling ref to one entry does not pin all of them.
+	var d, dlink *_defer
+	for d = sched.deferpool; d != nil; d = dlink {
+		dlink = d.link
+		d.link = nil
+	}
+	sched.deferpool = nil
+	unlock(&sched.deferlock)
+}
+
+// Timing
+
+//go:nowritebarrier
+func gchelper() {
+	_g_ := getg()
+	_g_.m.traceback = 2
+	gchelperstart()
+
+	if trace.enabled {
+		traceGCScanStart()
+	}
+
+	// Parallel mark over GC roots and heap
+	if gcphase == _GCmarktermination {
+		gcw := &_g_.m.p.ptr().gcw
+		if work.helperDrainBlock {
+			gcDrain(gcw, gcDrainBlock) // blocks in getfull
+		} else {
+			gcDrain(gcw, gcDrainNoBlock)
+		}
+		gcw.dispose()
+	}
+
+	if trace.enabled {
+		traceGCScanDone()
+	}
+
+	nproc := work.nproc // work.nproc can change right after we increment work.ndone
+	if atomic.Xadd(&work.ndone, +1) == nproc-1 {
+		notewakeup(&work.alldone)
+	}
+	_g_.m.traceback = 0
+}
+
+func gchelperstart() {
+	_g_ := getg()
+
+	if _g_.m.helpgc < 0 || _g_.m.helpgc >= _MaxGcproc {
+		throw("gchelperstart: bad m->helpgc")
+	}
+	// For gccgo we run gchelper on the normal g stack.
+	// if _g_ != _g_.m.g0 {
+	// 	throw("gchelper not running on g0 stack")
+	// }
+}
+
+// itoaDiv formats val/(10**dec) into buf.
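+// For example, itoaDiv(buf, 12345, 3) yields "12.345" and
+// itoaDiv(buf, 120, 0) yields "120".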
+func itoaDiv(buf []byte, val uint64, dec int) []byte {
+	i := len(buf) - 1
+	idec := i - dec
+	for val >= 10 || i >= idec {
+		buf[i] = byte(val%10 + '0')
+		i--
+		if i == idec {
+			buf[i] = '.'
+			i--
+		}
+		val /= 10
+	}
+	buf[i] = byte(val + '0')
+	return buf[i:]
+}
+
+// fmtNSAsMS nicely formats ns nanoseconds as milliseconds.
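+// For example, fmtNSAsMS(buf, 2500000) yields "2.5" (2.5ms) and
+// fmtNSAsMS(buf, 45000) yields "0.045".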
+func fmtNSAsMS(buf []byte, ns uint64) []byte {
+	if ns >= 10e6 {
+		// Format as whole milliseconds.
+		return itoaDiv(buf, ns/1e6, 0)
+	}
+	// Format two digits of precision, with at most three decimal places.
+	x := ns / 1e3
+	if x == 0 {
+		buf[0] = '0'
+		return buf[:1]
+	}
+	dec := 3
+	for x >= 100 {
+		x /= 10
+		dec--
+	}
+	return itoaDiv(buf, x, dec)
+}
diff --git a/libgo/go/runtime/mgc_gccgo.go b/libgo/go/runtime/mgc_gccgo.go
new file mode 100644
index 0000000..c1fa154
--- /dev/null
+++ b/libgo/go/runtime/mgc_gccgo.go
@@ -0,0 +1,87 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// gccgo-specific support for GC.
+
+package runtime
+
+import "unsafe"
+
+// gcRoot is a single GC root: a variable plus a ptrmask.
+type gcRoot struct {
+	decl    unsafe.Pointer // Pointer to variable.
+	size    uintptr        // Size of variable.
+	ptrdata uintptr        // Number of bytes covered by gcdata (the pointer-containing prefix).
+	gcdata  *uint8         // Pointer mask.
+}
+
+// gcRootList is the set of GC roots for a package.
+// The next field is used to put this all into a linked list.
+// count gives the real length of the array.
+type gcRootList struct {
+	next  *gcRootList
+	count int
+	roots [1 << 26]gcRoot
+}
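+
+// The [1 << 26] bound on roots exists only so that roots[i] is addressable
+// for any realistic count; the compiler presumably emits just count entries
+// per package, so the array must never be iterated to its declared length.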
+
+// roots is the list of GC roots for the program.
+// The compiler keeps this variable itself off the list.
+var gcRoots *gcRootList
+
+// registerGCRoots is called by compiler-generated code.
+//go:linkname registerGCRoots runtime.registerGCRoots
+
+// registerGCRoots is called by init functions to register the GC
+// roots for a package.  The init functions are run sequentially at
+// the start of the program, so no locking is needed.
+func registerGCRoots(r *gcRootList) {
+	r.next = gcRoots
+	gcRoots = r
+}
+
+// checkPreempt is called when the preempt field in the running G is true.
+// It preempts the goroutine if it is safe to do so.
+// If preemptscan is true, this scans the stack for the garbage collector
+// and carries on.
+func checkPreempt() {
+	gp := getg()
+	if !gp.preempt || gp != gp.m.curg || gp.m.locks != 0 || gp.m.mallocing != 0 || gp.m.preemptoff != "" {
+		return
+	}
+
+	// Synchronize with scang.
+	gp.scanningself = true
+	casgstatus(gp, _Grunning, _Gwaiting)
+	if gp.preemptscan {
+		for !castogscanstatus(gp, _Gwaiting, _Gscanwaiting) {
+			// Likely to be racing with the GC as
+			// it sees a _Gwaiting and does the
+			// stack scan. If so, gcworkdone will
+			// be set and gcphasework will simply
+			// return.
+		}
+		if !gp.gcscandone {
+			mp := acquirem()
+			gcw := &gp.m.p.ptr().gcw
+			scanstack(gp, gcw)
+			if gcBlackenPromptly {
+				gcw.dispose()
+			}
+			releasem(mp)
+			gp.gcscandone = true
+		}
+		gp.preemptscan = false
+		gp.preempt = false
+		casfrom_Gscanstatus(gp, _Gscanwaiting, _Gwaiting)
+		// This clears gcscanvalid.
+		casgstatus(gp, _Gwaiting, _Grunning)
+		gp.scanningself = false
+		return
+	}
+
+	// Act as if the goroutine had called runtime.Gosched.
+	casgstatus(gp, _Gwaiting, _Grunning)
+	gp.scanningself = false
+	mcall(gopreempt_m)
+}
diff --git a/libgo/go/runtime/mgcmark.go b/libgo/go/runtime/mgcmark.go
new file mode 100644
index 0000000..93252ba
--- /dev/null
+++ b/libgo/go/runtime/mgcmark.go
@@ -0,0 +1,1374 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Garbage collector: marking and scanning
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+const (
+	fixedRootFinalizers = iota
+	fixedRootFreeGStacks
+	fixedRootCount
+
+	// rootBlockBytes is the number of bytes to scan per data or
+	// BSS root.
+	rootBlockBytes = 256 << 10
+
+	// rootBlockSpans is the number of spans to scan per span
+	// root.
+	rootBlockSpans = 8 * 1024 // 64MB worth of spans
+
+	// maxObletBytes is the maximum bytes of an object to scan at
+	// once. Larger objects will be split up into "oblets" of at
+	// most this size. Since we can scan 1–2 MB/ms, 128 KB bounds
+	// scan preemption at ~100 µs.
+	//
+	// This must be > _MaxSmallSize so that the object base is the
+	// span base.
+	maxObletBytes = 128 << 10
+
+	// idleCheckThreshold specifies how many units of work to do
+	// between run queue checks in an idle worker. Assuming a scan
+	// rate of 1 MB/ms, this is ~100 µs. Lower values have higher
+	// overhead in the scan loop (the scheduler check may perform
+	// a syscall, so its overhead is nontrivial). Higher values
+	// make the system less responsive to incoming work.
+	idleCheckThreshold = 100000
+)
+
+// gcMarkRootPrepare queues root scanning jobs (stacks, globals, and
+// some miscellany) and initializes scanning-related state.
+//
+// The caller must have called gcCopySpans().
+//
+// The world must be stopped.
+//
+//go:nowritebarrier
+func gcMarkRootPrepare() {
+	if gcphase == _GCmarktermination {
+		work.nFlushCacheRoots = int(gomaxprocs)
+	} else {
+		work.nFlushCacheRoots = 0
+	}
+
+	work.nDataRoots = 0
+
+	// Only scan globals once per cycle; preferably concurrently.
+	if !work.markrootDone {
+		roots := gcRoots
+		for roots != nil {
+			work.nDataRoots++
+			roots = roots.next
+		}
+	}
+
+	if !work.markrootDone {
+		// On the first markroot, we need to scan span roots.
+		// In concurrent GC, this happens during concurrent
+		// mark and we depend on addfinalizer to ensure the
+		// above invariants for objects that get finalizers
+		// after concurrent mark. In STW GC, this will happen
+		// during mark termination.
+		//
+		// We're only interested in scanning the in-use spans,
+		// which will all be swept at this point. More spans
+		// may be added to this list during concurrent GC, but
+		// we only care about spans that were allocated before
+		// this mark phase.
+		work.nSpanRoots = mheap_.sweepSpans[mheap_.sweepgen/2%2].numBlocks()
+
+		// On the first markroot, we need to scan all Gs. Gs
+		// may be created after this point, but it's okay that
+		// we ignore them because they begin life without any
+		// roots, so there's nothing to scan, and any roots
+		// they create during the concurrent phase will be
+		// scanned during mark termination. During mark
+		// termination, allglen isn't changing, so we'll scan
+		// all Gs.
+		work.nStackRoots = int(atomic.Loaduintptr(&allglen))
+		work.nRescanRoots = 0
+	} else {
+		// We've already scanned span roots and kept the scan
+		// up-to-date during concurrent mark.
+		work.nSpanRoots = 0
+
+		// On the second pass of markroot, we're just scanning
+		// dirty stacks. It's safe to access rescan since the
+		// world is stopped.
+		work.nStackRoots = 0
+		work.nRescanRoots = len(work.rescan.list)
+	}
+
+	work.markrootNext = 0
+	work.markrootJobs = uint32(fixedRootCount + work.nFlushCacheRoots + work.nDataRoots + work.nSpanRoots + work.nStackRoots + work.nRescanRoots)
+}
+
+// gcMarkRootCheck checks that all roots have been scanned. It is
+// purely for debugging.
+func gcMarkRootCheck() {
+	if work.markrootNext < work.markrootJobs {
+		print(work.markrootNext, " of ", work.markrootJobs, " markroot jobs done\n")
+		throw("left over markroot jobs")
+	}
+
+	lock(&allglock)
+	// Check that stacks have been scanned.
+	var gp *g
+	if gcphase == _GCmarktermination && debug.gcrescanstacks > 0 {
+		for i := 0; i < len(allgs); i++ {
+			gp = allgs[i]
+			if !(gp.gcscandone && gp.gcscanvalid) && readgstatus(gp) != _Gdead {
+				goto fail
+			}
+		}
+	} else {
+		for i := 0; i < work.nStackRoots; i++ {
+			gp = allgs[i]
+			if !gp.gcscandone {
+				goto fail
+			}
+		}
+	}
+	unlock(&allglock)
+	return
+
+fail:
+	println("gp", gp, "goid", gp.goid,
+		"status", readgstatus(gp),
+		"gcscandone", gp.gcscandone,
+		"gcscanvalid", gp.gcscanvalid)
+	unlock(&allglock) // Avoid self-deadlock with traceback.
+	throw("scan missed a g")
+}
+
+// ptrmask for an allocation containing a single pointer.
+var oneptrmask = [...]uint8{1}
+
+// markroot scans the i'th root.
+//
+// Preemption must be disabled (because this uses a gcWork).
+//
+// nowritebarrier is only advisory here.
+//
+//go:nowritebarrier
+func markroot(gcw *gcWork, i uint32) {
+	// TODO(austin): This is a bit ridiculous. Compute and store
+	// the bases in gcMarkRootPrepare instead of the counts.
+	baseFlushCache := uint32(fixedRootCount)
+	baseData := baseFlushCache + uint32(work.nFlushCacheRoots)
+	baseSpans := baseData + uint32(work.nDataRoots)
+	baseStacks := baseSpans + uint32(work.nSpanRoots)
+	baseRescan := baseStacks + uint32(work.nStackRoots)
+	end := baseRescan + uint32(work.nRescanRoots)
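+	// The job index space is laid out in this order: the fixed roots
+	// (indices 0 and 1), mcache flushes, data roots (one per registered
+	// root list), span shards, stacks, and finally rescan entries; counts
+	// that are zero for this pass simply contribute no indices.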
+
+	// Note: if you add a case here, please also update heapdump.go:dumproots.
+	switch {
+	case baseFlushCache <= i && i < baseData:
+		flushmcache(int(i - baseFlushCache))
+
+	case baseData <= i && i < baseSpans:
+		roots := gcRoots
+		c := baseData
+		for roots != nil {
+			if i == c {
+				markrootBlock(roots, gcw)
+				break
+			}
+			roots = roots.next
+			c++
+		}
+
+	case i == fixedRootFinalizers:
+		for fb := allfin; fb != nil; fb = fb.alllink {
+			cnt := uintptr(atomic.Load(&fb.cnt))
+			scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), cnt*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], gcw)
+		}
+
+	case i == fixedRootFreeGStacks:
+		// FIXME: We don't do this for gccgo.
+
+	case baseSpans <= i && i < baseStacks:
+		// mark MSpan.specials
+		markrootSpans(gcw, int(i-baseSpans))
+
+	default:
+		// the rest is scanning goroutine stacks
+		var gp *g
+		if baseStacks <= i && i < baseRescan {
+			gp = allgs[i-baseStacks]
+		} else if baseRescan <= i && i < end {
+			gp = work.rescan.list[i-baseRescan].ptr()
+			if gp.gcRescan != int32(i-baseRescan) {
+				// Looking for issue #17099.
+				println("runtime: gp", gp, "found at rescan index", i-baseRescan, "but should be at", gp.gcRescan)
+				throw("bad g rescan index")
+			}
+		} else {
+			throw("markroot: bad index")
+		}
+
+		// Remember when we first observed the G blocked; this is
+		// needed only for traceback output.
+		status := readgstatus(gp) // We are not in a scan state
+		if (status == _Gwaiting || status == _Gsyscall) && gp.waitsince == 0 {
+			gp.waitsince = work.tstart
+		}
+
+		// scang must be done on the system stack in case
+		// we're trying to scan our own stack.
+		systemstack(func() {
+			// If this is a self-scan, put the user G in
+			// _Gwaiting to prevent self-deadlock. It may
+			// already be in _Gwaiting if this is a mark
+			// worker or we're in mark termination.
+			userG := getg().m.curg
+			selfScan := gp == userG && readgstatus(userG) == _Grunning
+			if selfScan {
+				casgstatus(userG, _Grunning, _Gwaiting)
+				userG.waitreason = "garbage collection scan"
+			}
+
+			// TODO: scang blocks until gp's stack has
+			// been scanned, which may take a while for
+			// running goroutines. Consider doing this in
+			// two phases where the first is non-blocking:
+			// we scan the stacks we can and ask running
+			// goroutines to scan themselves; and the
+			// second blocks.
+			scang(gp, gcw)
+
+			if selfScan {
+				casgstatus(userG, _Gwaiting, _Grunning)
+			}
+		})
+	}
+}
+
+// markrootBlock scans one element of the list of GC roots.
+//
+//go:nowritebarrier
+func markrootBlock(roots *gcRootList, gcw *gcWork) {
+	for i := 0; i < roots.count; i++ {
+		r := &roots.roots[i]
+		scanblock(uintptr(r.decl), r.ptrdata, r.gcdata, gcw)
+	}
+}
+
+// markrootSpans marks roots for one shard of work.spans.
+//
+//go:nowritebarrier
+func markrootSpans(gcw *gcWork, shard int) {
+	// Objects with finalizers have two GC-related invariants:
+	//
+	// 1) Everything reachable from the object must be marked.
+	// This ensures that when we pass the object to its finalizer,
+	// everything the finalizer can reach will be retained.
+	//
+	// 2) Finalizer specials (which are not in the garbage
+	// collected heap) are roots. In practice, this means the fn
+	// field must be scanned.
+	//
+	// TODO(austin): There are several ideas for making this more
+	// efficient in issue #11485.
+
+	if work.markrootDone {
+		throw("markrootSpans during second markroot")
+	}
+
+	sg := mheap_.sweepgen
+	spans := mheap_.sweepSpans[mheap_.sweepgen/2%2].block(shard)
+	// Note that work.spans may not include spans that were
+	// allocated between entering the scan phase and now. This is
+	// okay because any objects with finalizers in those spans
+	// must have been allocated and given finalizers after we
+	// entered the scan phase, so addfinalizer will have ensured
+	// the above invariants for them.
+	for _, s := range spans {
+		if s.state != mSpanInUse {
+			continue
+		}
+		if !useCheckmark && s.sweepgen != sg {
+			// sweepgen was updated (+2) during non-checkmark GC pass
+			print("sweep ", s.sweepgen, " ", sg, "\n")
+			throw("gc: unswept span")
+		}
+
+		// Speculatively check if there are any specials
+		// without acquiring the span lock. This may race with
+		// adding the first special to a span, but in that
+		// case addfinalizer will observe that the GC is
+		// active (which is globally synchronized) and ensure
+		// the above invariants. We may also ensure the
+		// invariants, but it's okay to scan an object twice.
+		if s.specials == nil {
+			continue
+		}
+
+		// Lock the specials to prevent a special from being
+		// removed from the list while we're traversing it.
+		lock(&s.speciallock)
+
+		for sp := s.specials; sp != nil; sp = sp.next {
+			if sp.kind != _KindSpecialFinalizer {
+				continue
+			}
+			// don't mark finalized object, but scan it so we
+			// retain everything it points to.
+			spf := (*specialfinalizer)(unsafe.Pointer(sp))
+			// A finalizer can be set for an inner byte of an
+			// object; find the object's beginning.
+			p := s.base() + uintptr(spf.special.offset)/s.elemsize*s.elemsize
+
+			// Mark everything that can be reached from
+			// the object (but *not* the object itself or
+			// we'll never collect it).
+			scanobject(p, gcw)
+
+			// The special itself is a root.
+			scanblock(uintptr(unsafe.Pointer(&spf.fn)), sys.PtrSize, &oneptrmask[0], gcw)
+		}
+
+		unlock(&s.speciallock)
+	}
+}
+
+// gcAssistAlloc performs GC work to make gp's assist debt positive.
+// gp must be the calling user goroutine.
+//
+// This must be called with preemption enabled.
+func gcAssistAlloc(gp *g) {
+	// Don't assist in non-preemptible contexts. These are
+	// generally fragile and won't allow the assist to block.
+	if getg() == gp.m.g0 {
+		return
+	}
+	if mp := getg().m; mp.locks > 0 || mp.preemptoff != "" {
+		return
+	}
+
+retry:
+	// Compute the amount of scan work we need to do to make the
+	// balance positive. When the required amount of work is low,
+	// we over-assist to build up credit for future allocations
+	// and amortize the cost of assisting.
+	debtBytes := -gp.gcAssistBytes
+	scanWork := int64(gcController.assistWorkPerByte * float64(debtBytes))
+	if scanWork < gcOverAssistWork {
+		scanWork = gcOverAssistWork
+		debtBytes = int64(gcController.assistBytesPerWork * float64(scanWork))
+	}
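+	// Illustrative arithmetic (assumed numbers): with assistWorkPerByte of
+	// 0.5, a goroutine 8 KB in debt owes 4096 units of scan work; since
+	// that is normally below gcOverAssistWork, scanWork is rounded up to
+	// gcOverAssistWork and debtBytes grows to match, banking credit so the
+	// goroutine does not have to assist again almost immediately.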
+
+	// Steal as much credit as we can from the background GC's
+	// scan credit. This is racy and may drop the background
+	// credit below 0 if two mutators steal at the same time. This
+	// will just cause steals to fail until credit is accumulated
+	// again, so in the long run it doesn't really matter, but we
+	// do have to handle the negative credit case.
+	bgScanCredit := atomic.Loadint64(&gcController.bgScanCredit)
+	stolen := int64(0)
+	if bgScanCredit > 0 {
+		if bgScanCredit < scanWork {
+			stolen = bgScanCredit
+			gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(stolen))
+		} else {
+			stolen = scanWork
+			gp.gcAssistBytes += debtBytes
+		}
+		atomic.Xaddint64(&gcController.bgScanCredit, -stolen)
+
+		scanWork -= stolen
+
+		if scanWork == 0 {
+			// We were able to steal all of the credit we
+			// needed.
+			return
+		}
+	}
+
+	// Perform assist work
+	systemstack(func() {
+		gcAssistAlloc1(gp, scanWork)
+		// The user stack may have moved, so this can't touch
+		// anything on it until it returns from systemstack.
+	})
+
+	completed := gp.param != nil
+	gp.param = nil
+	if completed {
+		gcMarkDone()
+	}
+
+	if gp.gcAssistBytes < 0 {
+		// We were unable to steal enough credit or perform
+		// enough work to pay off the assist debt. We need to
+		// do one of these before letting the mutator allocate
+		// more to prevent over-allocation.
+		//
+		// If this is because we were preempted, reschedule
+		// and try some more.
+		if gp.preempt {
+			Gosched()
+			goto retry
+		}
+
+		// Add this G to an assist queue and park. When the GC
+		// has more background credit, it will satisfy queued
+		// assists before flushing to the global credit pool.
+		//
+		// Note that this does *not* get woken up when more
+		// work is added to the work list. The theory is that
+		// there wasn't enough work to do anyway, so we might
+		// as well let background marking take care of the
+		// work that is available.
+		if !gcParkAssist() {
+			goto retry
+		}
+
+		// At this point either background GC has satisfied
+		// this G's assist debt, or the GC cycle is over.
+	}
+}
+
+// gcAssistAlloc1 is the part of gcAssistAlloc that runs on the system
+// stack. This is a separate function to make it easier to see that
+// we're not capturing anything from the user stack, since the user
+// stack may move while we're in this function.
+//
+// gcAssistAlloc1 indicates whether this assist completed the mark
+// phase by setting gp.param to non-nil. This can't be communicated on
+// the stack since it may move.
+//
+//go:systemstack
+func gcAssistAlloc1(gp *g, scanWork int64) {
+	// Clear the flag indicating that this assist completed the
+	// mark phase.
+	gp.param = nil
+
+	if atomic.Load(&gcBlackenEnabled) == 0 {
+		// The gcBlackenEnabled check in malloc races with the
+		// store that clears it but an atomic check in every malloc
+		// would be a performance hit.
+		// Instead we recheck it here on the non-preemptible system
+		// stack to determine if we should perform an assist.
+
+		// GC is done, so ignore any remaining debt.
+		gp.gcAssistBytes = 0
+		return
+	}
+	// Track time spent in this assist. Since we're on the
+	// system stack, this is non-preemptible, so we can
+	// just measure start and end time.
+	startTime := nanotime()
+
+	decnwait := atomic.Xadd(&work.nwait, -1)
+	if decnwait == work.nproc {
+		println("runtime: work.nwait =", decnwait, "work.nproc=", work.nproc)
+		throw("nwait > work.nprocs")
+	}
+
+	// gcDrainN requires the caller to be preemptible.
+	casgstatus(gp, _Grunning, _Gwaiting)
+	gp.waitreason = "GC assist marking"
+
+	// Drain our own cached work first in the hope that it
+	// will be more cache friendly.
+	gcw := &getg().m.p.ptr().gcw
+	workDone := gcDrainN(gcw, scanWork)
+	// If we are near the end of the mark phase
+	// dispose of the gcw.
+	if gcBlackenPromptly {
+		gcw.dispose()
+	}
+
+	casgstatus(gp, _Gwaiting, _Grunning)
+
+	// Record that we did this much scan work.
+	//
+	// Back out the number of bytes of assist credit that
+	// this scan work counts for. The "1+" is a poor man's
+	// round-up, to ensure this adds credit even if
+	// assistBytesPerWork is very low.
+	gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(workDone))
+
+	// If this is the last worker and we ran out of work,
+	// signal a completion point.
+	incnwait := atomic.Xadd(&work.nwait, +1)
+	if incnwait > work.nproc {
+		println("runtime: work.nwait=", incnwait,
+			"work.nproc=", work.nproc,
+			"gcBlackenPromptly=", gcBlackenPromptly)
+		throw("work.nwait > work.nproc")
+	}
+
+	if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
+		// This has reached a background completion point. Set
+		// gp.param to a non-nil value to indicate this. It
+		// doesn't matter what we set it to (it just has to be
+		// a valid pointer).
+		gp.param = unsafe.Pointer(gp)
+	}
+	duration := nanotime() - startTime
+	_p_ := gp.m.p.ptr()
+	_p_.gcAssistTime += duration
+	if _p_.gcAssistTime > gcAssistTimeSlack {
+		atomic.Xaddint64(&gcController.assistTime, _p_.gcAssistTime)
+		_p_.gcAssistTime = 0
+	}
+}
+
+// gcWakeAllAssists wakes all currently blocked assists. This is used
+// at the end of a GC cycle. gcBlackenEnabled must be false to prevent
+// new assists from going to sleep after this point.
+func gcWakeAllAssists() {
+	lock(&work.assistQueue.lock)
+	injectglist(work.assistQueue.head.ptr())
+	work.assistQueue.head.set(nil)
+	work.assistQueue.tail.set(nil)
+	unlock(&work.assistQueue.lock)
+}
+
+// gcParkAssist puts the current goroutine on the assist queue and parks.
+//
+// gcParkAssist returns whether the assist is now satisfied. If it
+// returns false, the caller must retry the assist.
+//
+//go:nowritebarrier
+func gcParkAssist() bool {
+	lock(&work.assistQueue.lock)
+	// If the GC cycle finished while we were getting the lock,
+	// exit the assist. The cycle can't finish while we hold the
+	// lock.
+	if atomic.Load(&gcBlackenEnabled) == 0 {
+		unlock(&work.assistQueue.lock)
+		return true
+	}
+
+	gp := getg()
+	oldHead, oldTail := work.assistQueue.head, work.assistQueue.tail
+	if oldHead == 0 {
+		work.assistQueue.head.set(gp)
+	} else {
+		oldTail.ptr().schedlink.set(gp)
+	}
+	work.assistQueue.tail.set(gp)
+	gp.schedlink.set(nil)
+
+	// Recheck for background credit now that this G is in
+	// the queue, but can still back out. This avoids a
+	// race in case background marking has flushed more
+	// credit since we checked above.
+	if atomic.Loadint64(&gcController.bgScanCredit) > 0 {
+		work.assistQueue.head = oldHead
+		work.assistQueue.tail = oldTail
+		if oldTail != 0 {
+			oldTail.ptr().schedlink.set(nil)
+		}
+		unlock(&work.assistQueue.lock)
+		return false
+	}
+	// Park.
+	goparkunlock(&work.assistQueue.lock, "GC assist wait", traceEvGoBlockGC, 2)
+	return true
+}
+
+// gcFlushBgCredit flushes scanWork units of background scan work
+// credit. This first satisfies blocked assists on the
+// work.assistQueue and then flushes any remaining credit to
+// gcController.bgScanCredit.
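+//
+// As an illustration (made-up numbers): with 1 MB of byte credit and two
+// parked assists 600 KB and 800 KB in debt, the first is fully satisfied
+// and readied while the second is credited the remaining 424 KB and stays
+// parked until more credit is flushed.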
+//
+// Write barriers are disallowed because this is used by gcDrain after
+// it has ensured that all work is drained and this must preserve that
+// condition.
+//
+//go:nowritebarrierrec
+func gcFlushBgCredit(scanWork int64) {
+	if work.assistQueue.head == 0 {
+		// Fast path; there are no blocked assists. There's a
+		// small window here where an assist may add itself to
+		// the blocked queue and park. If that happens, we'll
+		// just get it on the next flush.
+		atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
+		return
+	}
+
+	scanBytes := int64(float64(scanWork) * gcController.assistBytesPerWork)
+
+	lock(&work.assistQueue.lock)
+	gp := work.assistQueue.head.ptr()
+	for gp != nil && scanBytes > 0 {
+		// Note that gp.gcAssistBytes is negative because gp
+		// is in debt. Think carefully about the signs below.
+		if scanBytes+gp.gcAssistBytes >= 0 {
+			// Satisfy this entire assist debt.
+			scanBytes += gp.gcAssistBytes
+			gp.gcAssistBytes = 0
+			xgp := gp
+			gp = gp.schedlink.ptr()
+			// It's important that we *not* put xgp in
+			// runnext. Otherwise, it's possible for user
+			// code to exploit the GC worker's high
+			// scheduler priority to get itself always run
+			// before other goroutines and always in the
+			// fresh quantum started by GC.
+			ready(xgp, 0, false)
+		} else {
+			// Partially satisfy this assist.
+			gp.gcAssistBytes += scanBytes
+			scanBytes = 0
+			// As a heuristic, we move this assist to the
+			// back of the queue so that large assists
+			// can't clog up the assist queue and
+			// substantially delay small assists.
+			xgp := gp
+			gp = gp.schedlink.ptr()
+			if gp == nil {
+				// gp is the only assist in the queue.
+				gp = xgp
+			} else {
+				xgp.schedlink = 0
+				work.assistQueue.tail.ptr().schedlink.set(xgp)
+				work.assistQueue.tail.set(xgp)
+			}
+			break
+		}
+	}
+	work.assistQueue.head.set(gp)
+	if gp == nil {
+		work.assistQueue.tail.set(nil)
+	}
+
+	if scanBytes > 0 {
+		// Convert from scan bytes back to work.
+		scanWork = int64(float64(scanBytes) * gcController.assistWorkPerByte)
+		atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
+	}
+	unlock(&work.assistQueue.lock)
+}
+
+// We use a C function to find the stack.
+func doscanstack(*g, *gcWork)
+
+// scanstack scans gp's stack, greying all pointers found on the stack.
+//
+// During mark phase, it also installs stack barriers while traversing
+// gp's stack. During mark termination, it stops scanning when it
+// reaches an unhit stack barrier.
+//
+// scanstack is marked go:systemstack because it must not be preempted
+// while using a workbuf.
+//
+//go:nowritebarrier
+//go:systemstack
+func scanstack(gp *g, gcw *gcWork) {
+	if gp.gcscanvalid {
+		return
+	}
+
+	if readgstatus(gp)&_Gscan == 0 {
+		print("runtime:scanstack: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", hex(readgstatus(gp)), "\n")
+		throw("scanstack - bad status")
+	}
+
+	switch readgstatus(gp) &^ _Gscan {
+	default:
+		print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
+		throw("mark - bad status")
+	case _Gdead:
+		return
+	case _Grunning:
+		// ok for gccgo, though not for gc.
+	case _Grunnable, _Gsyscall, _Gwaiting:
+		// ok
+	}
+
+	mp := gp.m
+	if mp != nil && mp.helpgc != 0 {
+		throw("can't scan gchelper stack")
+	}
+
+	// Scan the stack.
+	doscanstack(gp, gcw)
+
+	// Conservatively scan the saved register values.
+	scanstackblock(uintptr(unsafe.Pointer(&gp.gcregs)), unsafe.Sizeof(gp.gcregs), gcw)
+	scanstackblock(uintptr(unsafe.Pointer(&gp.context)), unsafe.Sizeof(gp.context), gcw)
+
+	if gcphase == _GCmark {
+		// gp may have added itself to the rescan list between
+		// when GC started and now. It's clean now, so remove
+		// it. This isn't safe during mark termination because
+		// mark termination is consuming this list, but it's
+		// also not necessary.
+		dequeueRescan(gp)
+	}
+	gp.gcscanvalid = true
+}
+
+// queueRescan adds gp to the stack rescan list and clears
+// gp.gcscanvalid. The caller must own gp and ensure that gp isn't
+// already on the rescan list.
+func queueRescan(gp *g) {
+	if debug.gcrescanstacks == 0 {
+		// Clear gcscanvalid to keep assertions happy.
+		//
+		// TODO: Remove gcscanvalid entirely when we remove
+		// stack rescanning.
+		gp.gcscanvalid = false
+		return
+	}
+
+	if gcphase == _GCoff {
+		gp.gcscanvalid = false
+		return
+	}
+	if gp.gcRescan != -1 {
+		throw("g already on rescan list")
+	}
+
+	lock(&work.rescan.lock)
+	gp.gcscanvalid = false
+
+	// Recheck gcphase under the lock in case there was a phase change.
+	if gcphase == _GCoff {
+		unlock(&work.rescan.lock)
+		return
+	}
+	if len(work.rescan.list) == cap(work.rescan.list) {
+		throw("rescan list overflow")
+	}
+	n := len(work.rescan.list)
+	gp.gcRescan = int32(n)
+	work.rescan.list = work.rescan.list[:n+1]
+	work.rescan.list[n].set(gp)
+	unlock(&work.rescan.lock)
+}
+
+// dequeueRescan removes gp from the stack rescan list, if gp is on
+// the rescan list. The caller must own gp.
+func dequeueRescan(gp *g) {
+	if debug.gcrescanstacks == 0 {
+		return
+	}
+
+	if gp.gcRescan == -1 {
+		return
+	}
+	if gcphase == _GCoff {
+		gp.gcRescan = -1
+		return
+	}
+
+	lock(&work.rescan.lock)
+	if work.rescan.list[gp.gcRescan].ptr() != gp {
+		throw("bad dequeueRescan")
+	}
+	// Careful: gp may itself be the last G on the list.
+	last := work.rescan.list[len(work.rescan.list)-1]
+	work.rescan.list[gp.gcRescan] = last
+	last.ptr().gcRescan = gp.gcRescan
+	gp.gcRescan = -1
+	work.rescan.list = work.rescan.list[:len(work.rescan.list)-1]
+	unlock(&work.rescan.lock)
+}
+
+type gcDrainFlags int
+
+const (
+	gcDrainUntilPreempt gcDrainFlags = 1 << iota
+	gcDrainNoBlock
+	gcDrainFlushBgCredit
+	gcDrainIdle
+
+	// gcDrainBlock means neither gcDrainUntilPreempt nor
+	// gcDrainNoBlock. It is the default, but callers should use
+	// the constant for documentation purposes.
+	gcDrainBlock gcDrainFlags = 0
+)
+
+// gcDrain scans roots and objects in work buffers, blackening grey
+// objects until all roots and work buffers have been drained.
+//
+// If flags&gcDrainUntilPreempt != 0, gcDrain returns when g.preempt
+// is set. This implies gcDrainNoBlock.
+//
+// If flags&gcDrainIdle != 0, gcDrain returns when there is other work
+// to do. This implies gcDrainNoBlock.
+//
+// If flags&gcDrainNoBlock != 0, gcDrain returns as soon as it is
+// unable to get more work. Otherwise, it will block until all
+// blocking calls are blocked in gcDrain.
+//
+// If flags&gcDrainFlushBgCredit != 0, gcDrain flushes scan work
+// credit to gcController.bgScanCredit every gcCreditSlack units of
+// scan work.
+//
+//go:nowritebarrier
+func gcDrain(gcw *gcWork, flags gcDrainFlags) {
+	if !writeBarrier.needed {
+		throw("gcDrain phase incorrect")
+	}
+
+	gp := getg().m.curg
+	preemptible := flags&gcDrainUntilPreempt != 0
+	blocking := flags&(gcDrainUntilPreempt|gcDrainIdle|gcDrainNoBlock) == 0
+	flushBgCredit := flags&gcDrainFlushBgCredit != 0
+	idle := flags&gcDrainIdle != 0
+
+	initScanWork := gcw.scanWork
+	// idleCheck is the scan work at which to perform the next
+	// idle check with the scheduler.
+	idleCheck := initScanWork + idleCheckThreshold
+
+	// Drain root marking jobs.
+	if work.markrootNext < work.markrootJobs {
+		for !(preemptible && gp.preempt) {
+			job := atomic.Xadd(&work.markrootNext, +1) - 1
+			if job >= work.markrootJobs {
+				break
+			}
+			markroot(gcw, job)
+			if idle && pollWork() {
+				goto done
+			}
+		}
+	}
+
+	// Drain heap marking jobs.
+	for !(preemptible && gp.preempt) {
+		// Try to keep work available on the global queue. We used to
+		// check if there were waiting workers, but it's better to
+		// just keep work available than to make workers wait. In the
+		// worst case, we'll do O(log(_WorkbufSize)) unnecessary
+		// balances.
+		if work.full == 0 {
+			gcw.balance()
+		}
+
+		var b uintptr
+		if blocking {
+			b = gcw.get()
+		} else {
+			b = gcw.tryGetFast()
+			if b == 0 {
+				b = gcw.tryGet()
+			}
+		}
+		if b == 0 {
+			// work barrier reached or tryGet failed.
+			break
+		}
+		scanobject(b, gcw)
+
+		// Flush background scan work credit to the global
+		// account if we've accumulated enough locally so
+		// mutator assists can draw on it.
+		if gcw.scanWork >= gcCreditSlack {
+			atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
+			if flushBgCredit {
+				gcFlushBgCredit(gcw.scanWork - initScanWork)
+				initScanWork = 0
+			}
+			idleCheck -= gcw.scanWork
+			gcw.scanWork = 0
+
+			if idle && idleCheck <= 0 {
+				idleCheck += idleCheckThreshold
+				if pollWork() {
+					break
+				}
+			}
+		}
+	}
+
+	// In blocking mode, write barriers are not allowed after this
+	// point because we must preserve the condition that the work
+	// buffers are empty.
+
+done:
+	// Flush remaining scan work credit.
+	if gcw.scanWork > 0 {
+		atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
+		if flushBgCredit {
+			gcFlushBgCredit(gcw.scanWork - initScanWork)
+		}
+		gcw.scanWork = 0
+	}
+}
+
+// gcDrainN blackens grey objects until it has performed roughly
+// scanWork units of scan work or the G is preempted. This is
+// best-effort, so it may perform less work if it fails to get a work
+// buffer. Otherwise, it will perform at least scanWork units of work, but
+// may perform more because scanning is always done in whole object
+// increments. It returns the amount of scan work performed.
+//
+// The caller goroutine must be in a preemptible state (e.g.,
+// _Gwaiting) to prevent deadlocks during stack scanning. As a
+// consequence, this must be called on the system stack.
+//
+//go:nowritebarrier
+//go:systemstack
+func gcDrainN(gcw *gcWork, scanWork int64) int64 {
+	if !writeBarrier.needed {
+		throw("gcDrainN phase incorrect")
+	}
+
+	// There may already be scan work on the gcw, which we don't
+	// want to claim was done by this call.
+	workFlushed := -gcw.scanWork
+
+	gp := getg().m.curg
+	for !gp.preempt && workFlushed+gcw.scanWork < scanWork {
+		// See gcDrain comment.
+		if work.full == 0 {
+			gcw.balance()
+		}
+
+		// This might be a good place to add prefetch code...
+		// if(wbuf.nobj > 4) {
+		//         PREFETCH(wbuf->obj[wbuf.nobj - 3]);
+		//  }
+		//
+		b := gcw.tryGetFast()
+		if b == 0 {
+			b = gcw.tryGet()
+		}
+
+		if b == 0 {
+			// Try to do a root job.
+			//
+			// TODO: Assists should get credit for this
+			// work.
+			if work.markrootNext < work.markrootJobs {
+				job := atomic.Xadd(&work.markrootNext, +1) - 1
+				if job < work.markrootJobs {
+					markroot(gcw, job)
+					continue
+				}
+			}
+			// No heap or root jobs.
+			break
+		}
+		scanobject(b, gcw)
+
+		// Flush background scan work credit.
+		if gcw.scanWork >= gcCreditSlack {
+			atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
+			workFlushed += gcw.scanWork
+			gcw.scanWork = 0
+		}
+	}
+
+	// Unlike gcDrain, there's no need to flush remaining work
+	// here because this never flushes to bgScanCredit and
+	// gcw.dispose will flush any remaining work to scanWork.
+
+	return workFlushed + gcw.scanWork
+}
+
+// scanblock scans b as scanobject would, but using an explicit
+// pointer bitmap instead of the heap bitmap.
+//
+// This is used to scan non-heap roots, so it does not update
+// gcw.bytesMarked or gcw.scanWork.
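+//
+// For example, on a 64-bit system a ptrmask byte of 0x05 covers one 8-word
+// group and marks words 0 and 2 (bytes 0-7 and 16-23 of the block) as
+// pointer slots; a zero byte lets the loop skip the whole 8-word group.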
+//
+//go:nowritebarrier
+func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWork) {
+	// Use local copies of original parameters, so that a stack trace
+	// due to one of the throws below shows the original block
+	// base and extent.
+	b := b0
+	n := n0
+
+	arena_start := mheap_.arena_start
+	arena_used := mheap_.arena_used
+
+	for i := uintptr(0); i < n; {
+		// Find bits for the next word.
+		bits := uint32(*addb(ptrmask, i/(sys.PtrSize*8)))
+		if bits == 0 {
+			i += sys.PtrSize * 8
+			continue
+		}
+		for j := 0; j < 8 && i < n; j++ {
+			if bits&1 != 0 {
+				// Same work as in scanobject; see comments there.
+				obj := *(*uintptr)(unsafe.Pointer(b + i))
+				if obj != 0 && arena_start <= obj && obj < arena_used {
+					if obj, hbits, span, objIndex := heapBitsForObject(obj, b, i, false); obj != 0 {
+						greyobject(obj, b, i, hbits, span, gcw, objIndex, false)
+					}
+				}
+			}
+			bits >>= 1
+			i += sys.PtrSize
+		}
+	}
+}
+
+// scanobject scans the object starting at b, adding pointers to gcw.
+// b must point to the beginning of a heap object or an oblet.
+// scanobject consults the GC bitmap for the pointer mask and the
+// spans for the size of the object.
+//
+//go:nowritebarrier
+func scanobject(b uintptr, gcw *gcWork) {
+	// Note that arena_used may change concurrently during
+	// scanobject and hence scanobject may encounter a pointer to
+	// a newly allocated heap object that is *not* in
+	// [start,used). It will not mark this object; however, we
+	// know that it was just installed by a mutator, which means
+	// that mutator will execute a write barrier and take care of
+	// marking it. This is even more pronounced on relaxed memory
+	// architectures since we access arena_used without barriers
+	// or synchronization, but the same logic applies.
+	arena_start := mheap_.arena_start
+	arena_used := mheap_.arena_used
+
+	// Find the bits for b and the size of the object at b.
+	//
+	// b is either the beginning of an object, in which case this
+	// is the size of the object to scan, or it points to an
+	// oblet, in which case we compute the size to scan below.
+	hbits := heapBitsForAddr(b)
+	s := spanOfUnchecked(b)
+	n := s.elemsize
+	if n == 0 {
+		throw("scanobject n == 0")
+	}
+
+	if n > maxObletBytes {
+		// Large object. Break into oblets for better
+		// parallelism and lower latency.
+		if b == s.base() {
+			// It's possible this is a noscan object (not
+			// from greyobject, but from other code
+			// paths), in which case we must *not* enqueue
+			// oblets since their bitmaps will be
+			// uninitialized.
+			if !hbits.hasPointers(n) {
+				// Bypass the whole scan.
+				gcw.bytesMarked += uint64(n)
+				return
+			}
+
+			// Enqueue the other oblets to scan later.
+			// Some oblets may be in b's scalar tail, but
+			// these will be marked as "no more pointers",
+			// so we'll drop out immediately when we go to
+			// scan those.
+			for oblet := b + maxObletBytes; oblet < s.base()+s.elemsize; oblet += maxObletBytes {
+				if !gcw.putFast(oblet) {
+					gcw.put(oblet)
+				}
+			}
+		}
+
+		// Compute the size of the oblet. Since this object
+		// must be a large object, s.base() is the beginning
+		// of the object.
+		n = s.base() + s.elemsize - b
+		if n > maxObletBytes {
+			n = maxObletBytes
+		}
+	}
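+	// For example, when b is the base of a 320 KB object, this call scans
+	// the first 128 KB and enqueues oblets at b+128 KB and b+256 KB; the
+	// final oblet covers the remaining 64 KB.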
+
+	var i uintptr
+	for i = 0; i < n; i += sys.PtrSize {
+		// Find bits for this word.
+		if i != 0 {
+			// Avoid needless hbits.next() on last iteration.
+			hbits = hbits.next()
+		}
+		// Load bits once. See CL 22712 and issue 16973 for discussion.
+		bits := hbits.bits()
+		// During checkmarking, 1-word objects store the checkmark
+		// in the type bit for the one word. The only one-word objects
+		// are pointers, or else they'd be merged with other non-pointer
+		// data into larger allocations.
+		if i != 1*sys.PtrSize && bits&bitScan == 0 {
+			break // no more pointers in this object
+		}
+		if bits&bitPointer == 0 {
+			continue // not a pointer
+		}
+
+		// Work here is duplicated in scanblock and above.
+		// If you make changes here, make changes there too.
+		obj := *(*uintptr)(unsafe.Pointer(b + i))
+
+		// At this point we have extracted the next potential pointer.
+		// Check if it points into heap and not back at the current object.
+		if obj != 0 && arena_start <= obj && obj < arena_used && obj-b >= n {
+			// Mark the object.
+			if obj, hbits, span, objIndex := heapBitsForObject(obj, b, i, false); obj != 0 {
+				greyobject(obj, b, i, hbits, span, gcw, objIndex, false)
+			}
+		}
+	}
+	gcw.bytesMarked += uint64(n)
+	gcw.scanWork += int64(i)
+}
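
To make the oblet arithmetic above concrete, here is a minimal standalone
sketch. The 128 KiB value for maxObletBytes is assumed here for
illustration, and base and size stand in for s.base() and s.elemsize.

	package main

	import "fmt"

	const maxObletBytes = 128 << 10 // assumed oblet size for this sketch

	// oblets returns the start address of every oblet of an object that
	// begins at base and is size bytes long.
	func oblets(base, size uintptr) []uintptr {
		var starts []uintptr
		for o := base; o < base+size; o += maxObletBytes {
			starts = append(starts, o)
		}
		return starts
	}

	// scanLen mirrors "n = s.base() + s.elemsize - b" capped at
	// maxObletBytes: how many bytes are scanned for the oblet starting at b.
	func scanLen(base, size, b uintptr) uintptr {
		n := base + size - b
		if n > maxObletBytes {
			n = maxObletBytes
		}
		return n
	}

	func main() {
		base, size := uintptr(0x1000), uintptr(300<<10) // a 300 KiB object
		for _, b := range oblets(base, size) {
			fmt.Printf("oblet at %#x scans %d bytes\n", b, scanLen(base, size, b))
		}
	}
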
+
+//go:linkname scanstackblock runtime.scanstackblock
+
+// scanstackblock is called by the stack scanning code in C to
+// actually find and mark pointers in the stack block. This is like
+// scanblock, but we scan the stack conservatively, so there is no
+// bitmask of pointers.
+func scanstackblock(b, n uintptr, gcw *gcWork) {
+	arena_start := mheap_.arena_start
+	arena_used := mheap_.arena_used
+
+	for i := uintptr(0); i < n; i += sys.PtrSize {
+		// Same work as in scanobject; see comments there.
+		obj := *(*uintptr)(unsafe.Pointer(b + i))
+		if obj != 0 && arena_start <= obj && obj < arena_used {
+			if obj, hbits, span, objIndex := heapBitsForObject(obj, b, i, true); obj != 0 {
+				greyobject(obj, b, i, hbits, span, gcw, objIndex, true)
+			}
+		}
+	}
+}
+
+// Shade the object if it isn't already.
+// The object is not nil and known to be in the heap.
+// Preemption must be disabled.
+//go:nowritebarrier
+func shade(b uintptr) {
+	// shade can be called to shade a pointer found on the stack,
+	// so pass forStack as true to heapBitsForObject and greyobject.
+	if obj, hbits, span, objIndex := heapBitsForObject(b, 0, 0, true); obj != 0 {
+		gcw := &getg().m.p.ptr().gcw
+		greyobject(obj, 0, 0, hbits, span, gcw, objIndex, true)
+		if gcphase == _GCmarktermination || gcBlackenPromptly {
+			// Ps aren't allowed to cache work during mark
+			// termination.
+			gcw.dispose()
+		}
+	}
+}
+
+// obj is the start of an object with mark mbits.
+// If it isn't already marked, mark it and enqueue into gcw.
+// base and off are for debugging only and could be removed.
+//go:nowritebarrierrec
+func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork, objIndex uintptr, forStack bool) {
+	// obj should be start of allocation, and so must be at least pointer-aligned.
+	if obj&(sys.PtrSize-1) != 0 {
+		throw("greyobject: obj not pointer-aligned")
+	}
+	mbits := span.markBitsForIndex(objIndex)
+
+	if useCheckmark {
+		if !mbits.isMarked() {
+			// Stack scanning is conservative, so we can
+			// see a reference to an object not previously
+			// found. Assume the object was correctly not
+			// marked and ignore the pointer.
+			if forStack {
+				return
+			}
+			printlock()
+			print("runtime:greyobject: checkmarks finds unexpected unmarked object obj=", hex(obj), "\n")
+			print("runtime: found obj at *(", hex(base), "+", hex(off), ")\n")
+
+			// Dump the source (base) object
+			gcDumpObject("base", base, off)
+
+			// Dump the object
+			gcDumpObject("obj", obj, ^uintptr(0))
+
+			throw("checkmark found unmarked object")
+		}
+		if hbits.isCheckmarked(span.elemsize) {
+			return
+		}
+		hbits.setCheckmarked(span.elemsize)
+		if !hbits.isCheckmarked(span.elemsize) {
+			throw("setCheckmarked and isCheckmarked disagree")
+		}
+	} else {
+		// Stack scanning is conservative, so we can see a
+		// pointer to a free object. Assume the object was
+		// correctly freed and we must ignore the pointer.
+		if forStack && span.isFree(objIndex) {
+			return
+		}
+
+		if debug.gccheckmark > 0 && span.isFree(objIndex) {
+			print("runtime: marking free object ", hex(obj), " found at *(", hex(base), "+", hex(off), ")\n")
+			gcDumpObject("base", base, off)
+			gcDumpObject("obj", obj, ^uintptr(0))
+			throw("marking free object")
+		}
+
+		// If marked we have nothing to do.
+		if mbits.isMarked() {
+			return
+		}
+		// mbits.setMarked() // Avoid extra call overhead with manual inlining.
+		atomic.Or8(mbits.bytep, mbits.mask)
+		// If this is a noscan object, fast-track it to black
+		// instead of greying it.
+		if !hbits.hasPointers(span.elemsize) {
+			gcw.bytesMarked += uint64(span.elemsize)
+			return
+		}
+	}
+
+	// Queue the obj for scanning. The PREFETCH(obj) logic has been removed but
+	// seems like a nice optimization that can be added back in.
+	// There needs to be time between the PREFETCH and the use.
+	// Previously we put the obj in an 8 element buffer that is drained at a rate
+	// to give the PREFETCH time to do its work.
+	// Use of PREFETCHNTA might be more appropriate than PREFETCH.
+	if !gcw.putFast(obj) {
+		gcw.put(obj)
+	}
+}
+
+// gcDumpObject dumps the contents of obj for debugging and marks the
+// field at byte offset off in obj.
+func gcDumpObject(label string, obj, off uintptr) {
+	if obj < mheap_.arena_start || obj >= mheap_.arena_used {
+		print(label, "=", hex(obj), " is not in the Go heap\n")
+		return
+	}
+	k := obj >> _PageShift
+	x := k
+	x -= mheap_.arena_start >> _PageShift
+	s := mheap_.spans[x]
+	print(label, "=", hex(obj), " k=", hex(k))
+	if s == nil {
+		print(" s=nil\n")
+		return
+	}
+	print(" s.base()=", hex(s.base()), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, " s.state=")
+	if 0 <= s.state && int(s.state) < len(mSpanStateNames) {
+		print(mSpanStateNames[s.state], "\n")
+	} else {
+		print("unknown(", s.state, ")\n")
+	}
+
+	skipped := false
+	size := s.elemsize
+	if s.state == _MSpanStack && size == 0 {
+		// We're printing something from a stack frame. We
+		// don't know how big it is, so just show up to and
+		// including off.
+		size = off + sys.PtrSize
+	}
+	for i := uintptr(0); i < size; i += sys.PtrSize {
+		// For big objects, just print the beginning (because
+		// that usually hints at the object's type) and the
+		// fields around off.
+		if !(i < 128*sys.PtrSize || off-16*sys.PtrSize < i && i < off+16*sys.PtrSize) {
+			skipped = true
+			continue
+		}
+		if skipped {
+			print(" ...\n")
+			skipped = false
+		}
+		print(" *(", label, "+", i, ") = ", hex(*(*uintptr)(unsafe.Pointer(obj + i))))
+		if i == off {
+			print(" <==")
+		}
+		print("\n")
+	}
+	if skipped {
+		print(" ...\n")
+	}
+}
+
+// gcmarknewobject marks a newly allocated object black. obj must
+// not contain any non-nil pointers.
+//
+// This is nosplit so it can manipulate a gcWork without preemption.
+//
+//go:nowritebarrier
+//go:nosplit
+func gcmarknewobject(obj, size, scanSize uintptr) {
+	if useCheckmark && !gcBlackenPromptly { // The world should be stopped so this should not happen.
+		throw("gcmarknewobject called while doing checkmark")
+	}
+	markBitsForAddr(obj).setMarked()
+	gcw := &getg().m.p.ptr().gcw
+	gcw.bytesMarked += uint64(size)
+	gcw.scanWork += int64(scanSize)
+	if gcBlackenPromptly {
+		// There shouldn't be anything in the work queue, but
+		// we still need to flush stats.
+		gcw.dispose()
+	}
+}
+
+// gcMarkTinyAllocs greys all active tiny alloc blocks.
+//
+// The world must be stopped.
+func gcMarkTinyAllocs() {
+	for _, p := range &allp {
+		if p == nil || p.status == _Pdead {
+			break
+		}
+		c := p.mcache
+		if c == nil || c.tiny == 0 {
+			continue
+		}
+		_, hbits, span, objIndex := heapBitsForObject(c.tiny, 0, 0, false)
+		gcw := &p.gcw
+		greyobject(c.tiny, 0, 0, hbits, span, gcw, objIndex, false)
+		if gcBlackenPromptly {
+			gcw.dispose()
+		}
+	}
+}
+
+// Checkmarking
+
+// To help debug the concurrent GC we remark with the world
+// stopped, ensuring that any object encountered has its normal
+// mark bit set. To do this we use an orthogonal bit
+// pattern to indicate the object is marked. The following pattern
+// uses the upper two bits in the object's boundary nibble.
+// 01: scalar  not marked
+// 10: pointer not marked
+// 11: pointer     marked
+// 00: scalar      marked
+// XORing with 01 will flip the pattern from marked to unmarked and vice versa.
+// The higher bit is 1 for pointers and 0 for scalars, whether the object
+// is marked or not.
+// The first nibble no longer holds the typeDead pattern indicating that
+// there are no more pointers in the object. This information is held
+// in the second nibble.
+
+// If useCheckmark is true, marking of an object uses the
+// checkmark bits (encoding above) instead of the standard
+// mark bits.
+var useCheckmark = false
+
+//go:nowritebarrier
+func initCheckmarks() {
+	useCheckmark = true
+	for _, s := range mheap_.allspans {
+		if s.state == _MSpanInUse {
+			heapBitsForSpan(s.base()).initCheckmarkSpan(s.layout())
+		}
+	}
+}
+
+func clearCheckmarks() {
+	useCheckmark = false
+	for _, s := range mheap_.allspans {
+		if s.state == _MSpanInUse {
+			heapBitsForSpan(s.base()).clearCheckmarkSpan(s.layout())
+		}
+	}
+}
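
The checkmark encoding described above is small enough to demonstrate in a
standalone sketch. The constants below simply restate the bit patterns from
the comment, held in an ordinary byte rather than in the heap bitmap's
boundary nibble.

	package main

	import "fmt"

	const (
		scalarNotMarked  = 0x1 // 01
		pointerNotMarked = 0x2 // 10
		pointerMarked    = 0x3 // 11
		scalarMarked     = 0x0 // 00
	)

	// toggle flips an entry between its marked and unmarked encodings by
	// XORing with 01, as described in the comment.
	func toggle(bits uint8) uint8 { return bits ^ 0x1 }

	// isPointer reports whether the entry describes a pointer word; the
	// higher bit is 1 for pointers whether the object is marked or not.
	func isPointer(bits uint8) bool { return bits&0x2 != 0 }

	func main() {
		fmt.Println(toggle(scalarNotMarked) == scalarMarked)           // true
		fmt.Println(toggle(pointerNotMarked) == pointerMarked)         // true
		fmt.Println(isPointer(pointerMarked), isPointer(scalarMarked)) // true false
	}
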
diff --git a/libgo/go/runtime/mgcsweep.go b/libgo/go/runtime/mgcsweep.go
new file mode 100644
index 0000000..9f24fb1
--- /dev/null
+++ b/libgo/go/runtime/mgcsweep.go
@@ -0,0 +1,428 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Garbage collector: sweeping
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
+
+var sweep sweepdata
+
+// State of background sweep.
+type sweepdata struct {
+	lock    mutex
+	g       *g
+	parked  bool
+	started bool
+
+	nbgsweep    uint32
+	npausesweep uint32
+
+	// pacertracegen is the sweepgen at which the last pacer trace
+	// "sweep finished" message was printed.
+	pacertracegen uint32
+}
+
+// finishsweep_m ensures that all spans are swept.
+//
+// The world must be stopped. This ensures there are no sweeps in
+// progress.
+//
+//go:nowritebarrier
+func finishsweep_m() {
+	// Sweeping must be complete before marking commences, so
+	// sweep any unswept spans. If this is a concurrent GC, there
+	// shouldn't be any spans left to sweep, so this should finish
+	// instantly. If GC was forced before the concurrent sweep
+	// finished, there may be spans to sweep.
+	for sweepone() != ^uintptr(0) {
+		sweep.npausesweep++
+	}
+
+	nextMarkBitArenaEpoch()
+}
+
+func bgsweep(c chan int) {
+	sweep.g = getg()
+
+	lock(&sweep.lock)
+	sweep.parked = true
+	c <- 1
+	goparkunlock(&sweep.lock, "GC sweep wait", traceEvGoBlock, 1)
+
+	for {
+		for gosweepone() != ^uintptr(0) {
+			sweep.nbgsweep++
+			Gosched()
+		}
+		lock(&sweep.lock)
+		if !gosweepdone() {
+			// This can happen if a GC runs between
+			// gosweepone returning ^0 above
+			// and the lock being acquired.
+			unlock(&sweep.lock)
+			continue
+		}
+		sweep.parked = true
+		goparkunlock(&sweep.lock, "GC sweep wait", traceEvGoBlock, 1)
+	}
+}
+
+// sweepone sweeps one span and returns the number of pages returned
+// to the heap, or ^uintptr(0) if there is nothing to sweep.
+//go:nowritebarrier
+func sweepone() uintptr {
+	_g_ := getg()
+
+	// Increment locks to ensure that the goroutine is not preempted
+	// in the middle of the sweep, thus leaving the span in an inconsistent state for the next GC.
+	_g_.m.locks++
+	sg := mheap_.sweepgen
+	for {
+		s := mheap_.sweepSpans[1-sg/2%2].pop()
+		if s == nil {
+			mheap_.sweepdone = 1
+			_g_.m.locks--
+			if debug.gcpacertrace > 0 && atomic.Cas(&sweep.pacertracegen, sg-2, sg) {
+				print("pacer: sweep done at heap size ", memstats.heap_live>>20, "MB; allocated ", mheap_.spanBytesAlloc>>20, "MB of spans; swept ", mheap_.pagesSwept, " pages at ", mheap_.sweepPagesPerByte, " pages/byte\n")
+			}
+			return ^uintptr(0)
+		}
+		if s.state != mSpanInUse {
+			// This can happen if direct sweeping already
+			// swept this span, but in that case the sweep
+			// generation should always be up-to-date.
+			if s.sweepgen != sg {
+				print("runtime: bad span s.state=", s.state, " s.sweepgen=", s.sweepgen, " sweepgen=", sg, "\n")
+				throw("non in-use span in unswept list")
+			}
+			continue
+		}
+		if s.sweepgen != sg-2 || !atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+			continue
+		}
+		npages := s.npages
+		if !s.sweep(false) {
+			// Span is still in-use, so this returned no
+			// pages to the heap and the span needs to
+			// move to the swept in-use list.
+			npages = 0
+		}
+		_g_.m.locks--
+		return npages
+	}
+}
+
+//go:nowritebarrier
+func gosweepone() uintptr {
+	var ret uintptr
+	systemstack(func() {
+		ret = sweepone()
+	})
+	return ret
+}
+
+//go:nowritebarrier
+func gosweepdone() bool {
+	return mheap_.sweepdone != 0
+}
+
+// Returns only when span s has been swept.
+//go:nowritebarrier
+func (s *mspan) ensureSwept() {
+	// Caller must disable preemption.
+	// Otherwise when this function returns the span can become unswept again
+	// (if GC is triggered on another goroutine).
+	_g_ := getg()
+	if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
+		throw("MSpan_EnsureSwept: m is not locked")
+	}
+
+	sg := mheap_.sweepgen
+	if atomic.Load(&s.sweepgen) == sg {
+		return
+	}
+	// The caller must be sure that the span is a MSpanInUse span.
+	if atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+		s.sweep(false)
+		return
+	}
+	// unfortunate condition, and we don't have efficient means to wait
+	for atomic.Load(&s.sweepgen) != sg {
+		osyield()
+	}
+}
+
+// Sweep frees or collects finalizers for blocks not marked in the mark phase.
+// It clears the mark bits in preparation for the next GC round.
+// Returns true if the span was returned to heap.
+// If preserve=true, don't return it to heap nor relink in MCentral lists;
+// caller takes care of it.
+//TODO go:nowritebarrier
+func (s *mspan) sweep(preserve bool) bool {
+	// It's critical that we enter this function with preemption disabled,
+	// GC must not start while we are in the middle of this function.
+	_g_ := getg()
+	if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
+		throw("MSpan_Sweep: m is not locked")
+	}
+	sweepgen := mheap_.sweepgen
+	if s.state != mSpanInUse || s.sweepgen != sweepgen-1 {
+		print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
+		throw("MSpan_Sweep: bad span state")
+	}
+
+	if trace.enabled {
+		traceGCSweepStart()
+	}
+
+	atomic.Xadd64(&mheap_.pagesSwept, int64(s.npages))
+
+	cl := s.sizeclass
+	size := s.elemsize
+	res := false
+	nfree := 0
+
+	c := _g_.m.mcache
+	freeToHeap := false
+
+	// The allocBits indicate which unmarked objects don't need to be
+	// processed since they were free at the end of the last GC cycle
+	// and were not allocated since then.
+	// If the allocBits index is >= s.freeindex and the bit
+	// is not marked then the object remains unallocated
+	// since the last GC.
+	// This situation is analogous to being on a freelist.
+
+	// Unlink & free special records for any objects we're about to free.
+	// Two complications here:
+	// 1. An object can have both finalizer and profile special records.
+	//    In such case we need to queue finalizer for execution,
+	//    mark the object as live and preserve the profile special.
+	// 2. A tiny object can have several finalizers set up for different offsets.
+	//    If such an object is not marked, we need to queue all finalizers at once.
+	// Both 1 and 2 are possible at the same time.
+	specialp := &s.specials
+	special := *specialp
+	for special != nil {
+		// A finalizer can be set for an inner byte of an object; find the object's beginning.
+		objIndex := uintptr(special.offset) / size
+		p := s.base() + objIndex*size
+		mbits := s.markBitsForIndex(objIndex)
+		if !mbits.isMarked() {
+			// This object is not marked and has at least one special record.
+			// Pass 1: see if it has at least one finalizer.
+			hasFin := false
+			endOffset := p - s.base() + size
+			for tmp := special; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next {
+				if tmp.kind == _KindSpecialFinalizer {
+					// Stop freeing of object if it has a finalizer.
+					mbits.setMarkedNonAtomic()
+					hasFin = true
+					break
+				}
+			}
+			// Pass 2: queue all finalizers _or_ handle profile record.
+			for special != nil && uintptr(special.offset) < endOffset {
+				// Find the exact byte for which the special was setup
+				// (as opposed to object beginning).
+				p := s.base() + uintptr(special.offset)
+				if special.kind == _KindSpecialFinalizer || !hasFin {
+					// Splice out special record.
+					y := special
+					special = special.next
+					*specialp = special
+					freespecial(y, unsafe.Pointer(p), size)
+				} else {
+					// This is a profile record, but the object has finalizers (so it is kept alive).
+					// Keep special record.
+					specialp = &special.next
+					special = *specialp
+				}
+			}
+		} else {
+			// object is still live: keep special record
+			specialp = &special.next
+			special = *specialp
+		}
+	}
+
+	if debug.allocfreetrace != 0 || raceenabled || msanenabled {
+		// Find all newly freed objects. This doesn't have to be
+		// efficient; allocfreetrace has massive overhead.
+		mbits := s.markBitsForBase()
+		abits := s.allocBitsForIndex(0)
+		for i := uintptr(0); i < s.nelems; i++ {
+			if !mbits.isMarked() && (abits.index < s.freeindex || abits.isMarked()) {
+				x := s.base() + i*s.elemsize
+				if debug.allocfreetrace != 0 {
+					tracefree(unsafe.Pointer(x), size)
+				}
+				if raceenabled {
+					racefree(unsafe.Pointer(x), size)
+				}
+				if msanenabled {
+					msanfree(unsafe.Pointer(x), size)
+				}
+			}
+			mbits.advance()
+			abits.advance()
+		}
+	}
+
+	// Count the number of free objects in this span.
+	nfree = s.countFree()
+	if cl == 0 && nfree != 0 {
+		s.needzero = 1
+		freeToHeap = true
+	}
+	nalloc := uint16(s.nelems) - uint16(nfree)
+	nfreed := s.allocCount - nalloc
+
+	// This test is not reliable with gccgo, because of
+	// conservative stack scanning. The test boils down to
+	// checking that no new bits have been set in gcmarkBits since
+	// the span was added to the sweep count. New bits are set by
+	// greyobject. Seeing a new bit means that a live pointer has
+	// appeared that was not found during the mark phase. That cannot
+	// happen when pointers are followed strictly. However,
+	// with conservative checking, it is possible for a pointer
+	// that will never be used to appear live and to cause a mark
+	// to be added. That is unfortunate in that it causes this
+	// check to be inaccurate, and it will keep an object live
+	// unnecessarily, but provided the pointer is not really live
+	// it is not otherwise a problem. So we disable the test for gccgo.
+	if false && nalloc > s.allocCount {
+		print("runtime: nelems=", s.nelems, " nfree=", nfree, " nalloc=", nalloc, " previous allocCount=", s.allocCount, " nfreed=", nfreed, "\n")
+		throw("sweep increased allocation count")
+	}
+
+	s.allocCount = nalloc
+	wasempty := s.nextFreeIndex() == s.nelems
+	s.freeindex = 0 // reset allocation index to start of span.
+
+	// gcmarkBits becomes the allocBits.
+	// get a fresh cleared gcmarkBits in preparation for next GC
+	s.allocBits = s.gcmarkBits
+	s.gcmarkBits = newMarkBits(s.nelems)
+
+	// Initialize alloc bits cache.
+	s.refillAllocCache(0)
+
+	// We need to set s.sweepgen = h.sweepgen only when all blocks are swept,
+	// because of the potential for a concurrent free/SetFinalizer.
+	// But we need to set it before we make the span available for allocation
+	// (return it to heap or mcentral), because allocation code assumes that a
+	// span is already swept if available for allocation.
+	if freeToHeap || nfreed == 0 {
+		// The span must be in our exclusive ownership until we update sweepgen,
+		// check for potential races.
+		if s.state != mSpanInUse || s.sweepgen != sweepgen-1 {
+			print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
+			throw("MSpan_Sweep: bad span state after sweep")
+		}
+		// Serialization point.
+		// At this point the mark bits are cleared and allocation ready
+		// to go so release the span.
+		atomic.Store(&s.sweepgen, sweepgen)
+	}
+
+	if nfreed > 0 && cl != 0 {
+		c.local_nsmallfree[cl] += uintptr(nfreed)
+		res = mheap_.central[cl].mcentral.freeSpan(s, preserve, wasempty)
+		// MCentral_FreeSpan updates sweepgen
+	} else if freeToHeap {
+		// Free large span to heap
+
+		// NOTE(rsc,dvyukov): The original implementation of efence
+		// in CL 22060046 used SysFree instead of SysFault, so that
+		// the operating system would eventually give the memory
+		// back to us again, so that an efence program could run
+		// longer without running out of memory. Unfortunately,
+		// calling SysFree here without any kind of adjustment of the
+		// heap data structures means that when the memory does
+		// come back to us, we have the wrong metadata for it, either in
+		// the MSpan structures or in the garbage collection bitmap.
+		// Using SysFault here means that the program will run out of
+		// memory fairly quickly in efence mode, but at least it won't
+		// have mysterious crashes due to confused memory reuse.
+		// It should be possible to switch back to SysFree if we also
+		// implement and then call some kind of MHeap_DeleteSpan.
+		if debug.efence > 0 {
+			s.limit = 0 // prevent mlookup from finding this span
+			sysFault(unsafe.Pointer(s.base()), size)
+		} else {
+			mheap_.freeSpan(s, 1)
+		}
+		c.local_nlargefree++
+		c.local_largefree += size
+		res = true
+	}
+	if !res {
+		// The span has been swept and is still in-use, so put
+		// it on the swept in-use list.
+		mheap_.sweepSpans[sweepgen/2%2].push(s)
+	}
+	if trace.enabled {
+		traceGCSweepDone()
+	}
+	return res
+}
+
+// deductSweepCredit deducts sweep credit for allocating a span of
+// size spanBytes. This must be performed *before* the span is
+// allocated to ensure the system has enough credit. If necessary, it
+// performs sweeping to prevent going in to debt. If the caller will
+// also sweep pages (e.g., for a large allocation), it can pass a
+// non-zero callerSweepPages to leave that many pages unswept.
+//
+// deductSweepCredit makes a worst-case assumption that all spanBytes
+// bytes of the ultimately allocated span will be available for object
+// allocation. The caller should call reimburseSweepCredit if that
+// turns out not to be the case once the span is allocated.
+//
+// deductSweepCredit is the core of the "proportional sweep" system.
+// It uses statistics gathered by the garbage collector to perform
+// enough sweeping so that all pages are swept during the concurrent
+// sweep phase between GC cycles.
+//
+// mheap_ must NOT be locked.
+func deductSweepCredit(spanBytes uintptr, callerSweepPages uintptr) {
+	if mheap_.sweepPagesPerByte == 0 {
+		// Proportional sweep is done or disabled.
+		return
+	}
+
+	// Account for this span allocation.
+	spanBytesAlloc := atomic.Xadd64(&mheap_.spanBytesAlloc, int64(spanBytes))
+
+	// Fix debt if necessary.
+	pagesOwed := int64(mheap_.sweepPagesPerByte * float64(spanBytesAlloc))
+	for pagesOwed-int64(atomic.Load64(&mheap_.pagesSwept)) > int64(callerSweepPages) {
+		if gosweepone() == ^uintptr(0) {
+			mheap_.sweepPagesPerByte = 0
+			break
+		}
+	}
+}
+
+// reimburseSweepCredit records that unusableBytes bytes of a
+// just-allocated span are not available for object allocation. This
+// offsets the worst-case charge performed by deductSweepCredit.
+func reimburseSweepCredit(unusableBytes uintptr) {
+	if mheap_.sweepPagesPerByte == 0 {
+		// Nobody cares about the credit. Avoid the atomic.
+		return
+	}
+	nval := atomic.Xadd64(&mheap_.spanBytesAlloc, -int64(unusableBytes))
+	if int64(nval) < 0 {
+		// Debugging for #18043.
+		print("runtime: bad spanBytesAlloc=", nval, " (was ", nval+uint64(unusableBytes), ") unusableBytes=", unusableBytes, " sweepPagesPerByte=", mheap_.sweepPagesPerByte, "\n")
+		throw("spanBytesAlloc underflow")
+	}
+}
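
The pacing check in deductSweepCredit boils down to a little arithmetic:
keep sweeping while the pages owed (sweepPagesPerByte times the span bytes
allocated this cycle) exceed the pages already swept by more than what the
caller will sweep itself. A standalone sketch with invented numbers follows;
the pagesOwed helper and all values are illustrative only.

	package main

	import "fmt"

	// pagesOwed is the debt computation from deductSweepCredit: pages that
	// should have been swept by now, given span bytes allocated this cycle.
	func pagesOwed(sweepPagesPerByte float64, spanBytesAlloc uint64) int64 {
		return int64(sweepPagesPerByte * float64(spanBytesAlloc))
	}

	func main() {
		sweepPagesPerByte := 0.001        // sweep ratio set by the GC pacer
		spanBytesAlloc := uint64(8 << 20) // span bytes allocated so far this cycle
		pagesSwept := int64(7000)         // pages already swept this cycle
		callerSweepPages := int64(512)    // pages the caller will sweep itself

		owed := pagesOwed(sweepPagesPerByte, spanBytesAlloc)
		fmt.Println("pages owed:", owed)
		// Mirror of the loop condition: keep sweeping while the debt
		// exceeds what the caller will sweep on its own.
		for owed-pagesSwept > callerSweepPages {
			pagesSwept += 8 // pretend one sweep returned an 8-page span
		}
		fmt.Println("pages swept after paying down the debt:", pagesSwept)
	}
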
diff --git a/libgo/go/runtime/mgcsweepbuf.go b/libgo/go/runtime/mgcsweepbuf.go
new file mode 100644
index 0000000..6c1118e
--- /dev/null
+++ b/libgo/go/runtime/mgcsweepbuf.go
@@ -0,0 +1,178 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// A gcSweepBuf is a set of *mspans.
+//
+// gcSweepBuf is safe for concurrent push operations *or* concurrent
+// pop operations, but not both simultaneously.
+type gcSweepBuf struct {
+	// A gcSweepBuf is a two-level data structure consisting of a
+	// growable spine that points to fixed-sized blocks. The spine
+	// can be accessed without locks, but adding a block or
+	// growing it requires taking the spine lock.
+	//
+	// Because each mspan covers at least 8K of heap and takes at
+	// most 8 bytes in the gcSweepBuf, the growth of the spine is
+	// quite limited.
+	//
+	// The spine and all blocks are allocated off-heap, which
+	// allows this to be used in the memory manager and avoids the
+	// need for write barriers on all of these. We never release
+	// this memory because there could be concurrent lock-free
+	// access and we're likely to reuse it anyway. (In principle,
+	// we could do this during STW.)
+
+	spineLock mutex
+	spine     unsafe.Pointer // *[N]*gcSweepBlock, accessed atomically
+	spineLen  uintptr        // Spine array length, accessed atomically
+	spineCap  uintptr        // Spine array cap, accessed under lock
+
+	// index is the first unused slot in the logical concatenation
+	// of all blocks. It is accessed atomically.
+	index uint32
+}
+
+const (
+	gcSweepBlockEntries    = 512 // 4KB on 64-bit
+	gcSweepBufInitSpineCap = 256 // Enough for 1GB heap on 64-bit
+)
+
+type gcSweepBlock struct {
+	spans [gcSweepBlockEntries]*mspan
+}
+
+// push adds span s to buffer b. push is safe to call concurrently
+// with other push operations, but NOT to call concurrently with pop.
+func (b *gcSweepBuf) push(s *mspan) {
+	// Obtain our slot.
+	cursor := uintptr(atomic.Xadd(&b.index, +1) - 1)
+	top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
+
+	// Do we need to add a block?
+	spineLen := atomic.Loaduintptr(&b.spineLen)
+	var block *gcSweepBlock
+retry:
+	if top < spineLen {
+		spine := atomic.Loadp(unsafe.Pointer(&b.spine))
+		blockp := add(spine, sys.PtrSize*top)
+		block = (*gcSweepBlock)(atomic.Loadp(blockp))
+	} else {
+		// Add a new block to the spine, potentially growing
+		// the spine.
+		lock(&b.spineLock)
+		// spineLen cannot change until we release the lock,
+		// but may have changed while we were waiting.
+		spineLen = atomic.Loaduintptr(&b.spineLen)
+		if top < spineLen {
+			unlock(&b.spineLock)
+			goto retry
+		}
+
+		if spineLen == b.spineCap {
+			// Grow the spine.
+			newCap := b.spineCap * 2
+			if newCap == 0 {
+				newCap = gcSweepBufInitSpineCap
+			}
+			newSpine := persistentalloc(newCap*sys.PtrSize, sys.CacheLineSize, &memstats.gc_sys)
+			if b.spineCap != 0 {
+				// Blocks are allocated off-heap, so
+				// no write barriers.
+				memmove(newSpine, b.spine, b.spineCap*sys.PtrSize)
+			}
+			// Spine is allocated off-heap, so no write barrier.
+			atomic.StorepNoWB(unsafe.Pointer(&b.spine), newSpine)
+			b.spineCap = newCap
+			// We can't immediately free the old spine
+			// since a concurrent push with a lower index
+			// could still be reading from it. We let it
+			// leak because even a 1TB heap would waste
+			// less than 2MB of memory on old spines. If
+			// this is a problem, we could free old spines
+			// during STW.
+		}
+
+		// Allocate a new block and add it to the spine.
+		block = (*gcSweepBlock)(persistentalloc(unsafe.Sizeof(gcSweepBlock{}), sys.CacheLineSize, &memstats.gc_sys))
+		blockp := add(b.spine, sys.PtrSize*top)
+		// Blocks are allocated off-heap, so no write barrier.
+		atomic.StorepNoWB(blockp, unsafe.Pointer(block))
+		atomic.Storeuintptr(&b.spineLen, spineLen+1)
+		unlock(&b.spineLock)
+	}
+
+	// We have a block. Insert the span.
+	block.spans[bottom] = s
+}
+
+// pop removes and returns a span from buffer b, or nil if b is empty.
+// pop is safe to call concurrently with other pop operations, but NOT
+// to call concurrently with push.
+func (b *gcSweepBuf) pop() *mspan {
+	cursor := atomic.Xadd(&b.index, -1)
+	if int32(cursor) < 0 {
+		atomic.Xadd(&b.index, +1)
+		return nil
+	}
+
+	// There are no concurrent spine or block modifications during
+	// pop, so we can omit the atomics.
+	top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
+	blockp := (**gcSweepBlock)(add(b.spine, sys.PtrSize*uintptr(top)))
+	block := *blockp
+	s := block.spans[bottom]
+	// Clear the pointer for block(i).
+	block.spans[bottom] = nil
+	return s
+}
+
+// numBlocks returns the number of blocks in buffer b. numBlocks is
+// safe to call concurrently with any other operation. Spans that have
+// been pushed prior to the call to numBlocks are guaranteed to appear
+// in some block in the range [0, numBlocks()), assuming there are no
+// intervening pops. Spans that are pushed after the call may also
+// appear in these blocks.
+func (b *gcSweepBuf) numBlocks() int {
+	return int((atomic.Load(&b.index) + gcSweepBlockEntries - 1) / gcSweepBlockEntries)
+}
+
+// block returns the spans in the i'th block of buffer b. block is
+// safe to call concurrently with push.
+func (b *gcSweepBuf) block(i int) []*mspan {
+	// Perform bounds check before loading spine address since
+	// push ensures the allocated length is at least spineLen.
+	if i < 0 || uintptr(i) >= atomic.Loaduintptr(&b.spineLen) {
+		throw("block index out of range")
+	}
+
+	// Get block i.
+	spine := atomic.Loadp(unsafe.Pointer(&b.spine))
+	blockp := add(spine, sys.PtrSize*uintptr(i))
+	block := (*gcSweepBlock)(atomic.Loadp(blockp))
+
+	// Slice the block if necessary.
+	cursor := uintptr(atomic.Load(&b.index))
+	top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
+	var spans []*mspan
+	if uintptr(i) < top {
+		spans = block.spans[:]
+	} else {
+		spans = block.spans[:bottom]
+	}
+
+	// push may have reserved a slot but not filled it yet, so
+	// trim away unused entries.
+	for len(spans) > 0 && spans[len(spans)-1] == nil {
+		spans = spans[:len(spans)-1]
+	}
+	return spans
+}
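
The push, pop, and block code above all split a cursor into a spine index
and a slot within a block. A minimal sketch of that indexing, using the
gcSweepBlockEntries value defined above; the cursor values are arbitrary.

	package main

	import "fmt"

	const gcSweepBlockEntries = 512 // entries per block, as above

	// split maps a cursor in the logical concatenation of all blocks to a
	// spine index (top) and a slot within that block (bottom).
	func split(cursor uintptr) (top, bottom uintptr) {
		return cursor / gcSweepBlockEntries, cursor % gcSweepBlockEntries
	}

	func main() {
		for _, cursor := range []uintptr{0, 511, 512, 1200} {
			top, bottom := split(cursor)
			fmt.Printf("cursor %4d -> block %d, slot %d\n", cursor, top, bottom)
		}
		// numBlocks rounds the index up to whole blocks.
		index := uintptr(1200)
		fmt.Println("blocks needed:", (index+gcSweepBlockEntries-1)/gcSweepBlockEntries)
	}
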
diff --git a/libgo/go/runtime/mgcwork.go b/libgo/go/runtime/mgcwork.go
new file mode 100644
index 0000000..5eb05a7
--- /dev/null
+++ b/libgo/go/runtime/mgcwork.go
@@ -0,0 +1,444 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+const (
+	_WorkbufSize = 2048 // in bytes; larger values result in less contention
+)
+
+// Garbage collector work pool abstraction.
+//
+// This implements a producer/consumer model for pointers to grey
+// objects. A grey object is one that is marked and on a work
+// queue. A black object is marked and not on a work queue.
+//
+// Write barriers, root discovery, stack scanning, and object scanning
+// produce pointers to grey objects. Scanning consumes pointers to
+// grey objects, thus blackening them, and then scans them,
+// potentially producing new pointers to grey objects.
+
+// A wbufptr holds a workbuf*, but protects it from write barriers.
+// workbufs never live on the heap, so write barriers are unnecessary.
+// Write barriers on workbuf pointers may also be dangerous in the GC.
+//
+// TODO: Since workbuf is now go:notinheap, this isn't necessary.
+type wbufptr uintptr
+
+func wbufptrOf(w *workbuf) wbufptr {
+	return wbufptr(unsafe.Pointer(w))
+}
+
+func (wp wbufptr) ptr() *workbuf {
+	return (*workbuf)(unsafe.Pointer(wp))
+}
+
+// A gcWork provides the interface to produce and consume work for the
+// garbage collector.
+//
+// A gcWork can be used on the stack as follows:
+//
+//     (preemption must be disabled)
+//     gcw := &getg().m.p.ptr().gcw
+//     .. call gcw.put() to produce and gcw.get() to consume ..
+//     if gcBlackenPromptly {
+//         gcw.dispose()
+//     }
+//
+// It's important that any use of gcWork during the mark phase prevent
+// the garbage collector from transitioning to mark termination since
+// gcWork may locally hold GC work buffers. This can be done by
+// disabling preemption (systemstack or acquirem).
+type gcWork struct {
+	// wbuf1 and wbuf2 are the primary and secondary work buffers.
+	//
+	// This can be thought of as a stack of both work buffers'
+	// pointers concatenated. When we pop the last pointer, we
+	// shift the stack up by one work buffer by bringing in a new
+	// full buffer and discarding an empty one. When we fill both
+	// buffers, we shift the stack down by one work buffer by
+	// bringing in a new empty buffer and discarding a full one.
+	// This way we have one buffer's worth of hysteresis, which
+	// amortizes the cost of getting or putting a work buffer over
+	// at least one buffer of work and reduces contention on the
+	// global work lists.
+	//
+	// wbuf1 is always the buffer we're currently pushing to and
+	// popping from and wbuf2 is the buffer that will be discarded
+	// next.
+	//
+	// Invariant: Both wbuf1 and wbuf2 are nil or neither are.
+	wbuf1, wbuf2 wbufptr
+
+	// Bytes marked (blackened) on this gcWork. This is aggregated
+	// into work.bytesMarked by dispose.
+	bytesMarked uint64
+
+	// Scan work performed on this gcWork. This is aggregated into
+	// gcController by dispose and may also be flushed by callers.
+	scanWork int64
+}
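
The wbuf1/wbuf2 hysteresis described above can be modeled with a toy double
buffer: pushes go to the primary, a full primary swaps with the secondary,
and only when both are full does a buffer move to a simulated global list.
Everything here (bufSize, toyWork, globalFull) is a stand-in, not the
runtime's types; the real code also does the symmetric dance on the pop side
and uses lock-free lists for the global queues.

	package main

	import "fmt"

	const bufSize = 4 // toy buffer capacity

	type buf struct{ obj []uintptr }

	type toyWork struct {
		wbuf1, wbuf2 *buf
		globalFull   []*buf // stands in for the global full list
	}

	func (w *toyWork) put(p uintptr) {
		if len(w.wbuf1.obj) == bufSize {
			// Primary is full: swap, and if the other buffer is full
			// too, flush it globally and start a fresh one.
			w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
			if len(w.wbuf1.obj) == bufSize {
				w.globalFull = append(w.globalFull, w.wbuf1)
				w.wbuf1 = &buf{}
			}
		}
		w.wbuf1.obj = append(w.wbuf1.obj, p)
	}

	func main() {
		w := &toyWork{wbuf1: &buf{}, wbuf2: &buf{}}
		for i := uintptr(1); i <= 10; i++ {
			w.put(i)
		}
		fmt.Println("buffers flushed to global list:", len(w.globalFull))
		fmt.Println("pointers still cached locally:", len(w.wbuf1.obj)+len(w.wbuf2.obj))
	}
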
+
+func (w *gcWork) init() {
+	w.wbuf1 = wbufptrOf(getempty())
+	wbuf2 := trygetfull()
+	if wbuf2 == nil {
+		wbuf2 = getempty()
+	}
+	w.wbuf2 = wbufptrOf(wbuf2)
+}
+
+// put enqueues a pointer for the garbage collector to trace.
+// obj must point to the beginning of a heap object or an oblet.
+//go:nowritebarrier
+func (w *gcWork) put(obj uintptr) {
+	flushed := false
+	wbuf := w.wbuf1.ptr()
+	if wbuf == nil {
+		w.init()
+		wbuf = w.wbuf1.ptr()
+		// wbuf is empty at this point.
+	} else if wbuf.nobj == len(wbuf.obj) {
+		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
+		wbuf = w.wbuf1.ptr()
+		if wbuf.nobj == len(wbuf.obj) {
+			putfull(wbuf)
+			wbuf = getempty()
+			w.wbuf1 = wbufptrOf(wbuf)
+			flushed = true
+		}
+	}
+
+	wbuf.obj[wbuf.nobj] = obj
+	wbuf.nobj++
+
+	// If we put a buffer on full, let the GC controller know so
+	// it can encourage more workers to run. We delay this until
+	// the end of put so that w is in a consistent state, since
+	// enlistWorker may itself manipulate w.
+	if flushed && gcphase == _GCmark {
+		gcController.enlistWorker()
+	}
+}
+
+// putFast does a put and returns true if it can be done quickly;
+// otherwise it returns false and the caller needs to call put.
+//go:nowritebarrier
+func (w *gcWork) putFast(obj uintptr) bool {
+	wbuf := w.wbuf1.ptr()
+	if wbuf == nil {
+		return false
+	} else if wbuf.nobj == len(wbuf.obj) {
+		return false
+	}
+
+	wbuf.obj[wbuf.nobj] = obj
+	wbuf.nobj++
+	return true
+}
+
+// tryGet dequeues a pointer for the garbage collector to trace.
+//
+// If there are no pointers remaining in this gcWork or in the global
+// queue, tryGet returns 0.  Note that there may still be pointers in
+// other gcWork instances or other caches.
+//go:nowritebarrier
+func (w *gcWork) tryGet() uintptr {
+	wbuf := w.wbuf1.ptr()
+	if wbuf == nil {
+		w.init()
+		wbuf = w.wbuf1.ptr()
+		// wbuf is empty at this point.
+	}
+	if wbuf.nobj == 0 {
+		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
+		wbuf = w.wbuf1.ptr()
+		if wbuf.nobj == 0 {
+			owbuf := wbuf
+			wbuf = trygetfull()
+			if wbuf == nil {
+				return 0
+			}
+			putempty(owbuf)
+			w.wbuf1 = wbufptrOf(wbuf)
+		}
+	}
+
+	wbuf.nobj--
+	return wbuf.obj[wbuf.nobj]
+}
+
+// tryGetFast dequeues a pointer for the garbage collector to trace
+// if one is readily available. Otherwise it returns 0 and
+// the caller is expected to call tryGet().
+//go:nowritebarrier
+func (w *gcWork) tryGetFast() uintptr {
+	wbuf := w.wbuf1.ptr()
+	if wbuf == nil {
+		return 0
+	}
+	if wbuf.nobj == 0 {
+		return 0
+	}
+
+	wbuf.nobj--
+	return wbuf.obj[wbuf.nobj]
+}
+
+// get dequeues a pointer for the garbage collector to trace, blocking
+// if necessary to ensure all pointers from all queues and caches have
+// been retrieved.  get returns 0 if there are no pointers remaining.
+//go:nowritebarrier
+func (w *gcWork) get() uintptr {
+	wbuf := w.wbuf1.ptr()
+	if wbuf == nil {
+		w.init()
+		wbuf = w.wbuf1.ptr()
+		// wbuf is empty at this point.
+	}
+	if wbuf.nobj == 0 {
+		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
+		wbuf = w.wbuf1.ptr()
+		if wbuf.nobj == 0 {
+			owbuf := wbuf
+			wbuf = getfull()
+			if wbuf == nil {
+				return 0
+			}
+			putempty(owbuf)
+			w.wbuf1 = wbufptrOf(wbuf)
+		}
+	}
+
+	// TODO: This might be a good place to add prefetch code
+
+	wbuf.nobj--
+	return wbuf.obj[wbuf.nobj]
+}
+
+// dispose returns any cached pointers to the global queue.
+// The buffers are being put on the full queue so that the
+// write barriers will not simply reacquire them before the
+// GC can inspect them. This helps reduce the mutator's
+// ability to hide pointers during the concurrent mark phase.
+//
+//go:nowritebarrier
+func (w *gcWork) dispose() {
+	if wbuf := w.wbuf1.ptr(); wbuf != nil {
+		if wbuf.nobj == 0 {
+			putempty(wbuf)
+		} else {
+			putfull(wbuf)
+		}
+		w.wbuf1 = 0
+
+		wbuf = w.wbuf2.ptr()
+		if wbuf.nobj == 0 {
+			putempty(wbuf)
+		} else {
+			putfull(wbuf)
+		}
+		w.wbuf2 = 0
+	}
+	if w.bytesMarked != 0 {
+		// dispose happens relatively infrequently. If this
+		// atomic becomes a problem, we should first try to
+		// dispose less and if necessary aggregate in a per-P
+		// counter.
+		atomic.Xadd64(&work.bytesMarked, int64(w.bytesMarked))
+		w.bytesMarked = 0
+	}
+	if w.scanWork != 0 {
+		atomic.Xaddint64(&gcController.scanWork, w.scanWork)
+		w.scanWork = 0
+	}
+}
+
+// balance moves some work that's cached in this gcWork back on the
+// global queue.
+//go:nowritebarrier
+func (w *gcWork) balance() {
+	if w.wbuf1 == 0 {
+		return
+	}
+	if wbuf := w.wbuf2.ptr(); wbuf.nobj != 0 {
+		putfull(wbuf)
+		w.wbuf2 = wbufptrOf(getempty())
+	} else if wbuf := w.wbuf1.ptr(); wbuf.nobj > 4 {
+		w.wbuf1 = wbufptrOf(handoff(wbuf))
+	} else {
+		return
+	}
+	// We flushed a buffer to the full list, so wake a worker.
+	if gcphase == _GCmark {
+		gcController.enlistWorker()
+	}
+}
+
+// empty returns true if w has no mark work available.
+//go:nowritebarrier
+func (w *gcWork) empty() bool {
+	return w.wbuf1 == 0 || (w.wbuf1.ptr().nobj == 0 && w.wbuf2.ptr().nobj == 0)
+}
+
+// Internally, the GC work pool is kept in arrays in work buffers.
+// The gcWork interface caches a work buffer until full (or empty) to
+// avoid contending on the global work buffer lists.
+
+type workbufhdr struct {
+	node lfnode // must be first
+	nobj int
+}
+
+//go:notinheap
+type workbuf struct {
+	workbufhdr
+	// account for the above fields
+	obj [(_WorkbufSize - unsafe.Sizeof(workbufhdr{})) / sys.PtrSize]uintptr
+}
+
+// workbuf factory routines. These funcs are used to manage the
+// workbufs.
+// If the GC asks for some work these are the only routines that
+// make wbufs available to the GC.
+
+func (b *workbuf) checknonempty() {
+	if b.nobj == 0 {
+		throw("workbuf is empty")
+	}
+}
+
+func (b *workbuf) checkempty() {
+	if b.nobj != 0 {
+		throw("workbuf is not empty")
+	}
+}
+
+// getempty pops an empty work buffer off the work.empty list,
+// allocating new buffers if none are available.
+//go:nowritebarrier
+func getempty() *workbuf {
+	var b *workbuf
+	if work.empty != 0 {
+		b = (*workbuf)(lfstackpop(&work.empty))
+		if b != nil {
+			b.checkempty()
+		}
+	}
+	if b == nil {
+		b = (*workbuf)(persistentalloc(unsafe.Sizeof(*b), sys.CacheLineSize, &memstats.gc_sys))
+	}
+	return b
+}
+
+// putempty puts a workbuf onto the work.empty list.
+// Upon entry this goroutine owns b. The lfstackpush relinquishes ownership.
+//go:nowritebarrier
+func putempty(b *workbuf) {
+	b.checkempty()
+	lfstackpush(&work.empty, &b.node)
+}
+
+// putfull puts the workbuf on the work.full list for the GC.
+// putfull accepts partially full buffers so the GC can avoid competing
+// with the mutators for ownership of partially full buffers.
+//go:nowritebarrier
+func putfull(b *workbuf) {
+	b.checknonempty()
+	lfstackpush(&work.full, &b.node)
+}
+
+// trygetfull tries to get a full or partially empty workbuffer.
+// If one is not immediately available, return nil.
+//go:nowritebarrier
+func trygetfull() *workbuf {
+	b := (*workbuf)(lfstackpop(&work.full))
+	if b != nil {
+		b.checknonempty()
+		return b
+	}
+	return b
+}
+
+// Get a full work buffer off the work.full list.
+// If nothing is available wait until all the other gc helpers have
+// finished and then return nil.
+// getfull acts as a barrier for work.nproc helpers. As long as one
+// gchelper is actively marking objects it
+// may create a workbuffer that the other helpers can work on.
+// The for loop either exits when a work buffer is found
+// or when _all_ of the work.nproc GC helpers are in the loop
+// looking for work and thus not capable of creating new work.
+// This is in fact the termination condition for the STW mark
+// phase.
+//go:nowritebarrier
+func getfull() *workbuf {
+	b := (*workbuf)(lfstackpop(&work.full))
+	if b != nil {
+		b.checknonempty()
+		return b
+	}
+
+	incnwait := atomic.Xadd(&work.nwait, +1)
+	if incnwait > work.nproc {
+		println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
+		throw("work.nwait > work.nproc")
+	}
+	for i := 0; ; i++ {
+		if work.full != 0 {
+			decnwait := atomic.Xadd(&work.nwait, -1)
+			if decnwait == work.nproc {
+				println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
+				throw("work.nwait > work.nproc")
+			}
+			b = (*workbuf)(lfstackpop(&work.full))
+			if b != nil {
+				b.checknonempty()
+				return b
+			}
+			incnwait := atomic.Xadd(&work.nwait, +1)
+			if incnwait > work.nproc {
+				println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
+				throw("work.nwait > work.nproc")
+			}
+		}
+		if work.nwait == work.nproc && work.markrootNext >= work.markrootJobs {
+			return nil
+		}
+		_g_ := getg()
+		if i < 10 {
+			_g_.m.gcstats.nprocyield++
+			procyield(20)
+		} else if i < 20 {
+			_g_.m.gcstats.nosyield++
+			osyield()
+		} else {
+			_g_.m.gcstats.nsleep++
+			usleep(100)
+		}
+	}
+}
+
+//go:nowritebarrier
+func handoff(b *workbuf) *workbuf {
+	// Make new buffer with half of b's pointers.
+	b1 := getempty()
+	n := b.nobj / 2
+	b.nobj -= n
+	b1.nobj = n
+	memmove(unsafe.Pointer(&b1.obj[0]), unsafe.Pointer(&b.obj[b.nobj]), uintptr(n)*unsafe.Sizeof(b1.obj[0]))
+	_g_ := getg()
+	_g_.m.gcstats.nhandoff++
+	_g_.m.gcstats.nhandoffcnt += uint64(n)
+
+	// Put b on full list - let first half of b get stolen.
+	putfull(b)
+	return b1
+}
diff --git a/libgo/go/runtime/mheap.go b/libgo/go/runtime/mheap.go
new file mode 100644
index 0000000..7262748
--- /dev/null
+++ b/libgo/go/runtime/mheap.go
@@ -0,0 +1,1427 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Page heap.
+//
+// See malloc.go for overview.
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// minPhysPageSize is a lower-bound on the physical page size. The
+// true physical page size may be larger than this. In contrast,
+// sys.PhysPageSize is an upper-bound on the physical page size.
+const minPhysPageSize = 4096
+
+// Main malloc heap.
+// The heap itself is the "free[]" and "large" arrays,
+// but all the other global data is here too.
+//
+// mheap must not be heap-allocated because it contains mSpanLists,
+// which must not be heap-allocated.
+//
+//go:notinheap
+type mheap struct {
+	lock      mutex
+	free      [_MaxMHeapList]mSpanList // free lists of given length
+	freelarge mSpanList                // free lists length >= _MaxMHeapList
+	busy      [_MaxMHeapList]mSpanList // busy lists of large objects of given length
+	busylarge mSpanList                // busy lists of large objects length >= _MaxMHeapList
+	sweepgen  uint32                   // sweep generation, see comment in mspan
+	sweepdone uint32                   // all spans are swept
+
+	// allspans is a slice of all mspans ever created. Each mspan
+	// appears exactly once.
+	//
+	// The memory for allspans is manually managed and can be
+	// reallocated and moved as the heap grows.
+	//
+	// In general, allspans is protected by mheap_.lock, which
+	// prevents concurrent access as well as freeing the backing
+	// store. Accesses during STW might not hold the lock, but
+	// must ensure that allocation cannot happen around the
+	// access (since that may free the backing store).
+	allspans []*mspan // all spans out there
+
+	// spans is a lookup table to map virtual address page IDs to *mspan.
+	// For allocated spans, their pages map to the span itself.
+	// For free spans, only the lowest and highest pages map to the span itself.
+	// Internal pages map to an arbitrary span.
+	// For pages that have never been allocated, spans entries are nil.
+	//
+	// This is backed by a reserved region of the address space so
+	// it can grow without moving. The memory up to len(spans) is
+	// mapped. cap(spans) indicates the total reserved memory.
+	spans []*mspan
+
+	// sweepSpans contains two mspan stacks: one of swept in-use
+	// spans, and one of unswept in-use spans. These two trade
+	// roles on each GC cycle. Since the sweepgen increases by 2
+	// on each cycle, this means the swept spans are in
+	// sweepSpans[sweepgen/2%2] and the unswept spans are in
+	// sweepSpans[1-sweepgen/2%2]. Sweeping pops spans from the
+	// unswept stack and pushes spans that are still in-use on the
+	// swept stack. Likewise, allocating an in-use span pushes it
+	// on the swept stack.
+	sweepSpans [2]gcSweepBuf
+
+	_ uint32 // align uint64 fields on 32-bit for atomics
+
+	// Proportional sweep
+	pagesInUse        uint64  // pages of spans in stats _MSpanInUse; R/W with mheap.lock
+	spanBytesAlloc    uint64  // bytes of spans allocated this cycle; updated atomically
+	pagesSwept        uint64  // pages swept this cycle; updated atomically
+	sweepPagesPerByte float64 // proportional sweep ratio; written with lock, read without
+	// TODO(austin): pagesInUse should be a uintptr, but the 386
+	// compiler can't 8-byte align fields.
+
+	// Malloc stats.
+	largefree  uint64                  // bytes freed for large objects (>maxsmallsize)
+	nlargefree uint64                  // number of frees for large objects (>maxsmallsize)
+	nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
+
+	// range of addresses we might see in the heap
+	bitmap         uintptr // Points to one byte past the end of the bitmap
+	bitmap_mapped  uintptr
+	arena_start    uintptr
+	arena_used     uintptr // always mHeap_Map{Bits,Spans} before updating
+	arena_end      uintptr
+	arena_reserved bool
+
+	// central free lists for small size classes.
+	// the padding makes sure that the MCentrals are
+	// spaced CacheLineSize bytes apart, so that each MCentral.lock
+	// gets its own cache line.
+	central [_NumSizeClasses]struct {
+		mcentral mcentral
+		pad      [sys.CacheLineSize]byte
+	}
+
+	spanalloc             fixalloc // allocator for span*
+	cachealloc            fixalloc // allocator for mcache*
+	specialfinalizeralloc fixalloc // allocator for specialfinalizer*
+	specialprofilealloc   fixalloc // allocator for specialprofile*
+	speciallock           mutex    // lock for special record allocators.
+}
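
The sweepSpans indexing described above is pure arithmetic on sweepgen:
since sweepgen advances by 2 each cycle, sweepgen/2%2 alternates between 0
and 1, so the swept and unswept stacks trade roles each cycle without being
copied. A minimal sketch of that alternation:

	package main

	import "fmt"

	func main() {
		for sweepgen := uint32(0); sweepgen <= 8; sweepgen += 2 {
			swept := sweepgen / 2 % 2
			unswept := 1 - swept
			fmt.Printf("sweepgen=%d  swept=sweepSpans[%d]  unswept=sweepSpans[%d]\n",
				sweepgen, swept, unswept)
		}
	}
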
+
+var mheap_ mheap
+
+// An MSpan is a run of pages.
+//
+// When a MSpan is in the heap free list, state == MSpanFree
+// and heapmap(s->start) == span, heapmap(s->start+s->npages-1) == span.
+//
+// When a MSpan is allocated, state == MSpanInUse or MSpanStack
+// and heapmap(i) == span for all s->start <= i < s->start+s->npages.
+
+// Every MSpan is in one doubly-linked list,
+// either one of the MHeap's free lists or one of the
+// MCentral's span lists.
+
+// An MSpan representing actual memory has state _MSpanInUse,
+// _MSpanStack, or _MSpanFree. Transitions between these states are
+// constrained as follows:
+//
+// * A span may transition from free to in-use or stack during any GC
+//   phase.
+//
+// * During sweeping (gcphase == _GCoff), a span may transition from
+//   in-use to free (as a result of sweeping) or stack to free (as a
+//   result of stacks being freed).
+//
+// * During GC (gcphase != _GCoff), a span *must not* transition from
+//   stack or in-use to free. Because concurrent GC may read a pointer
+//   and then look up its span, the span state must be monotonic.
+type mSpanState uint8
+
+const (
+	_MSpanDead  mSpanState = iota
+	_MSpanInUse            // allocated for garbage collected heap
+	_MSpanStack            // allocated for use by stack allocator
+	_MSpanFree
+)
+
+// mSpanStateNames are the names of the span states, indexed by
+// mSpanState.
+var mSpanStateNames = []string{
+	"_MSpanDead",
+	"_MSpanInUse",
+	"_MSpanStack",
+	"_MSpanFree",
+}
+
+// mSpanList heads a linked list of spans.
+//
+//go:notinheap
+type mSpanList struct {
+	first *mspan // first span in list, or nil if none
+	last  *mspan // last span in list, or nil if none
+}
+
+//go:notinheap
+type mspan struct {
+	next *mspan     // next span in list, or nil if none
+	prev *mspan     // previous span in list, or nil if none
+	list *mSpanList // For debugging. TODO: Remove.
+
+	startAddr     uintptr   // address of first byte of span aka s.base()
+	npages        uintptr   // number of pages in span
+	stackfreelist gclinkptr // list of free stacks, avoids overloading freelist
+
+	// freeindex is the slot index between 0 and nelems at which to begin scanning
+	// for the next free object in this span.
+	// Each allocation scans allocBits starting at freeindex until it encounters a 0
+	// indicating a free object. freeindex is then adjusted so that subsequent scans begin
+	// just past the newly discovered free object.
+	//
+	// If freeindex == nelem, this span has no free objects.
+	//
+	// allocBits is a bitmap of objects in this span.
+	// If n >= freeindex and allocBits[n/8] & (1<<(n%8)) is 0
+	// then object n is free;
+	// otherwise, object n is allocated. Bits starting at nelem are
+	// undefined and should never be referenced.
+	//
+	// Object n starts at address n*elemsize + (start << pageShift).
+	freeindex uintptr
+	// TODO: Look up nelems from sizeclass and remove this field if it
+	// helps performance.
+	nelems uintptr // number of objects in the span.
+
+	// Cache of the allocBits at freeindex. allocCache is shifted
+	// such that the lowest bit corresponds to the bit freeindex.
+	// allocCache holds the complement of allocBits, thus allowing
+	// ctz (count trailing zero) to use it directly.
+	// allocCache may contain bits beyond s.nelems; the caller must ignore
+	// these.
+	allocCache uint64
+
+	// allocBits and gcmarkBits hold pointers to a span's mark and
+	// allocation bits. The pointers are 8 byte aligned.
+	// There are three arenas where this data is held.
+	// free: Dirty arenas that are no longer accessed
+	//       and can be reused.
+	// next: Holds information to be used in the next GC cycle.
+	// current: Information being used during this GC cycle.
+	// previous: Information being used during the last GC cycle.
+	// A new GC cycle starts with the call to finishsweep_m.
+	// finishsweep_m moves the previous arena to the free arena,
+	// the current arena to the previous arena, and
+	// the next arena to the current arena.
+	// The next arena is populated as the spans request
+	// memory to hold gcmarkBits for the next GC cycle as well
+	// as allocBits for newly allocated spans.
+	//
+	// The pointer arithmetic is done "by hand" instead of using
+	// arrays to avoid bounds checks along critical performance
+	// paths.
+	// The sweep will free the old allocBits and set allocBits to the
+	// gcmarkBits. The gcmarkBits are replaced with a fresh zeroed
+	// out memory.
+	allocBits  *uint8
+	gcmarkBits *uint8
+
+	// sweep generation:
+	// if sweepgen == h->sweepgen - 2, the span needs sweeping
+	// if sweepgen == h->sweepgen - 1, the span is currently being swept
+	// if sweepgen == h->sweepgen, the span is swept and ready to use
+	// h->sweepgen is incremented by 2 after every GC
+
+	sweepgen    uint32
+	divMul      uint16     // for divide by elemsize - divMagic.mul
+	baseMask    uint16     // if non-0, elemsize is a power of 2, & this will get object allocation base
+	allocCount  uint16     // capacity - number of objects in freelist
+	sizeclass   uint8      // size class
+	incache     bool       // being used by an mcache
+	state       mSpanState // mspaninuse etc
+	needzero    uint8      // needs to be zeroed before allocation
+	divShift    uint8      // for divide by elemsize - divMagic.shift
+	divShift2   uint8      // for divide by elemsize - divMagic.shift2
+	elemsize    uintptr    // computed from sizeclass or from npages
+	unusedsince int64      // first time spotted by gc in mspanfree state
+	npreleased  uintptr    // number of pages released to the os
+	limit       uintptr    // end of data in span
+	speciallock mutex      // guards specials list
+	specials    *special   // linked list of special records sorted by offset.
+}
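
The freeindex/allocBits rule spelled out in the mspan comment above (object
n is free if n >= freeindex and its allocBits bit is clear) can be written
out directly. The helper name and the bitmap contents below are illustrative
only.

	package main

	import "fmt"

	// objectIsFree restates the test from the mspan comment: per that
	// comment, slots below freeindex are not considered free here.
	func objectIsFree(allocBits []uint8, freeindex, n uintptr) bool {
		if n < freeindex {
			return false
		}
		return allocBits[n/8]&(1<<(n%8)) == 0
	}

	func main() {
		allocBits := []uint8{0x05, 0x00} // objects 0 and 2 allocated
		freeindex := uintptr(1)
		for n := uintptr(0); n < 5; n++ {
			fmt.Printf("object %d free: %v\n", n, objectIsFree(allocBits, freeindex, n))
		}
	}
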
+
+func (s *mspan) base() uintptr {
+	return s.startAddr
+}
+
+func (s *mspan) layout() (size, n, total uintptr) {
+	total = s.npages << _PageShift
+	size = s.elemsize
+	if size > 0 {
+		n = total / size
+	}
+	return
+}
+
+func recordspan(vh unsafe.Pointer, p unsafe.Pointer) {
+	h := (*mheap)(vh)
+	s := (*mspan)(p)
+	if len(h.allspans) >= cap(h.allspans) {
+		n := 64 * 1024 / sys.PtrSize
+		if n < cap(h.allspans)*3/2 {
+			n = cap(h.allspans) * 3 / 2
+		}
+		var new []*mspan
+		sp := (*slice)(unsafe.Pointer(&new))
+		sp.array = sysAlloc(uintptr(n)*sys.PtrSize, &memstats.other_sys)
+		if sp.array == nil {
+			throw("runtime: cannot allocate memory")
+		}
+		sp.len = len(h.allspans)
+		sp.cap = n
+		if len(h.allspans) > 0 {
+			copy(new, h.allspans)
+		}
+		oldAllspans := h.allspans
+		h.allspans = new
+		if len(oldAllspans) != 0 {
+			sysFree(unsafe.Pointer(&oldAllspans[0]), uintptr(cap(oldAllspans))*unsafe.Sizeof(oldAllspans[0]), &memstats.other_sys)
+		}
+	}
+	h.allspans = append(h.allspans, s)
+}
+
+// inheap reports whether b is a pointer into a (potentially dead) heap object.
+// It returns false for pointers into stack spans.
+// Non-preemptible because it is used by write barriers.
+//go:nowritebarrier
+//go:nosplit
+func inheap(b uintptr) bool {
+	if b == 0 || b < mheap_.arena_start || b >= mheap_.arena_used {
+		return false
+	}
+	// Not a beginning of a block, consult span table to find the block beginning.
+	s := mheap_.spans[(b-mheap_.arena_start)>>_PageShift]
+	if s == nil || b < s.base() || b >= s.limit || s.state != mSpanInUse {
+		return false
+	}
+	return true
+}
+
+// inHeapOrStack is a variant of inheap that returns true for pointers into stack spans.
+//go:nowritebarrier
+//go:nosplit
+func inHeapOrStack(b uintptr) bool {
+	if b == 0 || b < mheap_.arena_start || b >= mheap_.arena_used {
+		return false
+	}
+	// Not a beginning of a block, consult span table to find the block beginning.
+	s := mheap_.spans[(b-mheap_.arena_start)>>_PageShift]
+	if s == nil || b < s.base() {
+		return false
+	}
+	switch s.state {
+	case mSpanInUse:
+		return b < s.limit
+	case _MSpanStack:
+		return b < s.base()+s.npages<<_PageShift
+	default:
+		return false
+	}
+}
+
+// TODO: spanOf and spanOfUnchecked are open-coded in a lot of places.
+// Use the functions instead.
+
+// spanOf returns the span of p. If p does not point into the heap or
+// no span contains p, spanOf returns nil.
+func spanOf(p uintptr) *mspan {
+	if p == 0 || p < mheap_.arena_start || p >= mheap_.arena_used {
+		return nil
+	}
+	return spanOfUnchecked(p)
+}
+
+// spanOfUnchecked is equivalent to spanOf, but the caller must ensure
+// that p points into the heap (that is, mheap_.arena_start <= p <
+// mheap_.arena_used).
+func spanOfUnchecked(p uintptr) *mspan {
+	return mheap_.spans[(p-mheap_.arena_start)>>_PageShift]
+}
+
+func mlookup(v uintptr, base *uintptr, size *uintptr, sp **mspan) int32 {
+	_g_ := getg()
+
+	_g_.m.mcache.local_nlookup++
+	if sys.PtrSize == 4 && _g_.m.mcache.local_nlookup >= 1<<30 {
+		// purge cache stats to prevent overflow
+		lock(&mheap_.lock)
+		purgecachedstats(_g_.m.mcache)
+		unlock(&mheap_.lock)
+	}
+
+	s := mheap_.lookupMaybe(unsafe.Pointer(v))
+	if sp != nil {
+		*sp = s
+	}
+	if s == nil {
+		if base != nil {
+			*base = 0
+		}
+		if size != nil {
+			*size = 0
+		}
+		return 0
+	}
+
+	p := s.base()
+	if s.sizeclass == 0 {
+		// Large object.
+		if base != nil {
+			*base = p
+		}
+		if size != nil {
+			*size = s.npages << _PageShift
+		}
+		return 1
+	}
+
+	n := s.elemsize
+	if base != nil {
+		i := (v - p) / n
+		*base = p + i*n
+	}
+	if size != nil {
+		*size = n
+	}
+
+	return 1
+}
+
+// Initialize the heap.
+func (h *mheap) init(spansStart, spansBytes uintptr) {
+	h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
+	h.cachealloc.init(unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys)
+	h.specialfinalizeralloc.init(unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys)
+	h.specialprofilealloc.init(unsafe.Sizeof(specialprofile{}), nil, nil, &memstats.other_sys)
+
+	// Don't zero mspan allocations. Background sweeping can
+	// inspect a span concurrently with allocating it, so it's
+	// important that the span's sweepgen survive across freeing
+	// and re-allocating a span to prevent background sweeping
+	// from improperly cas'ing it from 0.
+	//
+	// This is safe because mspan contains no heap pointers.
+	h.spanalloc.zero = false
+
+	// h->mapcache needs no init
+	for i := range h.free {
+		h.free[i].init()
+		h.busy[i].init()
+	}
+
+	h.freelarge.init()
+	h.busylarge.init()
+	for i := range h.central {
+		h.central[i].mcentral.init(int32(i))
+	}
+
+	sp := (*slice)(unsafe.Pointer(&h.spans))
+	sp.array = unsafe.Pointer(spansStart)
+	sp.len = 0
+	sp.cap = int(spansBytes / sys.PtrSize)
+}
+
+// mapSpans makes sure that the spans are mapped
+// up to the new value of arena_used.
+//
+// It must be called with the expected new value of arena_used,
+// *before* h.arena_used has been updated.
+// Waiting to update arena_used until after the memory has been mapped
+// avoids faults when other threads try to access the spans array immediately
+// after observing the change to arena_used.
+func (h *mheap) mapSpans(arena_used uintptr) {
+	// Map spans array, PageSize at a time.
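+	// Every _PageSize bytes of arena needs one *mspan entry (sys.PtrSize
+	// bytes), and the mapping is extended in physPageSize units.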
+	n := arena_used
+	n -= h.arena_start
+	n = n / _PageSize * sys.PtrSize
+	n = round(n, physPageSize)
+	need := n / unsafe.Sizeof(h.spans[0])
+	have := uintptr(len(h.spans))
+	if have >= need {
+		return
+	}
+	h.spans = h.spans[:need]
+	sysMap(unsafe.Pointer(&h.spans[have]), (need-have)*unsafe.Sizeof(h.spans[0]), h.arena_reserved, &memstats.other_sys)
+}
+
+// Sweeps spans in list until reclaims at least npages into heap.
+// Returns the actual number of pages reclaimed.
+func (h *mheap) reclaimList(list *mSpanList, npages uintptr) uintptr {
+	n := uintptr(0)
+	sg := mheap_.sweepgen
+retry:
+	for s := list.first; s != nil; s = s.next {
+		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+			list.remove(s)
+			// swept spans are at the end of the list
+			list.insertBack(s)
+			unlock(&h.lock)
+			snpages := s.npages
+			if s.sweep(false) {
+				n += snpages
+			}
+			lock(&h.lock)
+			if n >= npages {
+				return n
+			}
+			// the span could have been moved elsewhere
+			goto retry
+		}
+		if s.sweepgen == sg-1 {
+			// the span is being swept by the background sweeper, skip
+			continue
+		}
+		// already swept empty span,
+		// all subsequent ones must also be either swept or in the process of being swept
+		break
+	}
+	return n
+}
+
+// Sweeps and reclaims at least npage pages into heap.
+// Called before allocating npage pages.
+func (h *mheap) reclaim(npage uintptr) {
+	// First try to sweep busy spans with large objects of size >= npage;
+	// this has a good chance of reclaiming the necessary space.
+	for i := int(npage); i < len(h.busy); i++ {
+		if h.reclaimList(&h.busy[i], npage) != 0 {
+			return // Bingo!
+		}
+	}
+
+	// Then -- even larger objects.
+	if h.reclaimList(&h.busylarge, npage) != 0 {
+		return // Bingo!
+	}
+
+	// Now try smaller objects.
+	// One such object is not enough, so we need to reclaim several of them.
+	reclaimed := uintptr(0)
+	for i := 0; i < int(npage) && i < len(h.busy); i++ {
+		reclaimed += h.reclaimList(&h.busy[i], npage-reclaimed)
+		if reclaimed >= npage {
+			return
+		}
+	}
+
+	// Now sweep everything that is not yet swept.
+	unlock(&h.lock)
+	for {
+		n := sweepone()
+		if n == ^uintptr(0) { // all spans are swept
+			break
+		}
+		reclaimed += n
+		if reclaimed >= npage {
+			break
+		}
+	}
+	lock(&h.lock)
+}
+
+// Allocate a new span of npage pages from the heap for GC'd memory
+// and record its size class in the HeapMap and HeapMapCache.
+func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan {
+	_g_ := getg()
+	lock(&h.lock)
+
+	// To prevent excessive heap growth, before allocating n pages
+	// we need to sweep and reclaim at least n pages.
+	if h.sweepdone == 0 {
+		// TODO(austin): This tends to sweep a large number of
+		// spans in order to find a few completely free spans
+		// (for example, in the garbage benchmark, this sweeps
+		// ~30x the number of pages it's trying to allocate).
+		// If GC kept a bit for whether there were any marks
+		// in a span, we could release these free spans
+		// at the end of GC and eliminate this entirely.
+		h.reclaim(npage)
+	}
+
+	// transfer stats from cache to global
+	memstats.heap_scan += uint64(_g_.m.mcache.local_scan)
+	_g_.m.mcache.local_scan = 0
+	memstats.tinyallocs += uint64(_g_.m.mcache.local_tinyallocs)
+	_g_.m.mcache.local_tinyallocs = 0
+
+	s := h.allocSpanLocked(npage)
+	if s != nil {
+		// Record span info, because gc needs to be
+		// able to map an interior pointer to its containing span.
+		atomic.Store(&s.sweepgen, h.sweepgen)
+		h.sweepSpans[h.sweepgen/2%2].push(s) // Add to swept in-use list.
+		s.state = _MSpanInUse
+		s.allocCount = 0
+		s.sizeclass = uint8(sizeclass)
+		if sizeclass == 0 {
+			s.elemsize = s.npages << _PageShift
+			s.divShift = 0
+			s.divMul = 0
+			s.divShift2 = 0
+			s.baseMask = 0
+		} else {
+			s.elemsize = uintptr(class_to_size[sizeclass])
+			m := &class_to_divmagic[sizeclass]
+			s.divShift = m.shift
+			s.divMul = m.mul
+			s.divShift2 = m.shift2
+			s.baseMask = m.baseMask
+		}
+
+		// update stats, sweep lists
+		h.pagesInUse += uint64(npage)
+		if large {
+			memstats.heap_objects++
+			atomic.Xadd64(&memstats.heap_live, int64(npage<<_PageShift))
+			// Swept spans are at the end of lists.
+			if s.npages < uintptr(len(h.free)) {
+				h.busy[s.npages].insertBack(s)
+			} else {
+				h.busylarge.insertBack(s)
+			}
+		}
+	}
+	// heap_scan and heap_live were updated.
+	if gcBlackenEnabled != 0 {
+		gcController.revise()
+	}
+
+	if trace.enabled {
+		traceHeapAlloc()
+	}
+
+	// h.spans is accessed concurrently without synchronization
+	// from other threads. Hence, there must be a store/store
+	// barrier here to ensure the writes to h.spans above happen
+	// before the caller can publish a pointer p to an object
+	// allocated from s. As soon as this happens, the garbage
+	// collector running on another processor could read p and
+	// look up s in h.spans. The unlock acts as the barrier to
+	// order these writes. On the read side, the data dependency
+	// between p and the index in h.spans orders the reads.
+	unlock(&h.lock)
+	return s
+}
+
+func (h *mheap) alloc(npage uintptr, sizeclass int32, large bool, needzero bool) *mspan {
+	// Don't do any operations that lock the heap on the G stack.
+	// It might trigger stack growth, and the stack growth code needs
+	// to be able to allocate heap.
+	var s *mspan
+	systemstack(func() {
+		s = h.alloc_m(npage, sizeclass, large)
+	})
+
+	if s != nil {
+		if needzero && s.needzero != 0 {
+			memclrNoHeapPointers(unsafe.Pointer(s.base()), s.npages<<_PageShift)
+		}
+		s.needzero = 0
+	}
+	return s
+}
+
+func (h *mheap) allocStack(npage uintptr) *mspan {
+	_g_ := getg()
+	if _g_ != _g_.m.g0 {
+		throw("mheap_allocstack not on g0 stack")
+	}
+	lock(&h.lock)
+	s := h.allocSpanLocked(npage)
+	if s != nil {
+		s.state = _MSpanStack
+		s.stackfreelist = 0
+		s.allocCount = 0
+		memstats.stacks_inuse += uint64(s.npages << _PageShift)
+	}
+
+	// This unlock acts as a release barrier. See mheap.alloc_m.
+	unlock(&h.lock)
+	return s
+}
+
+// Allocates a span of the given size.  h must be locked.
+// The returned span has been removed from the
+// free list, but its state is still MSpanFree.
+func (h *mheap) allocSpanLocked(npage uintptr) *mspan {
+	var list *mSpanList
+	var s *mspan
+
+	// Try in fixed-size lists up to max.
+	for i := int(npage); i < len(h.free); i++ {
+		list = &h.free[i]
+		if !list.isEmpty() {
+			s = list.first
+			goto HaveSpan
+		}
+	}
+
+	// Best fit in list of large spans.
+	list = &h.freelarge
+	s = h.allocLarge(npage)
+	if s == nil {
+		if !h.grow(npage) {
+			return nil
+		}
+		s = h.allocLarge(npage)
+		if s == nil {
+			return nil
+		}
+	}
+
+HaveSpan:
+	// Mark span in use.
+	if s.state != _MSpanFree {
+		throw("MHeap_AllocLocked - MSpan not free")
+	}
+	if s.npages < npage {
+		throw("MHeap_AllocLocked - bad npages")
+	}
+	list.remove(s)
+	if s.inList() {
+		throw("still in list")
+	}
+	if s.npreleased > 0 {
+		sysUsed(unsafe.Pointer(s.base()), s.npages<<_PageShift)
+		memstats.heap_released -= uint64(s.npreleased << _PageShift)
+		s.npreleased = 0
+	}
+
+	if s.npages > npage {
+		// Trim extra and put it back in the heap.
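+		// Split off the trailing s.npages-npage pages as a new span t,
+		// update the span map for both halves, and return t to the free lists.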
+		t := (*mspan)(h.spanalloc.alloc())
+		t.init(s.base()+npage<<_PageShift, s.npages-npage)
+		s.npages = npage
+		p := (t.base() - h.arena_start) >> _PageShift
+		if p > 0 {
+			h.spans[p-1] = s
+		}
+		h.spans[p] = t
+		h.spans[p+t.npages-1] = t
+		t.needzero = s.needzero
+		s.state = _MSpanStack // prevent coalescing with s
+		t.state = _MSpanStack
+		h.freeSpanLocked(t, false, false, s.unusedsince)
+		s.state = _MSpanFree
+	}
+	s.unusedsince = 0
+
+	p := (s.base() - h.arena_start) >> _PageShift
+	for n := uintptr(0); n < npage; n++ {
+		h.spans[p+n] = s
+	}
+
+	memstats.heap_inuse += uint64(npage << _PageShift)
+	memstats.heap_idle -= uint64(npage << _PageShift)
+
+	//println("spanalloc", hex(s.start<<_PageShift))
+	if s.inList() {
+		throw("still in list")
+	}
+	return s
+}
+
+// Allocate a span of at least npage pages from the list of large spans.
+func (h *mheap) allocLarge(npage uintptr) *mspan {
+	return bestFit(&h.freelarge, npage, nil)
+}
+
+// Search list for smallest span with >= npage pages.
+// If there are multiple smallest spans, take the one
+// with the earliest starting address.
+func bestFit(list *mSpanList, npage uintptr, best *mspan) *mspan {
+	for s := list.first; s != nil; s = s.next {
+		if s.npages < npage {
+			continue
+		}
+		if best == nil || s.npages < best.npages || (s.npages == best.npages && s.base() < best.base()) {
+			best = s
+		}
+	}
+	return best
+}
+
+// Try to add at least npage pages of memory to the heap,
+// returning whether it worked.
+//
+// h must be locked.
+func (h *mheap) grow(npage uintptr) bool {
+	// Ask for a big chunk, to reduce the number of mappings
+	// the operating system needs to track; also amortizes
+	// the overhead of an operating system mapping.
+	// Allocate a multiple of 64kB.
+	npage = round(npage, (64<<10)/_PageSize)
+	ask := npage << _PageShift
+	if ask < _HeapAllocChunk {
+		ask = _HeapAllocChunk
+	}
+
+	v := h.sysAlloc(ask)
+	if v == nil {
+		if ask > npage<<_PageShift {
+			ask = npage << _PageShift
+			v = h.sysAlloc(ask)
+		}
+		if v == nil {
+			print("runtime: out of memory: cannot allocate ", ask, "-byte block (", memstats.heap_sys, " in use)\n")
+			return false
+		}
+	}
+
+	// Create a fake "in use" span and free it, so that the
+	// right coalescing happens.
+	s := (*mspan)(h.spanalloc.alloc())
+	s.init(uintptr(v), ask>>_PageShift)
+	p := (s.base() - h.arena_start) >> _PageShift
+	for i := p; i < p+s.npages; i++ {
+		h.spans[i] = s
+	}
+	atomic.Store(&s.sweepgen, h.sweepgen)
+	s.state = _MSpanInUse
+	h.pagesInUse += uint64(s.npages)
+	h.freeSpanLocked(s, false, true, 0)
+	return true
+}
+
+// Look up the span at the given address.
+// Address is guaranteed to be in map
+// and is guaranteed to be start or end of span.
+func (h *mheap) lookup(v unsafe.Pointer) *mspan {
+	p := uintptr(v)
+	p -= h.arena_start
+	return h.spans[p>>_PageShift]
+}
+
+// Look up the span at the given address.
+// Address is *not* guaranteed to be in map
+// and may be anywhere in the span.
+// Map entries for the middle of a span are only
+// valid for allocated spans. Free spans may have
+// other garbage in their middles, so we have to
+// check for that.
+func (h *mheap) lookupMaybe(v unsafe.Pointer) *mspan {
+	if uintptr(v) < h.arena_start || uintptr(v) >= h.arena_used {
+		return nil
+	}
+	s := h.spans[(uintptr(v)-h.arena_start)>>_PageShift]
+	if s == nil || uintptr(v) < s.base() || uintptr(v) >= uintptr(unsafe.Pointer(s.limit)) || s.state != _MSpanInUse {
+		return nil
+	}
+	return s
+}
+
+// Free the span back into the heap.
+func (h *mheap) freeSpan(s *mspan, acct int32) {
+	systemstack(func() {
+		mp := getg().m
+		lock(&h.lock)
+		memstats.heap_scan += uint64(mp.mcache.local_scan)
+		mp.mcache.local_scan = 0
+		memstats.tinyallocs += uint64(mp.mcache.local_tinyallocs)
+		mp.mcache.local_tinyallocs = 0
+		if msanenabled {
+			// Tell msan that this entire span is no longer in use.
+			base := unsafe.Pointer(s.base())
+			bytes := s.npages << _PageShift
+			msanfree(base, bytes)
+		}
+		if acct != 0 {
+			memstats.heap_objects--
+		}
+		if gcBlackenEnabled != 0 {
+			// heap_scan changed.
+			gcController.revise()
+		}
+		h.freeSpanLocked(s, true, true, 0)
+		unlock(&h.lock)
+	})
+}
+
+func (h *mheap) freeStack(s *mspan) {
+	_g_ := getg()
+	if _g_ != _g_.m.g0 {
+		throw("mheap_freestack not on g0 stack")
+	}
+	s.needzero = 1
+	lock(&h.lock)
+	memstats.stacks_inuse -= uint64(s.npages << _PageShift)
+	h.freeSpanLocked(s, true, true, 0)
+	unlock(&h.lock)
+}
+
+// s must be on a busy list (h.busy or h.busylarge) or unlinked.
+func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool, unusedsince int64) {
+	switch s.state {
+	case _MSpanStack:
+		if s.allocCount != 0 {
+			throw("MHeap_FreeSpanLocked - invalid stack free")
+		}
+	case _MSpanInUse:
+		if s.allocCount != 0 || s.sweepgen != h.sweepgen {
+			print("MHeap_FreeSpanLocked - span ", s, " ptr ", hex(s.base()), " allocCount ", s.allocCount, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n")
+			throw("MHeap_FreeSpanLocked - invalid free")
+		}
+		h.pagesInUse -= uint64(s.npages)
+	default:
+		throw("MHeap_FreeSpanLocked - invalid span state")
+	}
+
+	if acctinuse {
+		memstats.heap_inuse -= uint64(s.npages << _PageShift)
+	}
+	if acctidle {
+		memstats.heap_idle += uint64(s.npages << _PageShift)
+	}
+	s.state = _MSpanFree
+	if s.inList() {
+		h.busyList(s.npages).remove(s)
+	}
+
+	// Stamp newly unused spans. The scavenger will use that
+	// info to potentially give back some pages to the OS.
+	s.unusedsince = unusedsince
+	if unusedsince == 0 {
+		s.unusedsince = nanotime()
+	}
+	s.npreleased = 0
+
+	// Coalesce with earlier, later spans.
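+	// A free neighbor immediately before or after s in the address space
+	// is absorbed into s and its mspan is returned to spanalloc.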
+	p := (s.base() - h.arena_start) >> _PageShift
+	if p > 0 {
+		t := h.spans[p-1]
+		if t != nil && t.state == _MSpanFree {
+			s.startAddr = t.startAddr
+			s.npages += t.npages
+			s.npreleased = t.npreleased // absorb released pages
+			s.needzero |= t.needzero
+			p -= t.npages
+			h.spans[p] = s
+			h.freeList(t.npages).remove(t)
+			t.state = _MSpanDead
+			h.spanalloc.free(unsafe.Pointer(t))
+		}
+	}
+	if (p + s.npages) < uintptr(len(h.spans)) {
+		t := h.spans[p+s.npages]
+		if t != nil && t.state == _MSpanFree {
+			s.npages += t.npages
+			s.npreleased += t.npreleased
+			s.needzero |= t.needzero
+			h.spans[p+s.npages-1] = s
+			h.freeList(t.npages).remove(t)
+			t.state = _MSpanDead
+			h.spanalloc.free(unsafe.Pointer(t))
+		}
+	}
+
+	// Insert s into appropriate list.
+	h.freeList(s.npages).insert(s)
+}
+
+func (h *mheap) freeList(npages uintptr) *mSpanList {
+	if npages < uintptr(len(h.free)) {
+		return &h.free[npages]
+	}
+	return &h.freelarge
+}
+
+func (h *mheap) busyList(npages uintptr) *mSpanList {
+	if npages < uintptr(len(h.free)) {
+		return &h.busy[npages]
+	}
+	return &h.busylarge
+}
+
+func scavengelist(list *mSpanList, now, limit uint64) uintptr {
+	if list.isEmpty() {
+		return 0
+	}
+
+	var sumreleased uintptr
+	for s := list.first; s != nil; s = s.next {
+		if (now-uint64(s.unusedsince)) > limit && s.npreleased != s.npages {
+			start := s.base()
+			end := start + s.npages<<_PageShift
+			if physPageSize > _PageSize {
+				// We can only release pages in
+				// physPageSize blocks, so round start
+				// and end in. (Otherwise, madvise
+				// will round them *out* and release
+				// more memory than we want.)
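+				// For example, with 8KB runtime pages and a 64KB
+				// physical page size, a 3-page (24KB) span can never
+				// contain a complete physical page and is skipped.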
+				start = (start + physPageSize - 1) &^ (physPageSize - 1)
+				end &^= physPageSize - 1
+				if end <= start {
+					// start and end don't span a
+					// whole physical page.
+					continue
+				}
+			}
+			len := end - start
+
+			released := len - (s.npreleased << _PageShift)
+			if physPageSize > _PageSize && released == 0 {
+				continue
+			}
+			memstats.heap_released += uint64(released)
+			sumreleased += released
+			s.npreleased = len >> _PageShift
+			sysUnused(unsafe.Pointer(start), len)
+		}
+	}
+	return sumreleased
+}
+
+func (h *mheap) scavenge(k int32, now, limit uint64) {
+	lock(&h.lock)
+	var sumreleased uintptr
+	for i := 0; i < len(h.free); i++ {
+		sumreleased += scavengelist(&h.free[i], now, limit)
+	}
+	sumreleased += scavengelist(&h.freelarge, now, limit)
+	unlock(&h.lock)
+
+	if debug.gctrace > 0 {
+		if sumreleased > 0 {
+			print("scvg", k, ": ", sumreleased>>20, " MB released\n")
+		}
+		// TODO(dvyukov): these stats are incorrect as we don't subtract stack usage from heap.
+		// But we can't call ReadMemStats on g0 holding locks.
+		print("scvg", k, ": inuse: ", memstats.heap_inuse>>20, ", idle: ", memstats.heap_idle>>20, ", sys: ", memstats.heap_sys>>20, ", released: ", memstats.heap_released>>20, ", consumed: ", (memstats.heap_sys-memstats.heap_released)>>20, " (MB)\n")
+	}
+}
+
+//go:linkname runtime_debug_freeOSMemory runtime_debug.freeOSMemory
+func runtime_debug_freeOSMemory() {
+	gcStart(gcForceBlockMode, false)
+	systemstack(func() { mheap_.scavenge(-1, ^uint64(0), 0) })
+}
+
+// Initialize a new span with the given start and npages.
+func (span *mspan) init(base uintptr, npages uintptr) {
+	// span is *not* zeroed.
+	span.next = nil
+	span.prev = nil
+	span.list = nil
+	span.startAddr = base
+	span.npages = npages
+	span.allocCount = 0
+	span.sizeclass = 0
+	span.incache = false
+	span.elemsize = 0
+	span.state = _MSpanDead
+	span.unusedsince = 0
+	span.npreleased = 0
+	span.speciallock.key = 0
+	span.specials = nil
+	span.needzero = 0
+	span.freeindex = 0
+	span.allocBits = nil
+	span.gcmarkBits = nil
+}
+
+func (span *mspan) inList() bool {
+	return span.list != nil
+}
+
+// Initialize an empty doubly-linked list.
+func (list *mSpanList) init() {
+	list.first = nil
+	list.last = nil
+}
+
+func (list *mSpanList) remove(span *mspan) {
+	if span.list != list {
+		println("runtime: failed MSpanList_Remove", span, span.prev, span.list, list)
+		throw("MSpanList_Remove")
+	}
+	if list.first == span {
+		list.first = span.next
+	} else {
+		span.prev.next = span.next
+	}
+	if list.last == span {
+		list.last = span.prev
+	} else {
+		span.next.prev = span.prev
+	}
+	span.next = nil
+	span.prev = nil
+	span.list = nil
+}
+
+func (list *mSpanList) isEmpty() bool {
+	return list.first == nil
+}
+
+func (list *mSpanList) insert(span *mspan) {
+	if span.next != nil || span.prev != nil || span.list != nil {
+		println("runtime: failed MSpanList_Insert", span, span.next, span.prev, span.list)
+		throw("MSpanList_Insert")
+	}
+	span.next = list.first
+	if list.first != nil {
+		// The list contains at least one span; link it in.
+		// The last span in the list doesn't change.
+		list.first.prev = span
+	} else {
+		// The list contains no spans, so this is also the last span.
+		list.last = span
+	}
+	list.first = span
+	span.list = list
+}
+
+func (list *mSpanList) insertBack(span *mspan) {
+	if span.next != nil || span.prev != nil || span.list != nil {
+		println("failed MSpanList_InsertBack", span, span.next, span.prev, span.list)
+		throw("MSpanList_InsertBack")
+	}
+	span.prev = list.last
+	if list.last != nil {
+		// The list contains at least one span.
+		list.last.next = span
+	} else {
+		// The list contains no spans, so this is also the first span.
+		list.first = span
+	}
+	list.last = span
+	span.list = list
+}
+
+const (
+	_KindSpecialFinalizer = 1
+	_KindSpecialProfile   = 2
+	// Note: The finalizer special must be first because if we're freeing
+	// an object, a finalizer special will cause the freeing operation
+	// to abort, and we want to keep the other special records around
+	// if that happens.
+)
+
+//go:notinheap
+type special struct {
+	next   *special // linked list in span
+	offset uint16   // span offset of object
+	kind   byte     // kind of special
+}
+
+// Adds the special record s to the list of special records for
+// the object p. All fields of s should be filled in except for
+// offset & next, which this routine will fill in.
+// Returns true if the special was successfully added, false otherwise.
+// (The add will fail only if a record with the same p and s->kind
+//  already exists.)
+func addspecial(p unsafe.Pointer, s *special) bool {
+	span := mheap_.lookupMaybe(p)
+	if span == nil {
+		throw("addspecial on invalid pointer")
+	}
+
+	// Ensure that the span is swept.
+	// Sweeping accesses the specials list w/o locks, so we have
+	// to synchronize with it. And it's just much safer.
+	mp := acquirem()
+	span.ensureSwept()
+
+	offset := uintptr(p) - span.base()
+	kind := s.kind
+
+	lock(&span.speciallock)
+
+	// Find splice point, check for existing record.
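+	// The specials list is kept sorted by (offset, kind), so stop at the
+	// first entry that sorts after the new record.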
+	t := &span.specials
+	for {
+		x := *t
+		if x == nil {
+			break
+		}
+		if offset == uintptr(x.offset) && kind == x.kind {
+			unlock(&span.speciallock)
+			releasem(mp)
+			return false // already exists
+		}
+		if offset < uintptr(x.offset) || (offset == uintptr(x.offset) && kind < x.kind) {
+			break
+		}
+		t = &x.next
+	}
+
+	// Splice in record, fill in offset.
+	s.offset = uint16(offset)
+	s.next = *t
+	*t = s
+	unlock(&span.speciallock)
+	releasem(mp)
+
+	return true
+}
+
+// Removes the Special record of the given kind for the object p.
+// Returns the record if the record existed, nil otherwise.
+// The caller must FixAlloc_Free the result.
+func removespecial(p unsafe.Pointer, kind uint8) *special {
+	span := mheap_.lookupMaybe(p)
+	if span == nil {
+		throw("removespecial on invalid pointer")
+	}
+
+	// Ensure that the span is swept.
+	// Sweeping accesses the specials list w/o locks, so we have
+	// to synchronize with it. And it's just much safer.
+	mp := acquirem()
+	span.ensureSwept()
+
+	offset := uintptr(p) - span.base()
+
+	lock(&span.speciallock)
+	t := &span.specials
+	for {
+		s := *t
+		if s == nil {
+			break
+		}
+		// This function is used for finalizers only, so we don't check for
+		// "interior" specials (p must be exactly equal to s->offset).
+		if offset == uintptr(s.offset) && kind == s.kind {
+			*t = s.next
+			unlock(&span.speciallock)
+			releasem(mp)
+			return s
+		}
+		t = &s.next
+	}
+	unlock(&span.speciallock)
+	releasem(mp)
+	return nil
+}
+
+// The described object has a finalizer set for it.
+//
+// specialfinalizer is allocated from non-GC'd memory, so any heap
+// pointers must be specially handled.
+//
+//go:notinheap
+type specialfinalizer struct {
+	special special
+	fn      *funcval  // May be a heap pointer.
+	ft      *functype // May be a heap pointer, but always live.
+	ot      *ptrtype  // May be a heap pointer, but always live.
+}
+
+// Adds a finalizer to the object p. Returns true if it succeeded.
+func addfinalizer(p unsafe.Pointer, f *funcval, ft *functype, ot *ptrtype) bool {
+	lock(&mheap_.speciallock)
+	s := (*specialfinalizer)(mheap_.specialfinalizeralloc.alloc())
+	unlock(&mheap_.speciallock)
+	s.special.kind = _KindSpecialFinalizer
+	s.fn = f
+	s.ft = ft
+	s.ot = ot
+	if addspecial(p, &s.special) {
+		// This is responsible for maintaining the same
+		// GC-related invariants as markrootSpans in any
+		// situation where it's possible that markrootSpans
+		// has already run but mark termination hasn't yet.
+		if gcphase != _GCoff {
+			_, base, _ := findObject(p)
+			mp := acquirem()
+			gcw := &mp.p.ptr().gcw
+			// Mark everything reachable from the object
+			// so it's retained for the finalizer.
+			scanobject(uintptr(base), gcw)
+			// Mark the finalizer itself, since the
+			// special isn't part of the GC'd heap.
+			scanblock(uintptr(unsafe.Pointer(&s.fn)), sys.PtrSize, &oneptrmask[0], gcw)
+			if gcBlackenPromptly {
+				gcw.dispose()
+			}
+			releasem(mp)
+		}
+		return true
+	}
+
+	// There was an old finalizer
+	lock(&mheap_.speciallock)
+	mheap_.specialfinalizeralloc.free(unsafe.Pointer(s))
+	unlock(&mheap_.speciallock)
+	return false
+}
+
+// Removes the finalizer (if any) from the object p.
+func removefinalizer(p unsafe.Pointer) {
+	s := (*specialfinalizer)(unsafe.Pointer(removespecial(p, _KindSpecialFinalizer)))
+	if s == nil {
+		return // there wasn't a finalizer to remove
+	}
+	lock(&mheap_.speciallock)
+	mheap_.specialfinalizeralloc.free(unsafe.Pointer(s))
+	unlock(&mheap_.speciallock)
+}
+
+// The described object is being heap profiled.
+//
+//go:notinheap
+type specialprofile struct {
+	special special
+	b       *bucket
+}
+
+// Set the heap profile bucket associated with addr to b.
+func setprofilebucket(p unsafe.Pointer, b *bucket) {
+	lock(&mheap_.speciallock)
+	s := (*specialprofile)(mheap_.specialprofilealloc.alloc())
+	unlock(&mheap_.speciallock)
+	s.special.kind = _KindSpecialProfile
+	s.b = b
+	if !addspecial(p, &s.special) {
+		throw("setprofilebucket: profile already set")
+	}
+}
+
+// Do whatever cleanup needs to be done to deallocate s. It has
+// already been unlinked from the MSpan specials list.
+func freespecial(s *special, p unsafe.Pointer, size uintptr) {
+	switch s.kind {
+	case _KindSpecialFinalizer:
+		sf := (*specialfinalizer)(unsafe.Pointer(s))
+		queuefinalizer(p, sf.fn, sf.ft, sf.ot)
+		lock(&mheap_.speciallock)
+		mheap_.specialfinalizeralloc.free(unsafe.Pointer(sf))
+		unlock(&mheap_.speciallock)
+	case _KindSpecialProfile:
+		sp := (*specialprofile)(unsafe.Pointer(s))
+		mProf_Free(sp.b, size)
+		lock(&mheap_.speciallock)
+		mheap_.specialprofilealloc.free(unsafe.Pointer(sp))
+		unlock(&mheap_.speciallock)
+	default:
+		throw("bad special kind")
+		panic("not reached")
+	}
+}
+
+const gcBitsChunkBytes = uintptr(64 << 10)
+const gcBitsHeaderBytes = unsafe.Sizeof(gcBitsHeader{})
+
+type gcBitsHeader struct {
+	free uintptr // free is the index into bits of the next free byte.
+	next uintptr // *gcBits triggers recursive type bug. (issue 14620)
+}
+
+//go:notinheap
+type gcBits struct {
+	// gcBitsHeader // side step recursive type bug (issue 14620) by including fields by hand.
+	free uintptr // free is the index into bits of the next free byte.
+	next *gcBits
+	bits [gcBitsChunkBytes - gcBitsHeaderBytes]uint8
+}
+
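+// gcBitsArenas tracks the arenas used for mark and allocation bits:
+// next receives mark bits for the upcoming cycle, current is the arena
+// the GC is currently marking into, previous holds allocation bits until
+// sweeping completes, and free is a list of recycled arenas.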
+var gcBitsArenas struct {
+	lock     mutex
+	free     *gcBits
+	next     *gcBits
+	current  *gcBits
+	previous *gcBits
+}
+
+// newMarkBits returns a pointer to 8 byte aligned bytes
+// to be used for a span's mark bits.
+func newMarkBits(nelems uintptr) *uint8 {
+	lock(&gcBitsArenas.lock)
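+	// One 64-bit word of mark bits covers 64 objects, so allocate 8 bytes
+	// per 64 elements, rounding nelems up.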
+	blocksNeeded := uintptr((nelems + 63) / 64)
+	bytesNeeded := blocksNeeded * 8
+	if gcBitsArenas.next == nil ||
+		gcBitsArenas.next.free+bytesNeeded > uintptr(len(gcBits{}.bits)) {
+		// Allocate a new arena.
+		fresh := newArena()
+		fresh.next = gcBitsArenas.next
+		gcBitsArenas.next = fresh
+	}
+	if gcBitsArenas.next.free >= gcBitsChunkBytes {
+		println("runtime: gcBitsArenas.next.free=", gcBitsArenas.next.free, gcBitsChunkBytes)
+		throw("markBits overflow")
+	}
+	result := &gcBitsArenas.next.bits[gcBitsArenas.next.free]
+	gcBitsArenas.next.free += bytesNeeded
+	unlock(&gcBitsArenas.lock)
+	return result
+}
+
+// newAllocBits returns a pointer to 8 byte aligned bytes
+// to be used for this span's alloc bits.
+// newAllocBits is used to provide newly initialized spans with
+// allocation bits. For spans that are not being initialized, the
+// mark bits are repurposed as allocation bits when the span is
+// swept.
+func newAllocBits(nelems uintptr) *uint8 {
+	return newMarkBits(nelems)
+}
+
+// nextMarkBitArenaEpoch establishes a new epoch for the arenas
+// holding the mark bits. The arenas are named relative to the
+// current GC cycle, which is demarcated by the call to finishweep_m.
+//
+// All current spans have been swept.
+// During that sweep each span allocated room for its gcmarkBits in
+// the gcBitsArenas.next block. gcBitsArenas.next becomes gcBitsArenas.current,
+// where the GC will mark objects; after each span is swept, these bits
+// will be used to allocate objects.
+// gcBitsArenas.current becomes gcBitsArenas.previous, where the spans'
+// gcAllocBits live until all the spans have been swept during this GC cycle.
+// Each span's sweep extinguishes its references to gcBitsArenas.previous
+// by pointing its gcAllocBits into gcBitsArenas.current.
+// gcBitsArenas.previous is then released to the gcBitsArenas.free list.
+func nextMarkBitArenaEpoch() {
+	lock(&gcBitsArenas.lock)
+	if gcBitsArenas.previous != nil {
+		if gcBitsArenas.free == nil {
+			gcBitsArenas.free = gcBitsArenas.previous
+		} else {
+			// Find end of previous arenas.
+			last := gcBitsArenas.previous
+			for last = gcBitsArenas.previous; last.next != nil; last = last.next {
+			}
+			last.next = gcBitsArenas.free
+			gcBitsArenas.free = gcBitsArenas.previous
+		}
+	}
+	gcBitsArenas.previous = gcBitsArenas.current
+	gcBitsArenas.current = gcBitsArenas.next
+	gcBitsArenas.next = nil // newMarkBits calls newArena when needed
+	unlock(&gcBitsArenas.lock)
+}
+
+// newArena allocates and zeroes a gcBits arena.
+func newArena() *gcBits {
+	var result *gcBits
+	if gcBitsArenas.free == nil {
+		result = (*gcBits)(sysAlloc(gcBitsChunkBytes, &memstats.gc_sys))
+		if result == nil {
+			throw("runtime: cannot allocate memory")
+		}
+	} else {
+		result = gcBitsArenas.free
+		gcBitsArenas.free = gcBitsArenas.free.next
+		memclrNoHeapPointers(unsafe.Pointer(result), gcBitsChunkBytes)
+	}
+	result.next = nil
+	// If result.bits is not 8 byte aligned, adjust the index so
+	// that &result.bits[result.free] is 8 byte aligned.
+	if uintptr(unsafe.Offsetof(gcBits{}.bits))&7 == 0 {
+		result.free = 0
+	} else {
+		result.free = 8 - (uintptr(unsafe.Pointer(&result.bits[0])) & 7)
+	}
+	return result
+}
diff --git a/libgo/go/runtime/mprof.go b/libgo/go/runtime/mprof.go
index 1bfdc39..87f84a7 100644
--- a/libgo/go/runtime/mprof.go
+++ b/libgo/go/runtime/mprof.go
@@ -12,15 +12,6 @@
 	"unsafe"
 )
 
-// Export temporarily for gccgo's C code to call:
-//go:linkname mProf_Malloc runtime.mProf_Malloc
-//go:linkname mProf_Free runtime.mProf_Free
-//go:linkname mProf_GC runtime.mProf_GC
-//go:linkname tracealloc runtime.tracealloc
-//go:linkname tracefree runtime.tracefree
-//go:linkname tracegc runtime.tracegc
-//go:linkname iterate_memprof runtime.iterate_memprof
-
 // NOTE(rsc): Everything here could use cas if contention became an issue.
 var proflock mutex
 
diff --git a/libgo/go/runtime/msize.go b/libgo/go/runtime/msize.go
new file mode 100644
index 0000000..438c987
--- /dev/null
+++ b/libgo/go/runtime/msize.go
@@ -0,0 +1,47 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Malloc small size classes.
+//
+// See malloc.go for overview.
+// See also mksizeclasses.go for how we decide what size classes to use.
+
+package runtime
+
+// sizeToClass(0 <= n <= MaxSmallSize) returns the size class,
+//	1 <= sizeclass < NumSizeClasses, for n.
+//	Size class 0 is reserved to mean "not small".
+//
+// The sizeToClass lookup is implemented using two arrays,
+// one mapping sizes <= 1024 to their class and one mapping
+// sizes >= 1024 and <= MaxSmallSize to their class.
+// All objects are 8-aligned, so the first array is indexed by
+// the size divided by 8 (rounded up).  Objects >= 1024 bytes
+// are 128-aligned, so the second array is indexed by the
+// size divided by 128 (rounded up).  The arrays are constants
+// in sizeclasses.go generated by mksizeclasses.go.
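+//
+// For example (assuming the generated tables), a request for 24 bytes is
+// mapped through size_to_class8[(24+smallSizeDiv-1)/smallSizeDiv], i.e.
+// size_to_class8[3], while a request for 1152 bytes is mapped through
+// size_to_class128[(1152-smallSizeMax+largeSizeDiv-1)/largeSizeDiv], i.e.
+// size_to_class128[1].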
+func sizeToClass(size uint32) uint32 {
+	if size > _MaxSmallSize {
+		throw("invalid size")
+	}
+	if size > smallSizeMax-8 {
+		return uint32(size_to_class128[(size-smallSizeMax+largeSizeDiv-1)/largeSizeDiv])
+	}
+	return uint32(size_to_class8[(size+smallSizeDiv-1)/smallSizeDiv])
+}
+
+// roundupsize returns the size of the memory block that mallocgc will
+// allocate if you ask for size bytes.
+func roundupsize(size uintptr) uintptr {
+	if size < _MaxSmallSize {
+		if size <= smallSizeMax-8 {
+			return uintptr(class_to_size[size_to_class8[(size+smallSizeDiv-1)/smallSizeDiv]])
+		} else {
+			return uintptr(class_to_size[size_to_class128[(size-smallSizeMax+largeSizeDiv-1)/largeSizeDiv]])
+		}
+	}
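+	// Large sizes are rounded up to a whole number of pages; the check
+	// below guards against overflow for sizes near the top of the address space.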
+	if size+_PageSize < size {
+		return size
+	}
+	return round(size, _PageSize)
+}
diff --git a/libgo/go/runtime/mstats.go b/libgo/go/runtime/mstats.go
index 178c32c..aa3cfef 100644
--- a/libgo/go/runtime/mstats.go
+++ b/libgo/go/runtime/mstats.go
@@ -467,10 +467,7 @@
 // For gccgo this is in runtime/mgc0.c.
 func updatememstats(stats *gcstats)
 
-/*
-For gccgo these are still in runtime/mgc0.c.
-
-//go:linkname readGCStats runtime/debug.readGCStats
+//go:linkname readGCStats runtime_debug.readGCStats
 func readGCStats(pauses *[]uint64) {
 	systemstack(func() {
 		readGCStats_m(pauses)
@@ -618,7 +615,6 @@
 		return
 	}
 	c.releaseAll()
-	stackcache_clear(c)
 }
 
 // flushallmcaches flushes the mcaches of all Ps.
@@ -652,8 +648,6 @@
 	}
 }
 
-*/
-
 // Atomically increases a given *system* memory stat. We are counting on this
 // stat never overflowing a uintptr, so this function must only be used for
 // system memory stats.
diff --git a/libgo/go/runtime/os_linux.go b/libgo/go/runtime/os_linux.go
index ad33486..e1a6a30 100644
--- a/libgo/go/runtime/os_linux.go
+++ b/libgo/go/runtime/os_linux.go
@@ -166,6 +166,3 @@
 	}
 	return i / 2
 }
-
-// Temporary for gccgo until we port mem_GOOS.go.
-var addrspace_vec [1]byte
diff --git a/libgo/go/runtime/proc.go b/libgo/go/runtime/proc.go
index eb69140..b28e26b 100644
--- a/libgo/go/runtime/proc.go
+++ b/libgo/go/runtime/proc.go
@@ -40,10 +40,6 @@
 // Function called by misc/cgo/test.
 //go:linkname lockedOSThread runtime.lockedOSThread
 
-// Functions temporarily in C that have not yet been ported.
-func gchelper()
-func mallocinit()
-
 // C functions for thread and context management.
 func newosproc(*m)
 func malg(bool, bool, *unsafe.Pointer, *uintptr) *g
@@ -454,6 +450,7 @@
 
 	sched.maxmcount = 10000
 
+	tracebackinit()
 	mallocinit()
 	mcommoninit(_g_.m)
 	alginit() // maps must not be used before this call
@@ -769,6 +766,122 @@
 	}
 }
 
+// scang blocks until gp's stack has been scanned.
+// It might be scanned by scang or it might be scanned by the goroutine itself.
+// Either way, the stack scan has completed when scang returns.
+func scang(gp *g, gcw *gcWork) {
+	// Invariant: we (the caller, markroot for a specific goroutine) own gp.gcscandone.
+	// Nothing is racing with us now, but gcscandone might be set to true left over
+	// from an earlier round of stack scanning (we scan twice per GC).
+	// We use gcscandone to record whether the scan has been done during this round.
+	// It is important that the scan happens exactly once: if called twice,
+	// the installation of stack barriers will detect the double scan and die.
+
+	gp.gcscandone = false
+
+	// See http://golang.org/cl/21503 for justification of the yield delay.
+	const yieldDelay = 10 * 1000
+	var nextYield int64
+
+	// Endeavor to get gcscandone set to true,
+	// either by doing the stack scan ourselves or by coercing gp to scan itself.
+	// gp.gcscandone can transition from false to true when we're not looking
+	// (if we asked for preemption), so any time we lock the status using
+	// castogscanstatus we have to double-check that the scan is still not done.
+loop:
+	for i := 0; !gp.gcscandone; i++ {
+		switch s := readgstatus(gp); s {
+		default:
+			dumpgstatus(gp)
+			throw("stopg: invalid status")
+
+		case _Gdead:
+			// No stack.
+			gp.gcscandone = true
+			break loop
+
+		case _Gcopystack:
+		// Stack being switched. Go around again.
+
+		case _Grunnable, _Gsyscall, _Gwaiting:
+			// Claim goroutine by setting scan bit.
+			// Racing with execution or readying of gp.
+			// The scan bit keeps them from running
+			// the goroutine until we're done.
+			if castogscanstatus(gp, s, s|_Gscan) {
+				if gp.scanningself {
+					// Don't try to scan the stack
+					// if the goroutine is going to do
+					// it itself.
+					restartg(gp)
+					break
+				}
+				if !gp.gcscandone {
+					scanstack(gp, gcw)
+					gp.gcscandone = true
+				}
+				restartg(gp)
+				break loop
+			}
+
+		case _Gscanwaiting:
+			// newstack is doing a scan for us right now. Wait.
+
+		case _Gscanrunning:
+			// checkPreempt is scanning. Wait.
+
+		case _Grunning:
+			// Goroutine running. Try to preempt execution so it can scan itself.
+			// The preemption handler (in newstack) does the actual scan.
+
+			// Optimization: if there is already a pending preemption request
+			// (from the previous loop iteration), don't bother with the atomics.
+			if gp.preemptscan && gp.preempt {
+				break
+			}
+
+			// Ask for preemption and self scan.
+			if castogscanstatus(gp, _Grunning, _Gscanrunning) {
+				if !gp.gcscandone {
+					gp.preemptscan = true
+					gp.preempt = true
+				}
+				casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning)
+			}
+		}
+
+		if i == 0 {
+			nextYield = nanotime() + yieldDelay
+		}
+		if nanotime() < nextYield {
+			procyield(10)
+		} else {
+			osyield()
+			nextYield = nanotime() + yieldDelay/2
+		}
+	}
+
+	gp.preemptscan = false // cancel scan request if no longer needed
+}
+
+// The GC requests that this routine be moved from a scanmumble state to a mumble state.
+func restartg(gp *g) {
+	s := readgstatus(gp)
+	switch s {
+	default:
+		dumpgstatus(gp)
+		throw("restartg: unexpected status")
+
+	case _Gdead:
+	// ok
+
+	case _Gscanrunnable,
+		_Gscanwaiting,
+		_Gscansyscall:
+		casfrom_Gscanstatus(gp, s, s&^_Gscan)
+	}
+}
+
 // stopTheWorld stops all P's from executing goroutines, interrupting
 // all goroutines at GC safe points and records reason as the reason
 // for the stop. On return, only the current goroutine's P is running.
@@ -2016,6 +2129,7 @@
 		// goroutines on the global queue.
 		// Since we preempt by storing the goroutine on the global
 		// queue, this is the only place we need to check preempt.
+		// This does not call checkPreempt because gp is not running.
 		if gp != nil && gp.preempt {
 			gp.preempt = false
 			lock(&sched.lock)
@@ -2114,6 +2228,13 @@
 	goschedImpl(gp)
 }
 
+func gopreempt_m(gp *g) {
+	if trace.enabled {
+		traceGoPreempt()
+	}
+	goschedImpl(gp)
+}
+
 // Finishes execution of the current goroutine.
 func goexit1() {
 	if trace.enabled {
diff --git a/libgo/go/runtime/runtime1.go b/libgo/go/runtime/runtime1.go
index 9100880..dd3f7b2 100644
--- a/libgo/go/runtime/runtime1.go
+++ b/libgo/go/runtime/runtime1.go
@@ -382,7 +382,18 @@
 func parsedebugvars() {
 	// defaults
 	debug.cgocheck = 1
-	debug.invalidptr = 1
+
+	// Unfortunately, because gccgo uses conservative stack scanning,
+	// we can not enable invalid pointer checking. It is possible for
+	// memory block M1 to point to M2, and for both to be dead.
+	// We release M2, causing the entire span to be released.
+	// Before we release M1, a stack pointer appears that points into it.
+	// This stack pointer is presumably dead, but causes M1 to be marked.
+	// We scan M1 and see the pointer to M2 on a released span.
+	// At that point, if debug.invalidptr is set, we crash.
+	// This is not a problem, assuming that M1 really is dead and
+	// the pointer we discovered to it will not be used.
+	// debug.invalidptr = 1
 
 	for p := gogetenv("GODEBUG"); p != ""; {
 		field := ""
diff --git a/libgo/go/runtime/runtime2.go b/libgo/go/runtime/runtime2.go
index 9c5def2..22847ea 100644
--- a/libgo/go/runtime/runtime2.go
+++ b/libgo/go/runtime/runtime2.go
@@ -413,8 +413,7 @@
 	entryfn  uintptr              // function address passed to __go_go
 	fromgogo bool                 // whether entered from gogo function
 
-	issystem     bool // do not output in stack dump
-	isbackground bool // ignore in deadlock detector
+	scanningself bool // whether goroutine is scanning its own stack
 
 	traceback *tracebackg // stack traceback buffer
 
@@ -542,7 +541,7 @@
 
 	tracebuf traceBufPtr
 
-	// Not for gccgo for now: palloc persistentAlloc // per-P to avoid mutex
+	palloc persistentAlloc // per-P to avoid mutex
 
 	// Per-P GC state
 	gcAssistTime     int64 // Nanoseconds in assistAlloc
@@ -552,7 +551,7 @@
 	// gcw is this P's GC work buffer cache. The work buffer is
 	// filled by write barriers, drained by mutator assists, and
 	// disposed on certain GC state transitions.
-	// Not for gccgo for now: gcw gcWork
+	gcw gcWork
 
 	runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point
 
diff --git a/libgo/go/runtime/slice.go b/libgo/go/runtime/slice.go
index 55f4454..f61f85e 100644
--- a/libgo/go/runtime/slice.go
+++ b/libgo/go/runtime/slice.go
@@ -60,10 +60,7 @@
 		panic(errorString("makeslice: cap out of range"))
 	}
 
-	// gccgo's current garbage collector requires using newarray,
-	// not mallocgc here.  This can change back to mallocgc when
-	// we port the garbage collector.
-	p := newarray(et, cap)
+	p := mallocgc(et.size*uintptr(cap), et, true)
 	return slice{p, len, cap}
 }
 
@@ -144,21 +141,14 @@
 
 	var p unsafe.Pointer
 	if et.kind&kindNoPointers != 0 {
-		// gccgo's current GC requires newarray, not mallocgc.
-		p = newarray(et, newcap)
+		p = mallocgc(capmem, nil, false)
 		memmove(p, old.array, lenmem)
-		// The call to memclr is not needed for gccgo since
-		// the newarray function will zero the memory.
-		// Calling memclr is also wrong since we allocated
-		// newcap*et.size bytes, which is not the same as capmem.
 		// The append() that calls growslice is going to overwrite from old.len to cap (which will be the new length).
 		// Only clear the part that will not be overwritten.
-		// memclrNoHeapPointers(add(p, newlenmem), capmem-newlenmem)
-		_ = newlenmem
+		memclrNoHeapPointers(add(p, newlenmem), capmem-newlenmem)
 	} else {
 		// Note: can't use rawmem (which avoids zeroing of memory), because then GC can scan uninitialized memory.
-		// gccgo's current GC requires newarray, not mallocgc.
-		p = newarray(et, newcap)
+		p = mallocgc(capmem, et, true)
 		if !writeBarrier.enabled {
 			memmove(p, old.array, lenmem)
 		} else {
diff --git a/libgo/go/runtime/stubs.go b/libgo/go/runtime/stubs.go
index 5029446..a3d0918 100644
--- a/libgo/go/runtime/stubs.go
+++ b/libgo/go/runtime/stubs.go
@@ -130,9 +130,6 @@
 	return unsafe.Pointer(x ^ 0)
 }
 
-//extern mincore
-func mincore(addr unsafe.Pointer, n uintptr, dst *byte) int32
-
 //go:noescape
 func jmpdefer(fv *funcval, argp uintptr)
 func exit1(code int32)
@@ -277,14 +274,6 @@
 //extern syscall
 func syscall(trap uintptr, a1, a2, a3, a4, a5, a6 uintptr) uintptr
 
-// newobject allocates a new object.
-// For gccgo unless and until we port malloc.go.
-func newobject(*_type) unsafe.Pointer
-
-// newarray allocates a new array of objects.
-// For gccgo unless and until we port malloc.go.
-func newarray(*_type, int) unsafe.Pointer
-
 // For gccgo, to communicate from the C code to the Go code.
 //go:linkname setIsCgo runtime.setIsCgo
 func setIsCgo() {
@@ -303,84 +292,6 @@
 	support_aes = v
 }
 
-// typedmemmove copies a typed value.
-// For gccgo for now.
-//go:linkname typedmemmove runtime.typedmemmove
-//go:nosplit
-func typedmemmove(typ *_type, dst, src unsafe.Pointer) {
-	memmove(dst, src, typ.size)
-}
-
-// Temporary for gccgo until we port mbarrier.go.
-//go:linkname reflect_typedmemmove reflect.typedmemmove
-func reflect_typedmemmove(typ *_type, dst, src unsafe.Pointer) {
-	typedmemmove(typ, dst, src)
-}
-
-// Temporary for gccgo until we port mbarrier.go.
-//go:nosplit
-func typedmemclr(typ *_type, ptr unsafe.Pointer) {
-	memclrNoHeapPointers(ptr, typ.size)
-}
-
-// Temporary for gccgo until we port mbarrier.go.
-//go:nosplit
-func memclrHasPointers(ptr unsafe.Pointer, n uintptr) {
-	memclrNoHeapPointers(ptr, n)
-}
-
-// Temporary for gccgo until we port mbarrier.go.
-//go:linkname typedslicecopy runtime.typedslicecopy
-func typedslicecopy(typ *_type, dst, src slice) int {
-	n := dst.len
-	if n > src.len {
-		n = src.len
-	}
-	if n == 0 {
-		return 0
-	}
-	memmove(dst.array, src.array, uintptr(n)*typ.size)
-	return n
-}
-
-// Temporary for gccgo until we port mbarrier.go.
-//go:linkname reflect_typedslicecopy reflect.typedslicecopy
-func reflect_typedslicecopy(elemType *_type, dst, src slice) int {
-	return typedslicecopy(elemType, dst, src)
-}
-
-// Here for gccgo until we port malloc.go.
-const (
-	_64bit              = 1 << (^uintptr(0) >> 63) / 2
-	_MHeapMap_TotalBits = (_64bit*sys.GoosWindows)*35 + (_64bit*(1-sys.GoosWindows)*(1-sys.GoosDarwin*sys.GoarchArm64))*39 + sys.GoosDarwin*sys.GoarchArm64*31 + (1-_64bit)*32
-	_MaxMem             = uintptr(1<<_MHeapMap_TotalBits - 1)
-	_MaxGcproc          = 32
-)
-
-// Here for gccgo until we port malloc.go.
-//extern runtime_mallocgc
-func c_mallocgc(size uintptr, typ uintptr, flag uint32) unsafe.Pointer
-func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
-	flag := uint32(0)
-	if !needzero {
-		flag = 1 << 3
-	}
-	return c_mallocgc(size, uintptr(unsafe.Pointer(typ)), flag)
-}
-
-// Here for gccgo until we port mgc.go.
-var writeBarrier struct {
-	enabled bool    // compiler emits a check of this before calling write barrier
-	pad     [3]byte // compiler uses 32-bit load for "enabled" field
-	needed  bool    // whether we need a write barrier for current GC phase
-	cgo     bool    // whether we need a write barrier for a cgo check
-	alignme uint64  // guarantee alignment so that compiler can use a 32 or 64-bit load
-}
-
-func queueRescan(gp *g) {
-	gp.gcscanvalid = false
-}
-
 // Here for gccgo until we port atomic_pointer.go and mgc.go.
 //go:nosplit
 func casp(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool {
@@ -394,19 +305,13 @@
 func lock(l *mutex)
 func unlock(l *mutex)
 
-// Here for gccgo for netpoll and Solaris.
+// Here for gccgo.
 func errno() int
 
 // Temporary for gccgo until we port proc.go.
 func entersyscall(int32)
 func entersyscallblock(int32)
 
-// Temporary hack for gccgo until we port the garbage collector.
-func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) {}
-
-// Here for gccgo until we port msize.go.
-func roundupsize(uintptr) uintptr
-
 // Here for gccgo until we port mgc.go.
 func GC()
 
@@ -439,34 +344,12 @@
 // Temporary for gccgo until we port mheap.go
 func setprofilebucket(p unsafe.Pointer, b *bucket)
 
-// Temporary for gccgo until we port mgc.go.
-func setgcpercent(int32) int32
-
-//go:linkname setGCPercent runtime_debug.setGCPercent
-func setGCPercent(in int32) (out int32) {
-	return setgcpercent(in)
-}
-
 // Temporary for gccgo until we port atomic_pointer.go.
 //go:nosplit
 func atomicstorep(ptr unsafe.Pointer, new unsafe.Pointer) {
 	atomic.StorepNoWB(noescape(ptr), new)
 }
 
-// Temporary for gccgo until we port mbarrier.go
-//go:linkname writebarrierptr runtime.writebarrierptr
-func writebarrierptr(dst *uintptr, src uintptr) {
-	*dst = src
-}
-
-// Temporary for gccgo until we port malloc.go
-var zerobase uintptr
-
-//go:linkname getZerobase runtime.getZerobase
-func getZerobase() *uintptr {
-	return &zerobase
-}
-
 // Get signal trampoline, written in C.
 func getSigtramp() uintptr
 
@@ -539,81 +422,12 @@
 	return panicking
 }
 
-// Temporary for gccgo until we port mcache.go.
-func allocmcache() *mcache
-func freemcache(*mcache)
-
-// Temporary for gccgo until we port mgc.go.
-// This is just so that allgadd will compile.
-var work struct {
-	rescan struct {
-		lock mutex
-		list []guintptr
-	}
-}
-
-// Temporary for gccgo until we port mgc.go.
-var gcBlackenEnabled uint32
-
-// Temporary for gccgo until we port mgc.go.
-func gcMarkWorkAvailable(p *p) bool {
-	return false
-}
-
-// Temporary for gccgo until we port mgc.go.
-var gcController gcControllerState
-
-// Temporary for gccgo until we port mgc.go.
-type gcControllerState struct {
-}
-
-// Temporary for gccgo until we port mgc.go.
-func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
-	return nil
-}
-
-// Temporary for gccgo until we port mgc.go.
-var gcphase uint32
-
-// Temporary for gccgo until we port mgc.go.
-const (
-	_GCoff = iota
-	_GCmark
-	_GCmarktermination
-)
-
-// Temporary for gccgo until we port mgc.go.
-type gcMarkWorkerMode int
-
-// Temporary for gccgo until we port mgc.go.
-const (
-	gcMarkWorkerDedicatedMode gcMarkWorkerMode = iota
-	gcMarkWorkerFractionalMode
-	gcMarkWorkerIdleMode
-)
-
-// Temporary for gccgo until we port mheap.go.
-type mheap struct {
-}
-
-// Temporary for gccgo until we port mheap.go.
-var mheap_ mheap
-
-// Temporary for gccgo until we port mheap.go.
-func scavenge(int32, uint64, uint64)
-func (h *mheap) scavenge(k int32, now, limit uint64) {
-	scavenge(k, now, limit)
-}
-
 // Temporary for gccgo until we initialize ncpu in Go.
 //go:linkname setncpu runtime.setncpu
 func setncpu(n int32) {
 	ncpu = n
 }
 
-// Temporary for gccgo until we port malloc.go.
-var physPageSize uintptr
-
 // Temporary for gccgo until we reliably initialize physPageSize in Go.
 //go:linkname setpagesize runtime.setpagesize
 func setpagesize(s uintptr) {
@@ -627,24 +441,6 @@
 }
 
 // Temporary for gccgo until we port mgc.go.
-// gcMarkWorkerModeStrings are the strings labels of gcMarkWorkerModes
-// to use in execution traces.
-var gcMarkWorkerModeStrings = [...]string{
-	"GC (dedicated)",
-	"GC (fractional)",
-	"GC (idle)",
-}
-
-// Temporary for gccgo until we port mgc.go.
-func gcenable() {
-	memstats.enablegc = true
-}
-
-// Temporary for gccgo until we port mgc.go.
-func gcinit() {
-}
-
-// Temporary for gccgo until we port mgc.go.
 //go:linkname runtime_m0 runtime.runtime_m0
 func runtime_m0() *m {
 	return &m0
@@ -656,45 +452,9 @@
 	return &g0
 }
 
-// Temporary for gccgo until we port mgc.go.
-type gcMode int
+const uintptrMask = 1<<(8*sys.PtrSize) - 1
 
-const (
-	gcBackgroundMode gcMode = iota // concurrent GC and sweep
-	gcForceMode                    // stop-the-world GC now, concurrent sweep
-	gcForceBlockMode               // stop-the-world GC now and STW sweep (forced by user)
-)
-
-// Temporary for gccgo until we port mgc.go.
-func gc(int32)
-func gcStart(mode gcMode, forceTrigger bool) {
-	var force int32
-	if forceTrigger {
-		force = 1
-	}
-	gc(force)
-}
-
-// Temporary for gccgo until we port mgc.go.
-const _FinBlockSize = 4 * 1024
-
-// Temporary for gccgo until we port mgc.go.
-//go:linkname getallfin runtime.getallfin
-func getallfin() *finblock {
-	return allfin
-}
-
-// Temporary for gccgo until we port malloc.go.
-const maxTinySize = 16
-
-// Temporary for gccgo until we port heapdump.go and mgc.go.
-//go:linkname callStopTheWorldWithSema runtime.callStopTheWorldWithSema
-func callStopTheWorldWithSema() {
-	systemstack(stopTheWorldWithSema)
-}
-
-// Temporary for gccgo until we port heapdump.go and mgc.go.
-//go:linkname callStartTheWorldWithSema runtime.callStartTheWorldWithSema
-func callStartTheWorldWithSema() {
-	systemstack(startTheWorldWithSema)
+type bitvector struct {
+	n        int32 // # of bits
+	bytedata *uint8
 }
diff --git a/libgo/go/runtime/stubs2.go b/libgo/go/runtime/stubs2.go
index e891fe5..490405d 100644
--- a/libgo/go/runtime/stubs2.go
+++ b/libgo/go/runtime/stubs2.go
@@ -18,16 +18,8 @@
 func nanotime() int64
 func usleep(usec uint32)
 
-//extern mmap
-func mmap(addr unsafe.Pointer, length uintptr, prot, flags, fd int32, offset uintptr) unsafe.Pointer
-
-//extern munmap
-func munmap(addr unsafe.Pointer, n uintptr) int32
-
 //go:noescape
 func write(fd uintptr, p unsafe.Pointer, n int32) int32
 
 //go:noescape
 func open(name *byte, mode, perm int32) int32
-
-func madvise(addr unsafe.Pointer, n uintptr, flags int32)
diff --git a/libgo/go/runtime/traceback_gccgo.go b/libgo/go/runtime/traceback_gccgo.go
index 93408e6..d060e09 100644
--- a/libgo/go/runtime/traceback_gccgo.go
+++ b/libgo/go/runtime/traceback_gccgo.go
@@ -9,7 +9,7 @@
 
 import (
 	"runtime/internal/sys"
-	_ "unsafe" // for go:linkname
+	"unsafe"
 )
 
 // For gccgo, use go:linkname to rename compiler-called functions to
@@ -20,6 +20,34 @@
 //go:linkname goroutineheader runtime.goroutineheader
 //go:linkname printcreatedby runtime.printcreatedby
 
+var (
+	// initialized in tracebackinit
+	runfinqPC        uintptr
+	bgsweepPC        uintptr
+	forcegchelperPC  uintptr
+	timerprocPC      uintptr
+	gcBgMarkWorkerPC uintptr
+)
+
+func tracebackinit() {
+	// Go variable initialization happens late during runtime startup.
+	// Instead of initializing the variables above in the declarations,
+	// schedinit calls this function so that the variables are
+	// initialized and available earlier in the startup sequence.
+	// This doesn't use funcPC to avoid memory allocation.
+	// FIXME: We should be able to use funcPC when escape analysis is on.
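+	// A func value points to a descriptor whose first word is the
+	// function's entry address, so the double dereference below
+	// recovers the PC without allocating.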
+	f1 := runfinq
+	runfinqPC = **(**uintptr)(unsafe.Pointer(&f1))
+	f2 := bgsweep
+	bgsweepPC = **(**uintptr)(unsafe.Pointer(&f2))
+	f3 := forcegchelper
+	forcegchelperPC = **(**uintptr)(unsafe.Pointer(&f3))
+	f4 := timerproc
+	timerprocPC = **(**uintptr)(unsafe.Pointer(&f4))
+	f5 := gcBgMarkWorker
+	gcBgMarkWorkerPC = **(**uintptr)(unsafe.Pointer(&f5))
+}
+
 func printcreatedby(gp *g) {
 	// Show what created goroutine, except main goroutine (goid 1).
 	pc := gp.gopc
@@ -168,8 +196,15 @@
 // isSystemGoroutine reports whether the goroutine g must be omitted in
 // stack dumps and deadlock detector.
 func isSystemGoroutine(gp *g) bool {
-	// FIXME.
-	return false
+	// FIXME: This doesn't work reliably for gccgo because in many
+	// cases the startpc field will be set to a thunk rather than
+	// to one of these addresses.
+	pc := gp.startpc
+	return pc == runfinqPC && !fingRunning ||
+		pc == bgsweepPC ||
+		pc == forcegchelperPC ||
+		pc == timerprocPC ||
+		pc == gcBgMarkWorkerPC
 }
 
 func tracebackothers(me *g) {
diff --git a/libgo/go/syscall/signame.c b/libgo/go/syscall/signame.c
index 0453c06..dca92a9 100644
--- a/libgo/go/syscall/signame.c
+++ b/libgo/go/syscall/signame.c
@@ -31,7 +31,7 @@
       s = buf;
     }
   len = __builtin_strlen (s);
-  data = runtime_mallocgc (len, 0, FlagNoScan);
+  data = runtime_mallocgc (len, nil, false);
   __builtin_memcpy (data, s, len);
   ret.str = data;
   ret.len = len;
diff --git a/libgo/mkrsysinfo.sh b/libgo/mkrsysinfo.sh
index d5a2c9a..a86e914 100755
--- a/libgo/mkrsysinfo.sh
+++ b/libgo/mkrsysinfo.sh
@@ -112,6 +112,35 @@
     mv ${OUT}-2 ${OUT}
 fi
 
+# Make sure _MAP_FAILED is defined.
+if ! grep '^const _MAP_FAILED =' gen-sysinfo.go > /dev/null 2>&1; then
+  echo "const _MAP_FAILED = ^uintptr(0)" >> ${OUT}
+fi
+# Make sure _MAP_ANON is defined.
+if ! grep '^const _MAP_ANON =' gen-sysinfo.go > /dev/null 2>&1; then
+  if grep '^const _MAP_ANONYMOUS ' gen-sysinfo.go > /dev/null 2>&1; then
+    echo "const _MAP_ANON = _MAP_ANONYMOUS" >> ${OUT}
+  else
+    echo "const _MAP_ANON = 0" >> ${OUT}
+  fi
+fi
+# Make sure _MADV_DONTNEED is defined.
+if ! grep '^const _MADV_DONTNEED =' gen-sysinfo.go > /dev/null 2>&1; then
+  echo "const _MADV_DONTNEED = 0" >> ${OUT}
+fi
+# Make sure _MADV_FREE is defined.
+if ! grep '^const _MADV_FREE =' gen-sysinfo.go > /dev/null 2>&1; then
+  echo "const _MADV_FREE = 0" >> ${OUT}
+fi
+# Make sure _MADV_HUGEPAGE is defined.
+if ! grep '^const _MADV_HUGEPAGE =' gen-sysinfo.go > /dev/null 2>&1; then
+  echo "const _MADV_HUGEPAGE = 0" >> ${OUT}
+fi
+# Make sure _MADV_NOHUGEPAGE is defined.
+if ! grep '^const _MADV_NOHUGEPAGE =' gen-sysinfo.go > /dev/null 2>&1; then
+  echo "const _MADV_NOHUGEPAGE = 0" >> ${OUT}
+fi
+
 # The semt structure, for Solaris.
 grep '^type _sem_t ' gen-sysinfo.go | \
     sed -e 's/_sem_t/semt/' >> ${OUT}
diff --git a/libgo/runtime/env_posix.c b/libgo/runtime/env_posix.c
index 3a60682..8cffa73 100644
--- a/libgo/runtime/env_posix.c
+++ b/libgo/runtime/env_posix.c
@@ -7,7 +7,6 @@
 #include "runtime.h"
 #include "array.h"
 #include "arch.h"
-#include "malloc.h"
 
 extern Slice runtime_get_envs(void);
 
diff --git a/libgo/runtime/go-callers.c b/libgo/runtime/go-callers.c
index b5379f7..09def91 100644
--- a/libgo/runtime/go-callers.c
+++ b/libgo/runtime/go-callers.c
@@ -197,7 +197,10 @@
   int ret;
   int i;
 
-  locbuf = (Location *) runtime_mal (pc.__count * sizeof (Location));
+  /* Note that calling mallocgc here assumes that we are not going to
+     store any allocated Go pointers in the slice.  */
+  locbuf = (Location *) runtime_mallocgc (pc.__count * sizeof (Location),
+					  nil, false);
 
   /* In the Go 1 release runtime.Callers has an off-by-one error,
      which we can not correct because it would break backward
diff --git a/libgo/runtime/go-libmain.c b/libgo/runtime/go-libmain.c
index 5bb7172..06f6bd3 100644
--- a/libgo/runtime/go-libmain.c
+++ b/libgo/runtime/go-libmain.c
@@ -15,7 +15,6 @@
 #include "runtime.h"
 #include "array.h"
 #include "arch.h"
-#include "malloc.h"
 
 /* This is used when building a standalone Go library using the Go
    command's -buildmode=c-archive or -buildmode=c-shared option.  It
diff --git a/libgo/runtime/go-main.c b/libgo/runtime/go-main.c
index 04d5f42..26354ce 100644
--- a/libgo/runtime/go-main.c
+++ b/libgo/runtime/go-main.c
@@ -17,7 +17,6 @@
 #include "runtime.h"
 #include "array.h"
 #include "arch.h"
-#include "malloc.h"
 
 #undef int
 #undef char
diff --git a/libgo/runtime/go-new.c b/libgo/runtime/go-new.c
deleted file mode 100644
index bc5c567..0000000
--- a/libgo/runtime/go-new.c
+++ /dev/null
@@ -1,18 +0,0 @@
-/* go-new.c -- the generic go new() function.
-
-   Copyright 2009 The Go Authors. All rights reserved.
-   Use of this source code is governed by a BSD-style
-   license that can be found in the LICENSE file.  */
-
-#include "runtime.h"
-#include "arch.h"
-#include "malloc.h"
-#include "go-type.h"
-
-void *
-__go_new (const struct __go_type_descriptor *td)
-{
-  return runtime_mallocgc (td->__size,
-			   (uintptr) td | TypeInfo_SingleObject,
-			   td->__code & GO_NO_POINTERS ? FlagNoScan : 0);
-}
diff --git a/libgo/runtime/go-nosys.c b/libgo/runtime/go-nosys.c
index 1628bfa..154f6b1 100644
--- a/libgo/runtime/go-nosys.c
+++ b/libgo/runtime/go-nosys.c
@@ -341,6 +341,18 @@
 }
 #endif
 
+
+#ifndef HAVE_MINCORE
+int
+mincore(void *addr __attribute__ ((unused)),
+	size_t length __attribute__ ((unused)),
+	unsigned char *vec __attribute__ ((unused)))
+{
+  errno = ENOSYS;
+  return -1;
+}
+#endif
+
 /* Long double math functions.  These are needed on old i386 systems
    that don't have them in libm.  The compiler translates calls to
    these functions on float64 to call an 80-bit floating point
diff --git a/libgo/runtime/go-unsafe-new.c b/libgo/runtime/go-unsafe-new.c
deleted file mode 100644
index 07f274f..0000000
--- a/libgo/runtime/go-unsafe-new.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/* go-unsafe-new.c -- unsafe.New function for Go.
-
-   Copyright 2009 The Go Authors. All rights reserved.
-   Use of this source code is governed by a BSD-style
-   license that can be found in the LICENSE file.  */
-
-#include "runtime.h"
-#include "arch.h"
-#include "malloc.h"
-#include "go-type.h"
-
-/* Implement unsafe_New, called from the reflect package.  */
-
-void *unsafe_New (const struct __go_type_descriptor *)
-  __asm__ (GOSYM_PREFIX "reflect.unsafe_New");
-
-/* The dynamic type of the argument will be a pointer to a type
-   descriptor.  */
-
-void *
-unsafe_New (const struct __go_type_descriptor *descriptor)
-{
-  return runtime_cnew (descriptor);
-}
diff --git a/libgo/runtime/go-unsafe-newarray.c b/libgo/runtime/go-unsafe-newarray.c
deleted file mode 100644
index 409ddd9..0000000
--- a/libgo/runtime/go-unsafe-newarray.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/* go-unsafe-newarray.c -- unsafe.NewArray function for Go.
-
-   Copyright 2009 The Go Authors. All rights reserved.
-   Use of this source code is governed by a BSD-style
-   license that can be found in the LICENSE file.  */
-
-#include "runtime.h"
-#include "arch.h"
-#include "malloc.h"
-#include "go-type.h"
-
-/* Implement unsafe_NewArray, called from the reflect package.  */
-
-void *unsafe_NewArray (const struct __go_type_descriptor *, intgo)
-  __asm__ (GOSYM_PREFIX "reflect.unsafe_NewArray");
-
-/* The dynamic type of the argument will be a pointer to a type
-   descriptor.  */
-
-void *
-unsafe_NewArray (const struct __go_type_descriptor *descriptor, intgo n)
-{
-  return runtime_cnewarray (descriptor, n);
-}
diff --git a/libgo/runtime/go-unsafe-pointer.c b/libgo/runtime/go-unsafe-pointer.c
index 483f6ab..9df7f3a 100644
--- a/libgo/runtime/go-unsafe-pointer.c
+++ b/libgo/runtime/go-unsafe-pointer.c
@@ -8,7 +8,6 @@
 
 #include "runtime.h"
 #include "go-type.h"
-#include "mgc0.h"
 
 /* This file provides the type descriptor for the unsafe.Pointer type.
    The unsafe package is defined by the compiler itself, which means
@@ -18,7 +17,7 @@
 extern const struct __go_type_descriptor unsafe_Pointer
   __asm__ (GOSYM_PREFIX "__go_tdn_unsafe.Pointer");
 
-extern const uintptr unsafe_Pointer_gc[]
+extern const byte unsafe_Pointer_gc[]
   __asm__ (GOSYM_PREFIX "__go_tdn_unsafe.Pointer$gc");
 
 /* Used to determine the field alignment.  */
@@ -36,8 +35,7 @@
   sizeof REFLECTION - 1
 };
 
-const uintptr unsafe_Pointer_gc[] __attribute__((aligned(4))) =
-  {sizeof(void*), GC_APTR, 0, GC_END};
+const byte unsafe_Pointer_gc[] = { 1 };
 
 extern const FuncVal runtime_pointerhash_descriptor
   __asm__ (GOSYM_PREFIX "runtime.pointerhash$descriptor");
@@ -63,7 +61,7 @@
   /* __equalfn */
   &runtime_pointerequal_descriptor,
   /* __gcdata */
-  (const byte*)(unsafe_Pointer_gc),
+  unsafe_Pointer_gc,
   /* __reflection */
   &reflection_string,
   /* __uncommon */
@@ -87,11 +85,10 @@
   sizeof PREFLECTION - 1,
 };
 
-extern const uintptr pointer_unsafe_Pointer_gc[]
+extern const byte pointer_unsafe_Pointer_gc[]
   __asm__ (GOSYM_PREFIX "__go_td_pN14_unsafe.Pointer$gc");
 
-const uintptr pointer_unsafe_Pointer_gc[] __attribute__((aligned(4))) =
-  {sizeof(void*), GC_APTR, 0, GC_END};
+const byte pointer_unsafe_Pointer_gc[] = { 1 };
 
 const struct __go_ptr_type pointer_unsafe_Pointer =
 {
@@ -114,7 +111,7 @@
     /* __equalfn */
     &runtime_pointerequal_descriptor,
     /* __gcdata */
-    (const byte*)(pointer_unsafe_Pointer_gc),
+    pointer_unsafe_Pointer_gc,
     /* __reflection */
     &preflection_string,
     /* __uncommon */
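
This hunk is the commit's ptrmask change in miniature: the old $gc symbol held a word-sized GC program ({sizeof(void*), GC_APTR, 0, GC_END}), while the new one is a byte array read as a bitmap with one bit per pointer-sized word, a set bit meaning "this word may hold a pointer". unsafe.Pointer is a single pointer word, hence the single byte { 1 }. A hedged sketch of reading such a mask; the struct layout is a made-up example, not a type from the patch:

package main

import "fmt"

// example spans three pointer-sized words; words 0 and 2 hold pointers,
// so its ptrmask would be the single byte 0x5 (bits 0 and 2 set).
type example struct {
	p *int
	n uintptr
	q *int
}

func main() {
	mask := []byte{0x5}
	for word := 0; word < 3; word++ {
		isPtr := mask[word/8]>>(uint(word)%8)&1 == 1
		fmt.Printf("word %d: pointer=%v\n", word, isPtr)
	}
}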
diff --git a/libgo/runtime/heapdump.c b/libgo/runtime/heapdump.c
deleted file mode 100644
index 8178946..0000000
--- a/libgo/runtime/heapdump.c
+++ /dev/null
@@ -1,778 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Implementation of runtime/debug.WriteHeapDump.  Writes all
-// objects in the heap plus additional info (roots, threads,
-// finalizers, etc.) to a file.
-
-// The format of the dumped file is described at
-// http://code.google.com/p/go-wiki/wiki/heapdump13
-
-#include "runtime.h"
-#include "arch.h"
-#include "malloc.h"
-#include "mgc0.h"
-#include "go-type.h"
-
-#define hash __hash
-#define KindNoPointers GO_NO_POINTERS
-
-enum {
-	FieldKindEol = 0,
-	FieldKindPtr = 1,
-	FieldKindString = 2,
-	FieldKindSlice = 3,
-	FieldKindIface = 4,
-	FieldKindEface = 5,
-
-	TagEOF = 0,
-	TagObject = 1,
-	TagOtherRoot = 2,
-	TagType = 3,
-	TagGoRoutine = 4,
-	TagStackFrame = 5,
-	TagParams = 6,
-	TagFinalizer = 7,
-	TagItab = 8,
-	TagOSThread = 9,
-	TagMemStats = 10,
-	TagQueuedFinalizer = 11,
-	TagData = 12,
-	TagBss = 13,
-	TagDefer = 14,
-	TagPanic = 15,
-	TagMemProf = 16,
-	TagAllocSample = 17,
-
-	TypeInfo_Conservative = 127,
-};
-
-// static uintptr* playgcprog(uintptr offset, uintptr *prog, void (*callback)(void*,uintptr,uintptr), void *arg);
-// static void dumpfields(uintptr *prog);
-static void dumpefacetypes(void *obj, uintptr size, const Type *type, uintptr kind);
-
-// fd to write the dump to.
-static uintptr dumpfd;
-
-// buffer of pending write data
-enum {
-	BufSize = 4096,
-};
-static byte buf[BufSize];
-static uintptr nbuf;
-
-static void
-hwrite(const byte *data, uintptr len)
-{
-	if(len + nbuf <= BufSize) {
-		runtime_memmove(buf + nbuf, data, len);
-		nbuf += len;
-		return;
-	}
-	runtime_write(dumpfd, buf, nbuf);
-	if(len >= BufSize) {
-		runtime_write(dumpfd, data, len);
-		nbuf = 0;
-	} else {
-		runtime_memmove(buf, data, len);
-		nbuf = len;
-	}
-}
-
-static void
-flush(void)
-{
-	runtime_write(dumpfd, buf, nbuf);
-	nbuf = 0;
-}
-
-// Cache of types that have been serialized already.
-// We use a type's hash field to pick a bucket.
-// Inside a bucket, we keep a list of types that
-// have been serialized so far, most recently used first.
-// Note: when a bucket overflows we may end up
-// serializing a type more than once.  That's ok.
-enum {
-	TypeCacheBuckets = 256, // must be a power of 2
-	TypeCacheAssoc = 4,
-};
-typedef struct TypeCacheBucket TypeCacheBucket;
-struct TypeCacheBucket {
-	const Type *t[TypeCacheAssoc];
-};
-static TypeCacheBucket typecache[TypeCacheBuckets];
-
-// dump a uint64 in a varint format parseable by encoding/binary
-static void
-dumpint(uint64 v)
-{
-	byte buf[10];
-	int32 n;
-	n = 0;
-	while(v >= 0x80) {
-		buf[n++] = v | 0x80;
-		v >>= 7;
-	}
-	buf[n++] = v;
-	hwrite(buf, n);
-}
-
-static void
-dumpbool(bool b)
-{
-	dumpint(b ? 1 : 0);
-}
-
-// dump varint uint64 length followed by memory contents
-static void
-dumpmemrange(const byte *data, uintptr len)
-{
-	dumpint(len);
-	hwrite(data, len);
-}
-
-static void
-dumpstr(String s)
-{
-	dumpmemrange(s.str, s.len);
-}
-
-static void
-dumpcstr(const int8 *c)
-{
-	dumpmemrange((const byte*)c, runtime_findnull((const byte*)c));
-}
-
-// dump information for a type
-static void
-dumptype(const Type *t)
-{
-	TypeCacheBucket *b;
-	int32 i, j;
-
-	if(t == nil) {
-		return;
-	}
-
-	// If we've definitely serialized the type before,
-	// no need to do it again.
-	b = &typecache[t->hash & (TypeCacheBuckets-1)];
-	if(t == b->t[0]) return;
-	for(i = 1; i < TypeCacheAssoc; i++) {
-		if(t == b->t[i]) {
-			// Move-to-front
-			for(j = i; j > 0; j--) {
-				b->t[j] = b->t[j-1];
-			}
-			b->t[0] = t;
-			return;
-		}
-	}
-	// Might not have been dumped yet.  Dump it and
-	// remember we did so.
-	for(j = TypeCacheAssoc-1; j > 0; j--) {
-		b->t[j] = b->t[j-1];
-	}
-	b->t[0] = t;
-	
-	// dump the type
-	dumpint(TagType);
-	dumpint((uintptr)t);
-	dumpint(t->__size);
-	if(t->__uncommon == nil || t->__uncommon->__pkg_path == nil || t->__uncommon->__name == nil) {
-		dumpstr(*t->__reflection);
-	} else {
-		dumpint(t->__uncommon->__pkg_path->len + 1 + t->__uncommon->__name->len);
-		hwrite(t->__uncommon->__pkg_path->str, t->__uncommon->__pkg_path->len);
-		hwrite((const byte*)".", 1);
-		hwrite(t->__uncommon->__name->str, t->__uncommon->__name->len);
-	}
-	dumpbool(t->__size > PtrSize || (t->__code & KindNoPointers) == 0);
-	// dumpfields((uintptr*)t->gc + 1);
-}
-
-// returns true if object is scannable
-static bool
-scannable(byte *obj)
-{
-	uintptr *b, off, shift;
-
-	off = (uintptr*)obj - (uintptr*)runtime_mheap.arena_start;  // word offset
-	b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
-	shift = off % wordsPerBitmapWord;
-	return ((*b >> shift) & bitScan) != 0;
-}
-
-// dump an object
-static void
-dumpobj(byte *obj, uintptr size, const Type *type, uintptr kind)
-{
-	if(type != nil) {
-		dumptype(type);
-		dumpefacetypes(obj, size, type, kind);
-	}
-
-	dumpint(TagObject);
-	dumpint((uintptr)obj);
-	dumpint((uintptr)type);
-	dumpint(kind);
-	dumpmemrange(obj, size);
-}
-
-static void
-dumpotherroot(const char *description, byte *to)
-{
-	dumpint(TagOtherRoot);
-	dumpcstr((const int8 *)description);
-	dumpint((uintptr)to);
-}
-
-static void
-dumpfinalizer(byte *obj, FuncVal *fn, const FuncType* ft, const PtrType *ot)
-{
-	dumpint(TagFinalizer);
-	dumpint((uintptr)obj);
-	dumpint((uintptr)fn);
-	dumpint((uintptr)fn->fn);
-	dumpint((uintptr)ft);
-	dumpint((uintptr)ot);
-}
-
-typedef struct ChildInfo ChildInfo;
-struct ChildInfo {
-	// Information passed up from the callee frame about
-	// the layout of the outargs region.
-	uintptr argoff;     // where the arguments start in the frame
-	uintptr arglen;     // size of args region
-	BitVector args;    // if args.n >= 0, pointer map of args region
-
-	byte *sp;           // callee sp
-	uintptr depth;      // depth in call stack (0 == most recent)
-};
-
-static void
-dumpgoroutine(G *gp)
-{
-	// ChildInfo child;
-	Defer *d;
-	Panic *p;
-
-	dumpint(TagGoRoutine);
-	dumpint((uintptr)gp);
-	dumpint((uintptr)0);
-	dumpint(gp->goid);
-	dumpint(gp->gopc);
-	dumpint(gp->atomicstatus);
-	dumpbool(gp->issystem);
-	dumpbool(gp->isbackground);
-	dumpint(gp->waitsince);
-	dumpstr(gp->waitreason);
-	dumpint((uintptr)0);
-	dumpint((uintptr)gp->m);
-	dumpint((uintptr)gp->_defer);
-	dumpint((uintptr)gp->_panic);
-
-	// dump stack
-	// child.args.n = -1;
-	// child.arglen = 0;
-	// child.sp = nil;
-	// child.depth = 0;
-	// if(!ScanStackByFrames)
-	// 	runtime_throw("need frame info to dump stacks");
-	// runtime_gentraceback(pc, sp, lr, gp, 0, nil, 0x7fffffff, dumpframe, &child, false);
-
-	// dump defer & panic records
-	for(d = gp->_defer; d != nil; d = d->link) {
-		dumpint(TagDefer);
-		dumpint((uintptr)d);
-		dumpint((uintptr)gp);
-		dumpint((uintptr)d->arg);
-		dumpint((uintptr)d->frame);
-		dumpint((uintptr)d->pfn);
-		dumpint((uintptr)0);
-		dumpint((uintptr)d->link);
-	}
-	for (p = gp->_panic; p != nil; p = p->link) {
-		dumpint(TagPanic);
-		dumpint((uintptr)p);
-		dumpint((uintptr)gp);
-		dumpint((uintptr)p->arg._type);
-		dumpint((uintptr)p->arg.data);
-		dumpint((uintptr)0);
-		dumpint((uintptr)p->link);
-	}
-}
-
-static void
-dumpgs(void)
-{
-	G *gp;
-	uint32 i;
-
-	// goroutines & stacks
-	for(i = 0; i < runtime_getallglen(); i++) {
-		gp = runtime_getallg(i);
-		switch(gp->atomicstatus){
-		default:
-			runtime_printf("unexpected G.status %d\n", gp->atomicstatus);
-			runtime_throw("mark - bad status");
-		case _Gdead:
-			break;
-		case _Grunnable:
-		case _Gsyscall:
-		case _Gwaiting:
-			dumpgoroutine(gp);
-			break;
-		}
-	}
-}
-
-static void
-finq_callback(FuncVal *fn, void *obj, const FuncType *ft, const PtrType *ot)
-{
-	dumpint(TagQueuedFinalizer);
-	dumpint((uintptr)obj);
-	dumpint((uintptr)fn);
-	dumpint((uintptr)fn->fn);
-	dumpint((uintptr)ft);
-	dumpint((uintptr)ot);
-}
-
-
-static void
-dumproots(void)
-{
-	MSpan *s, **allspans;
-	uint32 spanidx;
-	Special *sp;
-	SpecialFinalizer *spf;
-	byte *p;
-
-	// data segment
-	// dumpint(TagData);
-	// dumpint((uintptr)data);
-	// dumpmemrange(data, edata - data);
-	// dumpfields((uintptr*)gcdata + 1);
-
-	// bss segment
-	// dumpint(TagBss);
-	// dumpint((uintptr)bss);
-	// dumpmemrange(bss, ebss - bss);
-	// dumpfields((uintptr*)gcbss + 1);
-	
-	// MSpan.types
-	allspans = runtime_mheap.allspans;
-	for(spanidx=0; spanidx<runtime_mheap.nspan; spanidx++) {
-		s = allspans[spanidx];
-		if(s->state == MSpanInUse) {
-			// The garbage collector ignores type pointers stored in MSpan.types:
-			//  - Compiler-generated types are stored outside of heap.
-			//  - The reflect package has runtime-generated types cached in its data structures.
-			//    The garbage collector relies on finding the references via that cache.
-			switch(s->types.compression) {
-			case MTypes_Empty:
-			case MTypes_Single:
-				break;
-			case MTypes_Words:
-			case MTypes_Bytes:
-				dumpotherroot("runtime type info", (byte*)s->types.data);
-				break;
-			}
-
-			// Finalizers
-			for(sp = s->specials; sp != nil; sp = sp->next) {
-				if(sp->kind != KindSpecialFinalizer)
-					continue;
-				spf = (SpecialFinalizer*)sp;
-				p = (byte*)((s->start << PageShift) + spf->offset);
-				dumpfinalizer(p, spf->fn, spf->ft, spf->ot);
-			}
-		}
-	}
-
-	// Finalizer queue
-	runtime_iterate_finq(finq_callback);
-}
-
-// Bit vector of free marks.
-// Needs to be as big as the largest number of objects per span.
-static byte hfree[PageSize/8];
-
-static void
-dumpobjs(void)
-{
-	uintptr i, j, size, n, off, shift, *bitp, bits, ti, kind;
-	MSpan *s;
-	MLink *l;
-	byte *p;
-	const Type *t;
-
-	for(i = 0; i < runtime_mheap.nspan; i++) {
-		s = runtime_mheap.allspans[i];
-		if(s->state != MSpanInUse)
-			continue;
-		p = (byte*)(s->start << PageShift);
-		size = s->elemsize;
-		n = (s->npages << PageShift) / size;
-		if(n > PageSize/8)
-			runtime_throw("free array doesn't have enough entries");
-		for(l = s->freelist; l != nil; l = l->next) {
-			hfree[((byte*)l - p) / size] = true;
-		}
-		for(j = 0; j < n; j++, p += size) {
-			if(hfree[j]) {
-				hfree[j] = false;
-				continue;
-			}
-			off = (uintptr*)p - (uintptr*)runtime_mheap.arena_start;
-			bitp = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
-			shift = off % wordsPerBitmapWord;
-			bits = *bitp >> shift;
-
-			// Skip FlagNoGC allocations (stacks)
-			if((bits & bitAllocated) == 0)
-				continue;
-
-			// extract type and kind
-			ti = runtime_gettype(p);
-			t = (Type*)(ti & ~(uintptr)(PtrSize-1));
-			kind = ti & (PtrSize-1);
-			
-			// dump it
-			if(kind == TypeInfo_Chan)
-				t = ((const ChanType*)t)->__element_type; // use element type for chan encoding
-			if(t == nil && scannable(p))
-				kind = TypeInfo_Conservative; // special kind for conservatively scanned objects
-			dumpobj(p, size, t, kind);
-		}
-	}
-}
-
-static void
-dumpparams(void)
-{
-	byte *x;
-
-	dumpint(TagParams);
-	x = (byte*)1;
-	if(*(byte*)&x == 1)
-		dumpbool(false); // little-endian ptrs
-	else
-		dumpbool(true); // big-endian ptrs
-	dumpint(PtrSize);
-	dumpint(hchanSize);
-	dumpint((uintptr)runtime_mheap.arena_start);
-	dumpint((uintptr)runtime_mheap.arena_used);
-	dumpint(0);
-	dumpcstr((const int8 *)"");
-	dumpint(runtime_ncpu);
-}
-
-static void
-dumpms(void)
-{
-	M *mp;
-
-	for(mp = runtime_getallm(); mp != nil; mp = mp->alllink) {
-		dumpint(TagOSThread);
-		dumpint((uintptr)mp);
-		dumpint(mp->id);
-		dumpint(0);
-	}
-}
-
-static void
-dumpmemstats(void)
-{
-	int32 i;
-
-	dumpint(TagMemStats);
-	dumpint(mstats()->alloc);
-	dumpint(mstats()->total_alloc);
-	dumpint(mstats()->sys);
-	dumpint(mstats()->nlookup);
-	dumpint(mstats()->nmalloc);
-	dumpint(mstats()->nfree);
-	dumpint(mstats()->heap_alloc);
-	dumpint(mstats()->heap_sys);
-	dumpint(mstats()->heap_idle);
-	dumpint(mstats()->heap_inuse);
-	dumpint(mstats()->heap_released);
-	dumpint(mstats()->heap_objects);
-	dumpint(mstats()->stacks_inuse);
-	dumpint(mstats()->stacks_sys);
-	dumpint(mstats()->mspan_inuse);
-	dumpint(mstats()->mspan_sys);
-	dumpint(mstats()->mcache_inuse);
-	dumpint(mstats()->mcache_sys);
-	dumpint(mstats()->buckhash_sys);
-	dumpint(mstats()->gc_sys);
-	dumpint(mstats()->other_sys);
-	dumpint(mstats()->next_gc);
-	dumpint(mstats()->last_gc);
-	dumpint(mstats()->pause_total_ns);
-	for(i = 0; i < 256; i++)
-		dumpint(mstats()->pause_ns[i]);
-	dumpint(mstats()->numgc);
-}
-
-static void
-dumpmemprof_callback(Bucket *b, uintptr nstk, Location *stk, uintptr size, uintptr allocs, uintptr frees)
-{
-	uintptr i, pc;
-	byte buf[20];
-
-	dumpint(TagMemProf);
-	dumpint((uintptr)b);
-	dumpint(size);
-	dumpint(nstk);
-	for(i = 0; i < nstk; i++) {
-		pc = stk[i].pc;
-		if(stk[i].function.len == 0) {
-			runtime_snprintf(buf, sizeof(buf), "%X", (uint64)pc);
-			dumpcstr((int8*)buf);
-			dumpcstr((const int8*)"?");
-			dumpint(0);
-		} else {
-			dumpstr(stk[i].function);
-			dumpstr(stk[i].filename);
-			dumpint(stk[i].lineno);
-		}
-	}
-	dumpint(allocs);
-	dumpint(frees);
-}
-
-static FuncVal dumpmemprof_callbackv = {(void(*)(void))dumpmemprof_callback};
-
-static void
-dumpmemprof(void)
-{
-	MSpan *s, **allspans;
-	uint32 spanidx;
-	Special *sp;
-	SpecialProfile *spp;
-	byte *p;
-
-	runtime_iterate_memprof(&dumpmemprof_callbackv);
-
-	allspans = runtime_mheap.allspans;
-	for(spanidx=0; spanidx<runtime_mheap.nspan; spanidx++) {
-		s = allspans[spanidx];
-		if(s->state != MSpanInUse)
-			continue;
-		for(sp = s->specials; sp != nil; sp = sp->next) {
-			if(sp->kind != KindSpecialProfile)
-				continue;
-			spp = (SpecialProfile*)sp;
-			p = (byte*)((s->start << PageShift) + spp->offset);
-			dumpint(TagAllocSample);
-			dumpint((uintptr)p);
-			dumpint((uintptr)spp->b);
-		}
-	}
-}
-
-static void
-mdump(G *gp)
-{
-	const byte *hdr;
-	uintptr i;
-	MSpan *s;
-
-	// make sure we're done sweeping
-	for(i = 0; i < runtime_mheap.nspan; i++) {
-		s = runtime_mheap.allspans[i];
-		if(s->state == MSpanInUse)
-			runtime_MSpan_EnsureSwept(s);
-	}
-
-	runtime_memclr((byte*)&typecache[0], sizeof(typecache));
-	hdr = (const byte*)"go1.3 heap dump\n";
-	hwrite(hdr, runtime_findnull(hdr));
-	dumpparams();
-	dumpobjs();
-	dumpgs();
-	dumpms();
-	dumproots();
-	dumpmemstats();
-	dumpmemprof();
-	dumpint(TagEOF);
-	flush();
-
-	gp->param = nil;
-	gp->atomicstatus = _Grunning;
-	runtime_gogo(gp);
-}
-
-static FuncVal mdumpGo = { (void(*)(void))mdump };
-
-void runtime_debug_WriteHeapDump(uintptr)
-  __asm__(GOSYM_PREFIX "runtime_debug.WriteHeapDump");
-
-void
-runtime_debug_WriteHeapDump(uintptr fd)
-{
-	M *m;
-	G *g;
-
-	// Stop the world.
-	runtime_acquireWorldsema();
-	m = runtime_m();
-	m->preemptoff = runtime_gostringnocopy((const byte*)"write heap dump");
-	runtime_callStopTheWorldWithSema();
-
-	// Update stats so we can dump them.
-	// As a side effect, flushes all the MCaches so the MSpan.freelist
-	// lists contain all the free objects.
-	runtime_updatememstats(nil);
-
-	// Set dump file.
-	dumpfd = fd;
-
-	// Call dump routine on M stack.
-	g = runtime_g();
-	g->atomicstatus = _Gwaiting;
-	g->waitreason = runtime_gostringnocopy((const byte*)"dumping heap");
-	runtime_mcall(&mdumpGo);
-
-	// Reset dump file.
-	dumpfd = 0;
-
-	// Start up the world again.
-	runtime_callStartTheWorldWithSema();
-	runtime_releaseWorldsema();
-	m->preemptoff = runtime_gostringnocopy(nil);
-}
-
-// Runs the specified gc program.  Calls the callback for every
-// pointer-like field specified by the program and passes to the
-// callback the kind and offset of that field within the object.
-// offset is the offset in the object of the start of the program.
-// Returns a pointer to the opcode that ended the gc program (either
-// GC_END or GC_ARRAY_NEXT).
-/*
-static uintptr*
-playgcprog(uintptr offset, uintptr *prog, void (*callback)(void*,uintptr,uintptr), void *arg)
-{
-	uintptr len, elemsize, i, *end;
-
-	for(;;) {
-		switch(prog[0]) {
-		case GC_END:
-			return prog;
-		case GC_PTR:
-			callback(arg, FieldKindPtr, offset + prog[1]);
-			prog += 3;
-			break;
-		case GC_APTR:
-			callback(arg, FieldKindPtr, offset + prog[1]);
-			prog += 2;
-			break;
-		case GC_ARRAY_START:
-			len = prog[2];
-			elemsize = prog[3];
-			end = nil;
-			for(i = 0; i < len; i++) {
-				end = playgcprog(offset + prog[1] + i * elemsize, prog + 4, callback, arg);
-				if(end[0] != GC_ARRAY_NEXT)
-					runtime_throw("GC_ARRAY_START did not have matching GC_ARRAY_NEXT");
-			}
-			prog = end + 1;
-			break;
-		case GC_ARRAY_NEXT:
-			return prog;
-		case GC_CALL:
-			playgcprog(offset + prog[1], (uintptr*)((byte*)prog + *(int32*)&prog[2]), callback, arg);
-			prog += 3;
-			break;
-		case GC_CHAN_PTR:
-			callback(arg, FieldKindPtr, offset + prog[1]);
-			prog += 3;
-			break;
-		case GC_STRING:
-			callback(arg, FieldKindString, offset + prog[1]);
-			prog += 2;
-			break;
-		case GC_EFACE:
-			callback(arg, FieldKindEface, offset + prog[1]);
-			prog += 2;
-			break;
-		case GC_IFACE:
-			callback(arg, FieldKindIface, offset + prog[1]);
-			prog += 2;
-			break;
-		case GC_SLICE:
-			callback(arg, FieldKindSlice, offset + prog[1]);
-			prog += 3;
-			break;
-		case GC_REGION:
-			playgcprog(offset + prog[1], (uintptr*)prog[3] + 1, callback, arg);
-			prog += 4;
-			break;
-		default:
-			runtime_printf("%D\n", (uint64)prog[0]);
-			runtime_throw("bad gc op");
-		}
-	}
-}
-
-static void
-dump_callback(void *p, uintptr kind, uintptr offset)
-{
-	USED(&p);
-	dumpint(kind);
-	dumpint(offset);
-}
-
-// dumpint() the kind & offset of each field in an object.
-static void
-dumpfields(uintptr *prog)
-{
-	playgcprog(0, prog, dump_callback, nil);
-	dumpint(FieldKindEol);
-}
-
-static void
-dumpeface_callback(void *p, uintptr kind, uintptr offset)
-{
-	Eface *e;
-
-	if(kind != FieldKindEface)
-		return;
-	e = (Eface*)((byte*)p + offset);
-	dumptype(e->__type_descriptor);
-}
-*/
-
-// The heap dump reader needs to be able to disambiguate
-// Eface entries.  So it needs to know every type that might
-// appear in such an entry.  The following two routines accomplish
-// that.
-
-// Dump all the types that appear in the type field of
-// any Eface contained in obj.
-static void
-dumpefacetypes(void *obj __attribute__ ((unused)), uintptr size, const Type *type, uintptr kind)
-{
-	uintptr i;
-
-	switch(kind) {
-	case TypeInfo_SingleObject:
-		//playgcprog(0, (uintptr*)type->gc + 1, dumpeface_callback, obj);
-		break;
-	case TypeInfo_Array:
-		for(i = 0; i <= size - type->__size; i += type->__size) {
-			//playgcprog(i, (uintptr*)type->gc + 1, dumpeface_callback, obj);
-		}
-		break;
-	case TypeInfo_Chan:
-		if(type->__size == 0) // channels may have zero-sized objects in them
-			break;
-		for(i = hchanSize; i <= size - type->__size; i += type->__size) {
-			//playgcprog(i, (uintptr*)type->gc + 1, dumpeface_callback, obj);
-		}
-		break;
-	}
-}
diff --git a/libgo/runtime/malloc.goc b/libgo/runtime/malloc.goc
deleted file mode 100644
index 90150a2..0000000
--- a/libgo/runtime/malloc.goc
+++ /dev/null
@@ -1,813 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// See malloc.h for overview.
-//
-// TODO(rsc): double-check stats.
-
-package runtime
-#include <stddef.h>
-#include <errno.h>
-#include <stdlib.h>
-#include "runtime.h"
-#include "arch.h"
-#include "malloc.h"
-#include "go-type.h"
-
-// Map gccgo field names to gc field names.
-// Type aka __go_type_descriptor
-#define kind __code
-#define string __reflection
-
-// GCCGO SPECIFIC CHANGE
-//
-// There is a long comment in runtime_mallocinit about where to put the heap
-// on a 64-bit system.  It makes assumptions that are not valid on linux/arm64
-// -- it assumes user space can choose the lower 47 bits of a pointer, but on
-// linux/arm64 we can only choose the lower 39 bits.  This means the heap is
-// roughly a quarter of the available address space and we cannot choose a bit
-// pattern that all pointers will have -- luckily the GC is mostly precise
-// these days so this doesn't matter all that much.  The kernel (as of 3.13)
-// will allocate address space starting either down from 0x7fffffffff or up
-// from 0x2000000000, so we put the heap roughly in the middle of these two
-// addresses to minimize the chance that a non-heap allocation will get in the
-// way of the heap.
-//
-// This all means that there isn't much point in trying 256 different
-// locations for the heap on such systems.
-#ifdef __aarch64__
-#define HeapBase(i) ((void*)(uintptr)(0x40ULL<<32))
-#define HeapBaseOptions 1
-#elif defined(_AIX)
-// mmap address ranges start at 0x07000000_00000000 on AIX for 64-bit processes
-#define HeapBase(i) ((void*)(uintptr)(0x70ULL<<52))
-#define HeapBaseOptions 1
-#else
-#define HeapBase(i) ((void*)(uintptr)(i<<40|0x00c0ULL<<32))
-#define HeapBaseOptions 0x80
-#endif
-// END GCCGO SPECIFIC CHANGE
-
-// Mark mheap as 'no pointers', it does not contain interesting pointers but occupies ~45K.
-MHeap runtime_mheap;
-
-int32	runtime_checking;
-
-extern volatile intgo runtime_MemProfileRate
-  __asm__ (GOSYM_PREFIX "runtime.MemProfileRate");
-
-static MSpan* largealloc(uint32, uintptr*);
-static void runtime_profilealloc(void *v, uintptr size);
-static void settype(MSpan *s, void *v, uintptr typ);
-
-// Allocate an object of at least size bytes.
-// Small objects are allocated from the per-thread cache's free lists.
-// Large objects (> 32 kB) are allocated straight from the heap.
-// If the block will be freed with runtime_free(), typ must be 0.
-void*
-runtime_mallocgc(uintptr size, uintptr typ, uint32 flag)
-{
-	M *m;
-	G *g;
-	int32 sizeclass;
-	uintptr tinysize, size1;
-	intgo rate;
-	MCache *c;
-	MSpan *s;
-	MLink *v, *next;
-	byte *tiny;
-	bool incallback;
-	MStats *pmstats;
-
-	if(size == 0) {
-		// All 0-length allocations use this pointer.
-		// The language does not require the allocations to
-		// have distinct values.
-		return runtime_getZerobase();
-	}
-
-	g = runtime_g();
-	m = g->m;
-
-	incallback = false;
-	if(m->mcache == nil && m->ncgo > 0) {
-		// For gccgo this case can occur when a cgo or SWIG function
-		// has an interface return type and the function
-		// returns a non-pointer, so memory allocation occurs
-		// after syscall.Cgocall but before syscall.CgocallDone.
-		// We treat it as a callback.
-		runtime_exitsyscall(0);
-		m = runtime_m();
-		incallback = true;
-		flag |= FlagNoInvokeGC;
-	}
-
-	if((g->preempt || runtime_gcwaiting()) && g != m->g0 && m->locks == 0 && !(flag & FlagNoInvokeGC) && m->preemptoff.len == 0) {
-		g->preempt = false;
-		runtime_gosched();
-		m = runtime_m();
-	}
-	if(m->mallocing)
-		runtime_throw("malloc/free - deadlock");
-	// Disable preemption during settype.
-	// We can not use m->mallocing for this, because settype calls mallocgc.
-	m->locks++;
-	m->mallocing = 1;
-
-	if(DebugTypeAtBlockEnd)
-		size += sizeof(uintptr);
-
-	c = m->mcache;
-	if(!runtime_debug.efence && size <= MaxSmallSize) {
-		if((flag&(FlagNoScan|FlagNoGC)) == FlagNoScan && size < TinySize) {
-			// Tiny allocator.
-			//
-			// Tiny allocator combines several tiny allocation requests
-			// into a single memory block. The resulting memory block
-			// is freed when all subobjects are unreachable. The subobjects
-			// must be FlagNoScan (don't have pointers), this ensures that
-			// the amount of potentially wasted memory is bounded.
-			//
-			// Size of the memory block used for combining (TinySize) is tunable.
-			// Current setting is 16 bytes, which relates to 2x worst case memory
-			// wastage (when all but one subobjects are unreachable).
-			// 8 bytes would result in no wastage at all, but provides less
-			// opportunities for combining.
-			// 32 bytes provides more opportunities for combining,
-			// but can lead to 4x worst case wastage.
-			// The best-case gain is 8x regardless of block size.
-			//
-			// Objects obtained from tiny allocator must not be freed explicitly.
-			// So when an object will be freed explicitly, we ensure that
-			// its size >= TinySize.
-			//
-			// SetFinalizer has a special case for objects potentially coming
-			// from the tiny allocator; in such a case it allows setting finalizers
-			// for an inner byte of a memory block.
-			//
-			// The main targets of tiny allocator are small strings and
-			// standalone escaping variables. On a json benchmark
-			// the allocator reduces number of allocations by ~12% and
-			// reduces heap size by ~20%.
-
-			tinysize = c->tinysize;
-			if(size <= tinysize) {
-				tiny = c->tiny;
-				// Align tiny pointer for required (conservative) alignment.
-				if((size&7) == 0)
-					tiny = (byte*)ROUND((uintptr)tiny, 8);
-				else if((size&3) == 0)
-					tiny = (byte*)ROUND((uintptr)tiny, 4);
-				else if((size&1) == 0)
-					tiny = (byte*)ROUND((uintptr)tiny, 2);
-				size1 = size + (tiny - (byte*)c->tiny);
-				if(size1 <= tinysize) {
-					// The object fits into existing tiny block.
-					v = (MLink*)tiny;
-					c->tiny = (byte*)c->tiny + size1;
-					c->tinysize -= size1;
-					m->mallocing = 0;
-					m->locks--;
-					if(incallback)
-						runtime_entersyscall(0);
-					return v;
-				}
-			}
-			// Allocate a new TinySize block.
-			s = c->alloc[TinySizeClass];
-			if(s->freelist == nil)
-				s = runtime_MCache_Refill(c, TinySizeClass);
-			v = s->freelist;
-			next = v->next;
-			s->freelist = next;
-			s->ref++;
-			if(next != nil)  // prefetching nil leads to a DTLB miss
-				PREFETCH(next);
-			((uint64*)v)[0] = 0;
-			((uint64*)v)[1] = 0;
-			// See if we need to replace the existing tiny block with the new one
-			// based on amount of remaining free space.
-			if(TinySize-size > tinysize) {
-				c->tiny = (byte*)v + size;
-				c->tinysize = TinySize - size;
-			}
-			size = TinySize;
-			goto done;
-		}
-		// Allocate from mcache free lists.
-		// Inlined version of SizeToClass().
-		if(size <= 1024-8)
-			sizeclass = runtime_size_to_class8[(size+7)>>3];
-		else
-			sizeclass = runtime_size_to_class128[(size-1024+127) >> 7];
-		size = runtime_class_to_size[sizeclass];
-		s = c->alloc[sizeclass];
-		if(s->freelist == nil)
-			s = runtime_MCache_Refill(c, sizeclass);
-		v = s->freelist;
-		next = v->next;
-		s->freelist = next;
-		s->ref++;
-		if(next != nil)  // prefetching nil leads to a DTLB miss
-			PREFETCH(next);
-		if(!(flag & FlagNoZero)) {
-			v->next = nil;
-			// block is zeroed iff second word is zero ...
-			if(size > 2*sizeof(uintptr) && ((uintptr*)v)[1] != 0)
-				runtime_memclr((byte*)v, size);
-		}
-	done:
-		c->local_cachealloc += size;
-	} else {
-		// Allocate directly from heap.
-		s = largealloc(flag, &size);
-		v = (void*)(s->start << PageShift);
-	}
-
-	if(flag & FlagNoGC)
-		runtime_marknogc(v);
-	else if(!(flag & FlagNoScan))
-		runtime_markscan(v);
-
-	if(DebugTypeAtBlockEnd)
-		*(uintptr*)((uintptr)v+size-sizeof(uintptr)) = typ;
-
-	m->mallocing = 0;
-	// TODO: save type even if FlagNoScan?  Potentially expensive but might help
-	// heap profiling/tracing.
-	if(UseSpanType && !(flag & FlagNoScan) && typ != 0)
-		settype(s, v, typ);
-
-	if(runtime_debug.allocfreetrace)
-		runtime_tracealloc(v, size, typ);
-
-	if(!(flag & FlagNoProfiling) && (rate = runtime_MemProfileRate) > 0) {
-		if(size < (uintptr)rate && size < (uintptr)(uint32)c->next_sample)
-			c->next_sample -= size;
-		else
-			runtime_profilealloc(v, size);
-	}
-
-	m->locks--;
-
-	pmstats = mstats();
-	if(!(flag & FlagNoInvokeGC) && pmstats->heap_alloc >= pmstats->next_gc)
-		runtime_gc(0);
-
-	if(incallback)
-		runtime_entersyscall(0);
-
-	return v;
-}
-
-static MSpan*
-largealloc(uint32 flag, uintptr *sizep)
-{
-	uintptr npages, size;
-	MSpan *s;
-	void *v;
-
-	// Allocate directly from heap.
-	size = *sizep;
-	if(size + PageSize < size)
-		runtime_throw("out of memory");
-	npages = size >> PageShift;
-	if((size & PageMask) != 0)
-		npages++;
-	s = runtime_MHeap_Alloc(&runtime_mheap, npages, 0, 1, !(flag & FlagNoZero));
-	if(s == nil)
-		runtime_throw("out of memory");
-	s->limit = (uintptr)((byte*)(s->start<<PageShift) + size);
-	*sizep = npages<<PageShift;
-	v = (void*)(s->start << PageShift);
-	// setup for mark sweep
-	runtime_markspan(v, 0, 0, true);
-	return s;
-}
-
-static void
-runtime_profilealloc(void *v, uintptr size)
-{
-	uintptr rate;
-	int32 next;
-	MCache *c;
-
-	c = runtime_m()->mcache;
-	rate = runtime_MemProfileRate;
-	if(size < rate) {
-		// pick next profile time
-		// If you change this, also change allocmcache.
-		if(rate > 0x3fffffff)	// make 2*rate not overflow
-			rate = 0x3fffffff;
-		next = runtime_fastrand() % (2*rate);
-		// Subtract the "remainder" of the current allocation.
-		// Otherwise objects that are close in size to sampling rate
-		// will be under-sampled, because we consistently discard this remainder.
-		next -= (size - c->next_sample);
-		if(next < 0)
-			next = 0;
-		c->next_sample = next;
-	}
-	runtime_MProf_Malloc(v, size);
-}
-
-int32
-runtime_mlookup(void *v, byte **base, uintptr *size, MSpan **sp)
-{
-	M *m;
-	uintptr n, i;
-	byte *p;
-	MSpan *s;
-
-	m = runtime_m();
-
-	m->mcache->local_nlookup++;
-	if (sizeof(void*) == 4 && m->mcache->local_nlookup >= (1<<30)) {
-		// purge cache stats to prevent overflow
-		runtime_lock(&runtime_mheap);
-		runtime_purgecachedstats(m->mcache);
-		runtime_unlock(&runtime_mheap);
-	}
-
-	s = runtime_MHeap_LookupMaybe(&runtime_mheap, v);
-	if(sp)
-		*sp = s;
-	if(s == nil) {
-		runtime_checkfreed(v, 1);
-		if(base)
-			*base = nil;
-		if(size)
-			*size = 0;
-		return 0;
-	}
-
-	p = (byte*)((uintptr)s->start<<PageShift);
-	if(s->sizeclass == 0) {
-		// Large object.
-		if(base)
-			*base = p;
-		if(size)
-			*size = s->npages<<PageShift;
-		return 1;
-	}
-
-	n = s->elemsize;
-	if(base) {
-		i = ((byte*)v - p)/n;
-		*base = p + i*n;
-	}
-	if(size)
-		*size = n;
-
-	return 1;
-}
-
-void
-runtime_purgecachedstats(MCache *c)
-{
-	MHeap *h;
-	int32 i;
-
-	// Protected by either heap or GC lock.
-	h = &runtime_mheap;
-	mstats()->heap_alloc += (intptr)c->local_cachealloc;
-	c->local_cachealloc = 0;
-	mstats()->nlookup += c->local_nlookup;
-	c->local_nlookup = 0;
-	h->largefree += c->local_largefree;
-	c->local_largefree = 0;
-	h->nlargefree += c->local_nlargefree;
-	c->local_nlargefree = 0;
-	for(i=0; i<(int32)nelem(c->local_nsmallfree); i++) {
-		h->nsmallfree[i] += c->local_nsmallfree[i];
-		c->local_nsmallfree[i] = 0;
-	}
-}
-
-// Initialized in mallocinit because it's defined in go/runtime/mem.go.
-
-#define MaxArena32 (2U<<30)
-
-void
-runtime_mallocinit(void)
-{
-	byte *p, *p1;
-	uintptr arena_size, bitmap_size, spans_size, p_size;
-	uintptr *pend;
-	uintptr end;
-	uintptr limit;
-	uint64 i;
-	bool reserved;
-
-	p = nil;
-	p_size = 0;
-	arena_size = 0;
-	bitmap_size = 0;
-	spans_size = 0;
-	reserved = false;
-
-	// for 64-bit build
-	USED(p);
-	USED(p_size);
-	USED(arena_size);
-	USED(bitmap_size);
-	USED(spans_size);
-
-	runtime_InitSizes();
-
-	if(runtime_class_to_size[TinySizeClass] != TinySize)
-		runtime_throw("bad TinySizeClass");
-
-	// limit = runtime_memlimit();
-	// See https://code.google.com/p/go/issues/detail?id=5049
-	// TODO(rsc): Fix after 1.1.
-	limit = 0;
-
-	// Set up the allocation arena, a contiguous area of memory where
-	// allocated data will be found.  The arena begins with a bitmap large
-	// enough to hold 4 bits per allocated word.
-	if(sizeof(void*) == 8 && (limit == 0 || limit > (1<<30))) {
-		// On a 64-bit machine, allocate from a single contiguous reservation.
-		// 128 GB (MaxMem) should be big enough for now.
-		//
-		// The code will work with the reservation at any address, but ask
-		// SysReserve to use 0x0000XXc000000000 if possible (XX=00...7f).
-		// Allocating a 128 GB region takes away 37 bits, and the amd64
-		// doesn't let us choose the top 17 bits, so that leaves the 11 bits
-		// in the middle of 0x00c0 for us to choose.  Choosing 0x00c0 means
-		// that the valid memory addresses will begin 0x00c0, 0x00c1, ..., 0x00df.
-		// In little-endian, that's c0 00, c1 00, ..., df 00. None of those are valid
-		// UTF-8 sequences, and they are otherwise as far away from 
-		// ff (likely a common byte) as possible.  If that fails, we try other 0xXXc0
-		// addresses.  An earlier attempt to use 0x11f8 caused out of memory errors
-		// on OS X during thread allocations.  0x00c0 causes conflicts with
-		// AddressSanitizer which reserves all memory up to 0x0100.
-		// These choices are both for debuggability and to reduce the
-		// odds of the conservative garbage collector not collecting memory
-		// because some non-pointer block of memory had a bit pattern
-		// that matched a memory address.
-		//
-		// Actually we reserve 136 GB (because the bitmap ends up being 8 GB)
-		// but it hardly matters: e0 00 is not valid UTF-8 either.
-		//
-		// If this fails we fall back to the 32 bit memory mechanism
-		arena_size = MaxMem;
-		bitmap_size = arena_size / (sizeof(void*)*8/4);
-		spans_size = arena_size / PageSize * sizeof(runtime_mheap.spans[0]);
-		spans_size = ROUND(spans_size, PageSize);
-		for(i = 0; i < HeapBaseOptions; i++) {
-			p = HeapBase(i);
-			p_size = bitmap_size + spans_size + arena_size + PageSize;
-			p = runtime_SysReserve(p, p_size, &reserved);
-			if(p != nil)
-				break;
-		}
-	}
-	if (p == nil) {
-		// On a 32-bit machine, we can't typically get away
-		// with a giant virtual address space reservation.
-		// Instead we map the memory information bitmap
-		// immediately after the data segment, large enough
-		// to handle another 2GB of mappings (256 MB),
-		// along with a reservation for another 512 MB of memory.
-		// When that gets used up, we'll start asking the kernel
-		// for any memory anywhere and hope it's in the 2GB
-		// following the bitmap (presumably the executable begins
-		// near the bottom of memory, so we'll have to use up
-		// most of memory before the kernel resorts to giving out
-		// memory before the beginning of the text segment).
-		//
-		// Alternatively we could reserve 512 MB bitmap, enough
-		// for 4GB of mappings, and then accept any memory the
-		// kernel threw at us, but normally that's a waste of 512 MB
-		// of address space, which is probably too much in a 32-bit world.
-		bitmap_size = MaxArena32 / (sizeof(void*)*8/4);
-		arena_size = 512<<20;
-		spans_size = MaxArena32 / PageSize * sizeof(runtime_mheap.spans[0]);
-		if(limit > 0 && arena_size+bitmap_size+spans_size > limit) {
-			bitmap_size = (limit / 9) & ~((1<<PageShift) - 1);
-			arena_size = bitmap_size * 8;
-			spans_size = arena_size / PageSize * sizeof(runtime_mheap.spans[0]);
-		}
-		spans_size = ROUND(spans_size, PageSize);
-
-		// SysReserve treats the address we ask for, end, as a hint,
-		// not as an absolute requirement.  If we ask for the end
-		// of the data segment but the operating system requires
-		// a little more space before we can start allocating, it will
-		// give out a slightly higher pointer.  Except QEMU, which
-		// is buggy, as usual: it won't adjust the pointer upward.
-		// So adjust it upward a little bit ourselves: 1/4 MB to get
-		// away from the running binary image and then round up
-		// to a MB boundary.
-
-#ifdef _AIX
-		// mmap address ranges start at 0x30000000 on AIX for 32-bit processes
-		end = 0x30000000U;
-#else
-		end = 0;
-		pend = &__go_end;
-		if(pend != nil)
-			end = *pend;
-#endif
-		p = (byte*)ROUND(end + (1<<18), 1<<20);
-		p_size = bitmap_size + spans_size + arena_size + PageSize;
-		p = runtime_SysReserve(p, p_size, &reserved);
-		if(p == nil)
-			runtime_throw("runtime: cannot reserve arena virtual address space");
-	}
-
-	// PageSize can be larger than OS definition of page size,
-	// so SysReserve can give us a PageSize-unaligned pointer.
-	// To overcome this we ask for PageSize more and round up the pointer.
-	p1 = (byte*)ROUND((uintptr)p, PageSize);
-
-	runtime_mheap.spans = (MSpan**)p1;
-	runtime_mheap.bitmap = p1 + spans_size;
-	runtime_mheap.arena_start = p1 + spans_size + bitmap_size;
-	runtime_mheap.arena_used = runtime_mheap.arena_start;
-	runtime_mheap.arena_end = p + p_size;
-	runtime_mheap.arena_reserved = reserved;
-
-	if(((uintptr)runtime_mheap.arena_start & (PageSize-1)) != 0)
-		runtime_throw("misrounded allocation in mallocinit");
-
-	// Initialize the rest of the allocator.	
-	runtime_MHeap_Init(&runtime_mheap);
-	runtime_m()->mcache = runtime_allocmcache();
-}
-
-void*
-runtime_MHeap_SysAlloc(MHeap *h, uintptr n)
-{
-	byte *p, *p_end;
-	uintptr p_size;
-	bool reserved;
-
-
-	if(n > (uintptr)(h->arena_end - h->arena_used)) {
-		// We are in 32-bit mode, maybe we didn't use all possible address space yet.
-		// Reserve some more space.
-		byte *new_end;
-
-		p_size = ROUND(n + PageSize, 256<<20);
-		new_end = h->arena_end + p_size;
-		if(new_end <= h->arena_start + MaxArena32) {
-			// TODO: It would be bad if part of the arena
-			// is reserved and part is not.
-			p = runtime_SysReserve(h->arena_end, p_size, &reserved);
-			if(p == h->arena_end) {
-				h->arena_end = new_end;
-				h->arena_reserved = reserved;
-			}
-			else if(p+p_size <= h->arena_start + MaxArena32) {
-				// Keep everything page-aligned.
-				// Our pages are bigger than hardware pages.
-				h->arena_end = p+p_size;
-				h->arena_used = p + (-(uintptr)p&(PageSize-1));
-				h->arena_reserved = reserved;
-			} else {
-				uint64 stat;
-				stat = 0;
-				runtime_SysFree(p, p_size, &stat);
-			}
-		}
-	}
-	if(n <= (uintptr)(h->arena_end - h->arena_used)) {
-		// Keep taking from our reservation.
-		p = h->arena_used;
-		runtime_SysMap(p, n, h->arena_reserved, &mstats()->heap_sys);
-		h->arena_used += n;
-		runtime_MHeap_MapBits(h);
-		runtime_MHeap_MapSpans(h);
-		
-		if(((uintptr)p & (PageSize-1)) != 0)
-			runtime_throw("misrounded allocation in MHeap_SysAlloc");
-		return p;
-	}
-	
-	// If using 64-bit, our reservation is all we have.
-	if((uintptr)(h->arena_end - h->arena_start) >= MaxArena32)
-		return nil;
-
-	// On 32-bit, once the reservation is gone we can
-	// try to get memory at a location chosen by the OS
-	// and hope that it is in the range we allocated bitmap for.
-	p_size = ROUND(n, PageSize) + PageSize;
-	p = runtime_SysAlloc(p_size, &mstats()->heap_sys);
-	if(p == nil)
-		return nil;
-
-	if(p < h->arena_start || (uintptr)(p+p_size - h->arena_start) >= MaxArena32) {
-		runtime_printf("runtime: memory allocated by OS (%p) not in usable range [%p,%p)\n",
-			p, h->arena_start, h->arena_start+MaxArena32);
-		runtime_SysFree(p, p_size, &mstats()->heap_sys);
-		return nil;
-	}
-	
-	p_end = p + p_size;
-	p += -(uintptr)p & (PageSize-1);
-	if(p+n > h->arena_used) {
-		h->arena_used = p+n;
-		if(p_end > h->arena_end)
-			h->arena_end = p_end;
-		runtime_MHeap_MapBits(h);
-		runtime_MHeap_MapSpans(h);
-	}
-	
-	if(((uintptr)p & (PageSize-1)) != 0)
-		runtime_throw("misrounded allocation in MHeap_SysAlloc");
-	return p;
-}
-
-static struct
-{
-	Lock;
-	byte*	pos;
-	byte*	end;
-} persistent;
-
-enum
-{
-	PersistentAllocChunk	= 256<<10,
-	PersistentAllocMaxBlock	= 64<<10,  // VM reservation granularity is 64K on windows
-};
-
-// Wrapper around SysAlloc that can allocate small chunks.
-// There is no associated free operation.
-// Intended for things like function/type/debug-related persistent data.
-// If align is 0, uses default align (currently 8).
-void*
-runtime_persistentalloc(uintptr size, uintptr align, uint64 *stat)
-{
-	byte *p;
-
-	if(align != 0) {
-		if(align&(align-1))
-			runtime_throw("persistentalloc: align is not a power of 2");
-		if(align > PageSize)
-			runtime_throw("persistentalloc: align is too large");
-	} else
-		align = 8;
-	if(size >= PersistentAllocMaxBlock)
-		return runtime_SysAlloc(size, stat);
-	runtime_lock(&persistent);
-	persistent.pos = (byte*)ROUND((uintptr)persistent.pos, align);
-	if(persistent.pos + size > persistent.end) {
-		persistent.pos = runtime_SysAlloc(PersistentAllocChunk, &mstats()->other_sys);
-		if(persistent.pos == nil) {
-			runtime_unlock(&persistent);
-			runtime_throw("runtime: cannot allocate memory");
-		}
-		persistent.end = persistent.pos + PersistentAllocChunk;
-	}
-	p = persistent.pos;
-	persistent.pos += size;
-	runtime_unlock(&persistent);
-	if(stat != &mstats()->other_sys) {
-		// reaccount the allocation against provided stat
-		runtime_xadd64(stat, size);
-		runtime_xadd64(&mstats()->other_sys, -(uint64)size);
-	}
-	return p;
-}
-
-static void
-settype(MSpan *s, void *v, uintptr typ)
-{
-	uintptr size, ofs, j, t;
-	uintptr ntypes, nbytes2, nbytes3;
-	uintptr *data2;
-	byte *data3;
-
-	if(s->sizeclass == 0) {
-		s->types.compression = MTypes_Single;
-		s->types.data = typ;
-		return;
-	}
-	size = s->elemsize;
-	ofs = ((uintptr)v - (s->start<<PageShift)) / size;
-
-	switch(s->types.compression) {
-	case MTypes_Empty:
-		ntypes = (s->npages << PageShift) / size;
-		nbytes3 = 8*sizeof(uintptr) + 1*ntypes;
-		data3 = runtime_mallocgc(nbytes3, 0, FlagNoProfiling|FlagNoScan|FlagNoInvokeGC);
-		s->types.compression = MTypes_Bytes;
-		s->types.data = (uintptr)data3;
-		((uintptr*)data3)[1] = typ;
-		data3[8*sizeof(uintptr) + ofs] = 1;
-		break;
-		
-	case MTypes_Words:
-		((uintptr*)s->types.data)[ofs] = typ;
-		break;
-		
-	case MTypes_Bytes:
-		data3 = (byte*)s->types.data;
-		for(j=1; j<8; j++) {
-			if(((uintptr*)data3)[j] == typ) {
-				break;
-			}
-			if(((uintptr*)data3)[j] == 0) {
-				((uintptr*)data3)[j] = typ;
-				break;
-			}
-		}
-		if(j < 8) {
-			data3[8*sizeof(uintptr) + ofs] = j;
-		} else {
-			ntypes = (s->npages << PageShift) / size;
-			nbytes2 = ntypes * sizeof(uintptr);
-			data2 = runtime_mallocgc(nbytes2, 0, FlagNoProfiling|FlagNoScan|FlagNoInvokeGC);
-			s->types.compression = MTypes_Words;
-			s->types.data = (uintptr)data2;
-			
-			// Move the contents of data3 to data2. Then deallocate data3.
-			for(j=0; j<ntypes; j++) {
-				t = data3[8*sizeof(uintptr) + j];
-				t = ((uintptr*)data3)[t];
-				data2[j] = t;
-			}
-			data2[ofs] = typ;
-		}
-		break;
-	}
-}
-
-uintptr
-runtime_gettype(void *v)
-{
-	MSpan *s;
-	uintptr t, ofs;
-	byte *data;
-
-	s = runtime_MHeap_LookupMaybe(&runtime_mheap, v);
-	if(s != nil) {
-		t = 0;
-		switch(s->types.compression) {
-		case MTypes_Empty:
-			break;
-		case MTypes_Single:
-			t = s->types.data;
-			break;
-		case MTypes_Words:
-			ofs = (uintptr)v - (s->start<<PageShift);
-			t = ((uintptr*)s->types.data)[ofs/s->elemsize];
-			break;
-		case MTypes_Bytes:
-			ofs = (uintptr)v - (s->start<<PageShift);
-			data = (byte*)s->types.data;
-			t = data[8*sizeof(uintptr) + ofs/s->elemsize];
-			t = ((uintptr*)data)[t];
-			break;
-		default:
-			runtime_throw("runtime_gettype: invalid compression kind");
-		}
-		if(0) {
-			runtime_printf("%p -> %d,%X\n", v, (int32)s->types.compression, (int64)t);
-		}
-		return t;
-	}
-	return 0;
-}
-
-// Runtime stubs.
-
-void*
-runtime_mal(uintptr n)
-{
-	return runtime_mallocgc(n, 0, 0);
-}
-
-func new(typ *Type) (ret *uint8) {
-	ret = runtime_mallocgc(typ->__size, (uintptr)typ | TypeInfo_SingleObject, typ->kind&kindNoPointers ? FlagNoScan : 0);
-}
-
-static void*
-runtime_docnew(const Type *typ, intgo n, int32 objtyp)
-{
-	if((objtyp&(PtrSize-1)) != objtyp)
-		runtime_throw("runtime: invalid objtyp");
-	if(n < 0 || (typ->__size > 0 && (uintptr)n > (MaxMem/typ->__size)))
-		runtime_panicstring("runtime: allocation size out of range");
-	return runtime_mallocgc(typ->__size*n, (uintptr)typ | objtyp, typ->kind&kindNoPointers ? FlagNoScan : 0);
-}
-
-// same as runtime_new, but callable from C
-void*
-runtime_cnew(const Type *typ)
-{
-	return runtime_docnew(typ, 1, TypeInfo_SingleObject);
-}
-
-void*
-runtime_cnewarray(const Type *typ, intgo n)
-{
-	return runtime_docnew(typ, n, TypeInfo_Array);
-}
-
-func GC() {
-	runtime_gc(2);  // force GC and do eager sweep
-}
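
malloc.goc is replaced by the Go 1.8 malloc.go, which keeps the same overall shape: tiny and small objects come from a per-thread (per-P in Go 1.8) cache chosen by size class, and large objects come straight from the heap. The deleted fast path above inlines SizeToClass using two tables, 8-byte buckets up to 1016 bytes and 128-byte buckets beyond that; the sketch below reproduces only the index computation, since the table contents are built at startup by runtime_InitSizes and are not shown here.

package main

import "fmt"

// sizeToClassIndex mirrors the deleted inlined lookup:
//   size <= 1024-8 -> size_to_class8[(size+7)>>3]
//   otherwise      -> size_to_class128[(size-1024+127)>>7]
func sizeToClassIndex(size uintptr) (table string, index uintptr) {
	if size <= 1024-8 {
		return "size_to_class8", (size + 7) >> 3
	}
	return "size_to_class128", (size - 1024 + 127) >> 7
}

func main() {
	for _, sz := range []uintptr{1, 8, 9, 1016, 1017, 32 << 10} {
		t, i := sizeToClassIndex(sz)
		fmt.Printf("size %5d -> %s[%d]\n", sz, t, i)
	}
}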
diff --git a/libgo/runtime/malloc.h b/libgo/runtime/malloc.h
deleted file mode 100644
index abc8fc6..0000000
--- a/libgo/runtime/malloc.h
+++ /dev/null
@@ -1,542 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Memory allocator, based on tcmalloc.
-// http://goog-perftools.sourceforge.net/doc/tcmalloc.html
-
-// The main allocator works in runs of pages.
-// Small allocation sizes (up to and including 32 kB) are
-// rounded to one of about 100 size classes, each of which
-// has its own free list of objects of exactly that size.
-// Any free page of memory can be split into a set of objects
-// of one size class, which are then managed using free list
-// allocators.
-//
-// The allocator's data structures are:
-//
-//	FixAlloc: a free-list allocator for fixed-size objects,
-//		used to manage storage used by the allocator.
-//	MHeap: the malloc heap, managed at page (4096-byte) granularity.
-//	MSpan: a run of pages managed by the MHeap.
-//	MCentral: a shared free list for a given size class.
-//	MCache: a per-thread (in Go, per-P) cache for small objects.
-//	MStats: allocation statistics.
-//
-// Allocating a small object proceeds up a hierarchy of caches:
-//
-//	1. Round the size up to one of the small size classes
-//	   and look in the corresponding MCache free list.
-//	   If the list is not empty, allocate an object from it.
-//	   This can all be done without acquiring a lock.
-//
-//	2. If the MCache free list is empty, replenish it by
-//	   taking a bunch of objects from the MCentral free list.
-//	   Moving a bunch amortizes the cost of acquiring the MCentral lock.
-//
-//	3. If the MCentral free list is empty, replenish it by
-//	   allocating a run of pages from the MHeap and then
-//	   chopping that memory into objects of the given size.
-//	   Allocating many objects amortizes the cost of locking
-//	   the heap.
-//
-//	4. If the MHeap is empty or has no page runs large enough,
-//	   allocate a new group of pages (at least 1MB) from the
-//	   operating system.  Allocating a large run of pages
-//	   amortizes the cost of talking to the operating system.
-//
-// Freeing a small object proceeds up the same hierarchy:
-//
-//	1. Look up the size class for the object and add it to
-//	   the MCache free list.
-//
-//	2. If the MCache free list is too long or the MCache has
-//	   too much memory, return some to the MCentral free lists.
-//
-//	3. If all the objects in a given span have returned to
-//	   the MCentral list, return that span to the page heap.
-//
-//	4. If the heap has too much memory, return some to the
-//	   operating system.
-//
-//	TODO(rsc): Step 4 is not implemented.
-//
-// Allocating and freeing a large object uses the page heap
-// directly, bypassing the MCache and MCentral free lists.
-//
-// The small objects on the MCache and MCentral free lists
-// may or may not be zeroed.  They are zeroed if and only if
-// the second word of the object is zero.  A span in the
-// page heap is zeroed unless s->needzero is set. When a span
-// is allocated to break into small objects, it is zeroed if needed
-// and s->needzero is set. There are two main benefits to delaying the
-// zeroing this way:
-//
-//	1. stack frames allocated from the small object lists
-//	   or the page heap can avoid zeroing altogether.
-//	2. the cost of zeroing when reusing a small object is
-//	   charged to the mutator, not the garbage collector.
-//
-// This C code was written with an eye toward translating to Go
-// in the future.  Methods have the form Type_Method(Type *t, ...).
-
-typedef struct MCentral	MCentral;
-typedef struct MHeap	MHeap;
-typedef struct mspan	MSpan;
-typedef struct mstats	MStats;
-typedef struct mlink	MLink;
-typedef struct mtypes	MTypes;
-typedef struct gcstats	GCStats;
-
-enum
-{
-	PageShift	= 13,
-	PageSize	= 1<<PageShift,
-	PageMask	= PageSize - 1,
-};
-typedef	uintptr	PageID;		// address >> PageShift
-
-enum
-{
-	// Computed constant.  The definition of MaxSmallSize and the
-	// algorithm in msize.c produce some number of different allocation
-	// size classes.  _NumSizeClasses is that number.  It's needed here
-	// because there are static arrays of this length; when msize runs its
-	// size choosing algorithm it double-checks that NumSizeClasses agrees.
-	// _NumSizeClasses is defined in runtime2.go as 67.
-
-	// Tunable constants.
-	MaxSmallSize = 32<<10,
-
-	// Tiny allocator parameters, see "Tiny allocator" comment in malloc.goc.
-	TinySize = 16,
-	TinySizeClass = 2,
-
-	FixAllocChunk = 16<<10,		// Chunk size for FixAlloc
-	MaxMHeapList = 1<<(20 - PageShift),	// Maximum page length for fixed-size list in MHeap.
-	HeapAllocChunk = 1<<20,		// Chunk size for heap growth
-
-	// Number of bits in page to span calculations (4k pages).
-	// On Windows 64-bit we limit the arena to 32GB or 35 bits (see below for reason).
-	// On other 64-bit platforms, we limit the arena to 128GB, or 37 bits.
-	// On 32-bit, we don't bother limiting anything, so we use the full 32-bit address.
-#if __SIZEOF_POINTER__ == 8
-#ifdef GOOS_windows
-	// Windows counts memory used by page table into committed memory
-	// of the process, so we can't reserve too much memory.
-	// See http://golang.org/issue/5402 and http://golang.org/issue/5236.
-	MHeapMap_Bits = 35 - PageShift,
-#else
-	MHeapMap_Bits = 37 - PageShift,
-#endif
-#else
-	MHeapMap_Bits = 32 - PageShift,
-#endif
-};
-
-// Maximum memory allocation size, a hint for callers.
-// This must be a #define instead of an enum because it
-// is so large.
-#if __SIZEOF_POINTER__ == 8
-#define	MaxMem	(1ULL<<(MHeapMap_Bits+PageShift))	/* 128 GB or 32 GB */
-#else
-#define	MaxMem	((uintptr)-1)
-#endif
-// SysAlloc obtains a large chunk of zeroed memory from the
-// operating system, typically on the order of a hundred kilobytes
-// or a megabyte.
-// NOTE: SysAlloc returns OS-aligned memory, but the heap allocator
-// may use larger alignment, so the caller must be careful to realign the
-// memory obtained by SysAlloc.
-//
-// SysUnused notifies the operating system that the contents
-// of the memory region are no longer needed and can be reused
-// for other purposes.
-// SysUsed notifies the operating system that the contents
-// of the memory region are needed again.
-//
-// SysFree returns it unconditionally; this is only used if
-// an out-of-memory error has been detected midway through
-// an allocation.  It is okay if SysFree is a no-op.
-//
-// SysReserve reserves address space without allocating memory.
-// If the pointer passed to it is non-nil, the caller wants the
-// reservation there, but SysReserve can still choose another
-// location if that one is unavailable.  On some systems and in some
-// cases SysReserve will simply check that the address space is
-// available and not actually reserve it.  If SysReserve returns
-// non-nil, it sets *reserved to true if the address space is
-// reserved, false if it has merely been checked.
-// NOTE: SysReserve returns OS-aligned memory, but the heap allocator
-// may use larger alignment, so the caller must be careful to realign the
-// memory obtained by SysReserve.
-//
-// SysMap maps previously reserved address space for use.
-// The reserved argument is true if the address space was really
-// reserved, not merely checked.
-//
-// SysFault marks a (already SysAlloc'd) region to fault
-// if accessed.  Used only for debugging the runtime.
-
-void*	runtime_SysAlloc(uintptr nbytes, uint64 *stat)
-  __asm__ (GOSYM_PREFIX "runtime.sysAlloc");
-void	runtime_SysFree(void *v, uintptr nbytes, uint64 *stat)
-  __asm__ (GOSYM_PREFIX "runtime.sysFree");
-void	runtime_SysUnused(void *v, uintptr nbytes);
-void	runtime_SysUsed(void *v, uintptr nbytes);
-void	runtime_SysMap(void *v, uintptr nbytes, bool reserved, uint64 *stat);
-void*	runtime_SysReserve(void *v, uintptr nbytes, bool *reserved);
-void	runtime_SysFault(void *v, uintptr nbytes);
-
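For orientation, the reserve/map split described above corresponds to the usual
two-step mmap pattern.  A minimal standalone sketch, assuming a POSIX-like
system where MAP_ANONYMOUS and MAP_NORESERVE are available (not code from this
patch):

#include <stddef.h>
#include <sys/mman.h>

// Reserve address space without committing memory (SysReserve's job on
// systems where a real reservation is possible).
static void *
sketch_reserve(void *hint, size_t n)
{
	void *p = mmap(hint, n, PROT_NONE,
		       MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0);
	return p == MAP_FAILED ? NULL : p;
}

// Back a previously reserved range with usable pages at the same
// address (SysMap's job when the reservation was real).
static int
sketch_map(void *v, size_t n)
{
	void *p = mmap(v, n, PROT_READ|PROT_WRITE,
		       MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0);
	return p == v ? 0 : -1;
}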
-// FixAlloc is a simple free-list allocator for fixed size objects.
-// Malloc uses a FixAlloc wrapped around SysAlloc to manage its
-// MCache and MSpan objects.
-//
-// Memory returned by FixAlloc_Alloc is not zeroed.
-// The caller is responsible for locking around FixAlloc calls.
-// Callers can keep state in the object but the first word is
-// smashed by freeing and reallocating.
-struct FixAlloc
-{
-	uintptr	size;
-	void	(*first)(void *arg, byte *p);	// called first time p is returned
-	void*	arg;
-	MLink*	list;
-	byte*	chunk;
-	uint32	nchunk;
-	uintptr	inuse;	// in-use bytes now
-	uint64*	stat;
-};
-
-void	runtime_FixAlloc_Init(FixAlloc *f, uintptr size, void (*first)(void*, byte*), void *arg, uint64 *stat);
-void*	runtime_FixAlloc_Alloc(FixAlloc *f);
-void	runtime_FixAlloc_Free(FixAlloc *f, void *p);
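As a rough illustration of the free-list scheme described above (the first
word of a freed object is reused as the link), here is a minimal standalone
sketch; malloc stands in for the runtime's chunk source, and the names are
illustrative rather than the runtime's:

#include <stddef.h>
#include <stdlib.h>

// Minimal fixed-size free-list allocator in the style described above.
// size must be at least sizeof(void*); a zeroed struct is a valid
// initial state.  Returned memory is not zeroed.
typedef struct sketch_fixalloc {
	size_t size;     // object size
	void *list;      // free list; first word of each object is the link
	char *chunk;     // current chunk being carved up
	size_t nchunk;   // bytes left in chunk
} sketch_fixalloc;

static void *
sketch_fixalloc_alloc(sketch_fixalloc *f)
{
	void *v;

	if(f->list != NULL) {
		v = f->list;
		f->list = *(void**)v;   // the link written by free
		return v;
	}
	if(f->nchunk < f->size) {
		f->chunk = malloc(16<<10);   // stands in for persistentalloc
		if(f->chunk == NULL)
			return NULL;
		f->nchunk = 16<<10;
	}
	v = f->chunk;
	f->chunk += f->size;
	f->nchunk -= f->size;
	return v;
}

static void
sketch_fixalloc_free(sketch_fixalloc *f, void *p)
{
	*(void**)p = f->list;   // smashes the object's first word
	f->list = p;
}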
-
-extern MStats *mstats(void)
-  __asm__ (GOSYM_PREFIX "runtime.getMstats");
-void	runtime_updatememstats(GCStats *stats)
-  __asm__ (GOSYM_PREFIX "runtime.updatememstats");
-
-// Size classes.  Computed and initialized by InitSizes.
-//
-// SizeToClass(0 <= n <= MaxSmallSize) returns the size class,
-//	1 <= sizeclass < _NumSizeClasses, for n.
-//	Size class 0 is reserved to mean "not small".
-//
-// class_to_size[i] = largest size in class i
-// class_to_allocnpages[i] = number of pages to allocate when
-//	making new objects in class i
-
-int32	runtime_SizeToClass(int32);
-uintptr	runtime_roundupsize(uintptr)
-  __asm__(GOSYM_PREFIX "runtime.roundupsize");
-extern	int32	runtime_class_to_size[_NumSizeClasses];
-extern	int32	runtime_class_to_allocnpages[_NumSizeClasses];
-extern	int8	runtime_size_to_class8[1024/8 + 1];
-extern	int8	runtime_size_to_class128[(MaxSmallSize-1024)/128 + 1];
-extern	void	runtime_InitSizes(void);
-
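A rough sketch of how the two lookup tables declared above are meant to be
used: sizes up to 1024 bytes are classified in 8-byte steps, larger small
sizes in 128-byte steps.  Illustrative only; the table contents come from
InitSizes:

#include <stddef.h>

static int
sketch_size_to_class(size_t n, const signed char *class8, const signed char *class128)
{
	if(n <= 1024)
		return class8[(n+7)>>3];
	return class128[(n-1024+127)>>7];
}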
-
-typedef struct mcachelist MCacheList;
-
-MSpan*	runtime_MCache_Refill(MCache *c, int32 sizeclass);
-void	runtime_MCache_Free(MCache *c, MLink *p, int32 sizeclass, uintptr size);
-void	runtime_MCache_ReleaseAll(MCache *c);
-
-// MTypes describes the types of blocks allocated within a span.
-// The compression field describes the layout of the data.
-//
-// MTypes_Empty:
-//     All blocks are free, or no type information is available for
-//     allocated blocks.
-//     The data field has no meaning.
-// MTypes_Single:
-//     The span contains just one block.
-//     The data field holds the type information.
-//     The sysalloc field has no meaning.
-// MTypes_Words:
-//     The span contains multiple blocks.
-//     The data field points to an array of type [NumBlocks]uintptr,
-//     and each element of the array holds the type of the corresponding
-//     block.
-// MTypes_Bytes:
-//     The span contains at most seven different types of blocks.
-//     The data field points to the following structure:
-//         struct {
-//             type  [8]uintptr       // type[0] is always 0
-//             index [NumBlocks]byte
-//         }
-//     The type of the i-th block is: data.type[data.index[i]]
-enum
-{
-	MTypes_Empty = 0,
-	MTypes_Single = 1,
-	MTypes_Words = 2,
-	MTypes_Bytes = 3,
-};
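The MTypes_Bytes layout described above boils down to a one-line lookup; a
hypothetical sketch with illustrative names:

#include <stdint.h>

struct sketch_mtypes_bytes {
	uintptr_t type[8];        // type[0] is always 0
	unsigned char index[];    // NumBlocks entries
};

// Type of the i-th block in a span whose compression is MTypes_Bytes.
static uintptr_t
sketch_block_type(const struct sketch_mtypes_bytes *d, int i)
{
	return d->type[d->index[i]];
}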
-
-enum
-{
-	KindSpecialFinalizer = 1,
-	KindSpecialProfile = 2,
-	// Note: The finalizer special must be first because if we're freeing
-	// an object, a finalizer special will cause the freeing operation
-	// to abort, and we want to keep the other special records around
-	// if that happens.
-};
-
-typedef struct special Special;
-
-// The described object has a finalizer set for it.
-typedef struct SpecialFinalizer SpecialFinalizer;
-struct SpecialFinalizer
-{
-	Special;
-	FuncVal*	fn;
-	const FuncType*	ft;
-	const PtrType*	ot;
-};
-
-// The described object is being heap profiled.
-typedef struct bucket Bucket; // from mprof.go
-typedef struct SpecialProfile SpecialProfile;
-struct SpecialProfile
-{
-	Special;
-	Bucket*	b;
-};
-
-// An MSpan is a run of pages.
-enum
-{
-	MSpanInUse = 0,
-	MSpanFree,
-	MSpanListHead,
-	MSpanDead,
-};
-
-void	runtime_MSpan_Init(MSpan *span, PageID start, uintptr npages);
-void	runtime_MSpan_EnsureSwept(MSpan *span);
-bool	runtime_MSpan_Sweep(MSpan *span);
-
-// Every MSpan is in one doubly-linked list,
-// either one of the MHeap's free lists or one of the
-// MCentral's span lists.  We use empty MSpan structures as list heads.
-void	runtime_MSpanList_Init(MSpan *list);
-bool	runtime_MSpanList_IsEmpty(MSpan *list);
-void	runtime_MSpanList_Insert(MSpan *list, MSpan *span);
-void	runtime_MSpanList_InsertBack(MSpan *list, MSpan *span);
-void	runtime_MSpanList_Remove(MSpan *span);	// from whatever list it is in
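The sentinel-headed list operations declared above amount to a small circular
doubly-linked list with an empty node as the head.  An illustrative standalone
sketch (Span here is a stand-in, not the real MSpan):

#include <stdbool.h>
#include <stddef.h>

typedef struct sketch_span {
	struct sketch_span *next;
	struct sketch_span *prev;
} sketch_span;

static void
span_list_init(sketch_span *list)
{
	list->next = list;
	list->prev = list;
}

static bool
span_list_is_empty(sketch_span *list)
{
	return list->next == list;
}

static void
span_list_insert(sketch_span *list, sketch_span *s)
{
	// Insert at the front of the list.
	s->next = list->next;
	s->prev = list;
	list->next->prev = s;
	list->next = s;
}

static void
span_list_remove(sketch_span *s)
{
	// Works no matter which list s is on.
	s->prev->next = s->next;
	s->next->prev = s->prev;
	s->next = s->prev = NULL;
}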
-
-
-// Central list of free objects of a given size.
-struct MCentral
-{
-	Lock;
-	int32 sizeclass;
-	MSpan nonempty;	// list of spans with a free object
-	MSpan mempty;	// list of spans with no free objects (or cached in an MCache)
-	int32 nfree;	// # of objects available in nonempty spans
-};
-
-void	runtime_MCentral_Init(MCentral *c, int32 sizeclass);
-MSpan*	runtime_MCentral_CacheSpan(MCentral *c);
-void	runtime_MCentral_UncacheSpan(MCentral *c, MSpan *s);
-bool	runtime_MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end);
-void	runtime_MCentral_FreeList(MCentral *c, MLink *start); // TODO: need this?
-
-// Main malloc heap.
-// The heap itself is the "free[]" and "large" arrays,
-// but all the other global data is here too.
-struct MHeap
-{
-	Lock;
-	MSpan free[MaxMHeapList];	// free lists of given length
-	MSpan freelarge;		// free lists length >= MaxMHeapList
-	MSpan busy[MaxMHeapList];	// busy lists of large objects of given length
-	MSpan busylarge;		// busy lists of large objects length >= MaxMHeapList
-	MSpan **allspans;		// all spans out there
-	MSpan **sweepspans;		// copy of allspans referenced by sweeper
-	uint32	nspan;
-	uint32	nspancap;
-	uint32	sweepgen;		// sweep generation, see comment in MSpan
-	uint32	sweepdone;		// all spans are swept
-
-	// span lookup
-	MSpan**	spans;
-	uintptr	spans_mapped;
-
-	// range of addresses we might see in the heap
-	byte *bitmap;
-	uintptr bitmap_mapped;
-	byte *arena_start;
-	byte *arena_used;
-	byte *arena_end;
-	bool arena_reserved;
-
-	// central free lists for small size classes.
-	// the padding makes sure that the MCentrals are
-	// spaced CacheLineSize bytes apart, so that each MCentral.Lock
-	// gets its own cache line.
-	struct {
-		MCentral;
-		byte pad[64];
-	} central[_NumSizeClasses];
-
-	FixAlloc spanalloc;	// allocator for Span*
-	FixAlloc cachealloc;	// allocator for MCache*
-	FixAlloc specialfinalizeralloc;	// allocator for SpecialFinalizer*
-	FixAlloc specialprofilealloc;	// allocator for SpecialProfile*
-	Lock speciallock; // lock for special record allocators.
-
-	// Malloc stats.
-	uint64 largefree;	// bytes freed for large objects (>MaxSmallSize)
-	uint64 nlargefree;	// number of frees for large objects (>MaxSmallSize)
-	uint64 nsmallfree[_NumSizeClasses];	// number of frees for small objects (<=MaxSmallSize)
-};
-extern MHeap runtime_mheap;
-
-void	runtime_MHeap_Init(MHeap *h);
-MSpan*	runtime_MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, bool large, bool needzero);
-void	runtime_MHeap_Free(MHeap *h, MSpan *s, int32 acct);
-MSpan*	runtime_MHeap_Lookup(MHeap *h, void *v);
-MSpan*	runtime_MHeap_LookupMaybe(MHeap *h, void *v);
-void	runtime_MGetSizeClassInfo(int32 sizeclass, uintptr *size, int32 *npages, int32 *nobj);
-void*	runtime_MHeap_SysAlloc(MHeap *h, uintptr n);
-void	runtime_MHeap_MapBits(MHeap *h);
-void	runtime_MHeap_MapSpans(MHeap *h);
-void	runtime_MHeap_SplitSpan(MHeap *h, MSpan *s);
-
-void*	runtime_mallocgc(uintptr size, uintptr typ, uint32 flag);
-void*	runtime_persistentalloc(uintptr size, uintptr align, uint64 *stat)
-  __asm__(GOSYM_PREFIX "runtime.persistentalloc");
-int32	runtime_mlookup(void *v, byte **base, uintptr *size, MSpan **s);
-void	runtime_gc(int32 force)
-  __asm__(GOSYM_PREFIX "runtime.gc");
-uintptr	runtime_sweepone(void);
-void	runtime_markscan(void *v);
-void	runtime_marknogc(void *v);
-void	runtime_checkallocated(void *v, uintptr n);
-void	runtime_markfreed(void *v);
-void	runtime_checkfreed(void *v, uintptr n);
-extern	int32	runtime_checking;
-void	runtime_markspan(void *v, uintptr size, uintptr n, bool leftover);
-void	runtime_unmarkspan(void *v, uintptr size);
-void	runtime_purgecachedstats(MCache*);
-void*	runtime_cnew(const Type*)
-  __asm__(GOSYM_PREFIX "runtime.newobject");
-void*	runtime_cnewarray(const Type*, intgo)
-  __asm__(GOSYM_PREFIX "runtime.newarray");
-void	runtime_tracealloc(void*, uintptr, uintptr)
-  __asm__ (GOSYM_PREFIX "runtime.tracealloc");
-void	runtime_tracefree(void*, uintptr)
-  __asm__ (GOSYM_PREFIX "runtime.tracefree");
-void	runtime_tracegc(void)
-  __asm__ (GOSYM_PREFIX "runtime.tracegc");
-
-uintptr	runtime_gettype(void*);
-
-enum
-{
-	// flags to malloc
-	FlagNoScan	= 1<<0,	// GC doesn't have to scan object
-	FlagNoProfiling	= 1<<1,	// must not profile
-	FlagNoGC	= 1<<2,	// must not free or scan for pointers
-	FlagNoZero	= 1<<3, // don't zero memory
-	FlagNoInvokeGC	= 1<<4, // don't invoke GC
-};
-
-typedef struct Obj Obj;
-struct Obj
-{
-	byte	*p;	// data pointer
-	uintptr	n;	// size of data in bytes
-	uintptr	ti;	// type info
-};
-
-void	runtime_MProf_Malloc(void*, uintptr)
-  __asm__ (GOSYM_PREFIX "runtime.mProf_Malloc");
-void	runtime_MProf_Free(Bucket*, uintptr, bool)
-  __asm__ (GOSYM_PREFIX "runtime.mProf_Free");
-void	runtime_MProf_GC(void)
-  __asm__ (GOSYM_PREFIX "runtime.mProf_GC");
-void	runtime_iterate_memprof(FuncVal* callback)
-  __asm__ (GOSYM_PREFIX "runtime.iterate_memprof");
-int32	runtime_gcprocs(void)
-  __asm__ (GOSYM_PREFIX "runtime.gcprocs");
-void	runtime_helpgc(int32 nproc)
-  __asm__ (GOSYM_PREFIX "runtime.helpgc");
-void	runtime_gchelper(void)
-  __asm__ (GOSYM_PREFIX "runtime.gchelper");
-
-void	runtime_setprofilebucket(void *p, Bucket *b)
-  __asm__ (GOSYM_PREFIX "runtime.setprofilebucket");
-
-struct __go_func_type;
-struct __go_ptr_type;
-bool	runtime_addfinalizer(void *p, FuncVal *fn, const struct __go_func_type*, const struct __go_ptr_type*)
-  __asm__ (GOSYM_PREFIX "runtime.addfinalizer");
-void	runtime_removefinalizer(void*)
-  __asm__ (GOSYM_PREFIX "runtime.removefinalizer");
-void	runtime_queuefinalizer(void *p, FuncVal *fn, const struct __go_func_type *ft, const struct __go_ptr_type *ot)
-  __asm__ (GOSYM_PREFIX "runtime.queuefinalizer");
-
-void	runtime_freeallspecials(MSpan *span, void *p, uintptr size);
-bool	runtime_freespecial(Special *s, void *p, uintptr size, bool freed);
-
-enum
-{
-	TypeInfo_SingleObject = 0,
-	TypeInfo_Array = 1,
-	TypeInfo_Chan = 2,
-
-	// Enables type information at the end of blocks allocated from the heap
-	DebugTypeAtBlockEnd = 0,
-};
-
-// Information from the compiler about the layout of stack frames.
-typedef struct BitVector BitVector;
-struct BitVector
-{
-	int32 n; // # of bits
-	uint32 *data;
-};
-typedef struct StackMap StackMap;
-struct StackMap
-{
-	int32 n; // number of bitmaps
-	int32 nbit; // number of bits in each bitmap
-	uint32 data[];
-};
-enum {
-	// Pointer map
-	BitsPerPointer = 2,
-	BitsDead = 0,
-	BitsScalar = 1,
-	BitsPointer = 2,
-	BitsMultiWord = 3,
-	// BitsMultiWord will be set for the first word of a multi-word item.
-	// When it is set, one of the following will be set for the second word.
-	BitsString = 0,
-	BitsSlice = 1,
-	BitsIface = 2,
-	BitsEface = 3,
-};
-// Returns pointer map data for the given stackmap index
-// (the index is encoded in PCDATA_StackMapIndex).
-BitVector	runtime_stackmapdata(StackMap *stackmap, int32 n);
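As a hypothetical illustration of how the 2-bit entries defined above are read
out of a packed bitmap word (not the removed code):

#include <stdint.h>

// Each uint32 packs 16 two-bit entries; entry i is one of
// BitsDead, BitsScalar, BitsPointer or BitsMultiWord.
static int
pointer_map_bits(const uint32_t *data, int i)
{
	return (data[i/16] >> ((i%16)*2)) & 3;
}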
-
-// defined in mgc0.go
-void	runtime_gc_m_ptr(Eface*);
-void	runtime_gc_g_ptr(Eface*);
-void	runtime_gc_itab_ptr(Eface*);
-
-void	runtime_memorydump(void);
-int32	runtime_setgcpercent(int32)
-  __asm__ (GOSYM_PREFIX "runtime.setgcpercent");
-
-// Value we use to mark dead pointers when GODEBUG=gcdead=1.
-#define PoisonGC ((uintptr)0xf969696969696969ULL)
-#define PoisonStack ((uintptr)0x6868686868686868ULL)
-
-struct Workbuf;
diff --git a/libgo/runtime/mcache.c b/libgo/runtime/mcache.c
deleted file mode 100644
index 46684bc..0000000
--- a/libgo/runtime/mcache.c
+++ /dev/null
@@ -1,131 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Per-P malloc cache for small objects.
-//
-// See malloc.h for an overview.
-
-#include "runtime.h"
-#include "arch.h"
-#include "malloc.h"
-
-extern volatile intgo runtime_MemProfileRate
-  __asm__ (GOSYM_PREFIX "runtime.MemProfileRate");
-
-// dummy MSpan that contains no free objects.
-static MSpan emptymspan;
-
-MCache*
-runtime_allocmcache(void)
-{
-	intgo rate;
-	MCache *c;
-	int32 i;
-
-	runtime_lock(&runtime_mheap);
-	c = runtime_FixAlloc_Alloc(&runtime_mheap.cachealloc);
-	runtime_unlock(&runtime_mheap);
-	runtime_memclr((byte*)c, sizeof(*c));
-	for(i = 0; i < _NumSizeClasses; i++)
-		c->alloc[i] = &emptymspan;
-
-	// Set first allocation sample size.
-	rate = runtime_MemProfileRate;
-	if(rate > 0x3fffffff)	// make 2*rate not overflow
-		rate = 0x3fffffff;
-	if(rate != 0)
-		c->next_sample = runtime_fastrand() % (2*rate);
-
-	return c;
-}
-
-void
-runtime_freemcache(MCache *c)
-{
-	runtime_MCache_ReleaseAll(c);
-	runtime_lock(&runtime_mheap);
-	runtime_purgecachedstats(c);
-	runtime_FixAlloc_Free(&runtime_mheap.cachealloc, c);
-	runtime_unlock(&runtime_mheap);
-}
-
-// Gets a span that has a free object in it and assigns it
-// to be the cached span for the given sizeclass.  Returns this span.
-MSpan*
-runtime_MCache_Refill(MCache *c, int32 sizeclass)
-{
-	MCacheList *l;
-	MSpan *s;
-
-	runtime_m()->locks++;
-	// Return the current cached span to the central lists.
-	s = c->alloc[sizeclass];
-	if(s->freelist != nil)
-		runtime_throw("refill on a nonempty span");
-	if(s != &emptymspan)
-		runtime_MCentral_UncacheSpan(&runtime_mheap.central[sizeclass], s);
-
-	// Push any explicitly freed objects to the central lists.
-	// Not required, but it seems like a good time to do it.
-	l = &c->free[sizeclass];
-	if(l->nlist > 0) {
-		runtime_MCentral_FreeList(&runtime_mheap.central[sizeclass], l->list);
-		l->list = nil;
-		l->nlist = 0;
-	}
-
-	// Get a new cached span from the central lists.
-	s = runtime_MCentral_CacheSpan(&runtime_mheap.central[sizeclass]);
-	if(s == nil)
-		runtime_throw("out of memory");
-	if(s->freelist == nil) {
-		runtime_printf("%d %d\n", s->ref, (int32)((s->npages << PageShift) / s->elemsize));
-		runtime_throw("empty span");
-	}
-	c->alloc[sizeclass] = s;
-	runtime_m()->locks--;
-	return s;
-}
-
-void
-runtime_MCache_Free(MCache *c, MLink *p, int32 sizeclass, uintptr size)
-{
-	MCacheList *l;
-
-	// Put on free list.
-	l = &c->free[sizeclass];
-	p->next = l->list;
-	l->list = p;
-	l->nlist++;
-
-	// We transfer a span at a time from MCentral to MCache,
-	// so we'll do the same in the other direction.
-	if(l->nlist >= (runtime_class_to_allocnpages[sizeclass]<<PageShift)/size) {
-		runtime_MCentral_FreeList(&runtime_mheap.central[sizeclass], l->list);
-		l->list = nil;
-		l->nlist = 0;
-	}
-}
-
-void
-runtime_MCache_ReleaseAll(MCache *c)
-{
-	int32 i;
-	MSpan *s;
-	MCacheList *l;
-
-	for(i=0; i<_NumSizeClasses; i++) {
-		s = c->alloc[i];
-		if(s != &emptymspan) {
-			runtime_MCentral_UncacheSpan(&runtime_mheap.central[i], s);
-			c->alloc[i] = &emptymspan;
-		}
-		l = &c->free[i];
-		if(l->nlist > 0) {
-			runtime_MCentral_FreeList(&runtime_mheap.central[i], l->list);
-			l->list = nil;
-			l->nlist = 0;
-		}
-	}
-}
diff --git a/libgo/runtime/mcentral.c b/libgo/runtime/mcentral.c
deleted file mode 100644
index 491cac5..0000000
--- a/libgo/runtime/mcentral.c
+++ /dev/null
@@ -1,307 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Central free lists.
-//
-// See malloc.h for an overview.
-//
-// The MCentral doesn't actually contain the list of free objects; the MSpan does.
-// Each MCentral is two lists of MSpans: those with free objects (c->nonempty)
-// and those that are completely allocated (c->mempty).
-//
-// TODO(rsc): tcmalloc uses a "transfer cache" to split the list
-// into sections of class_to_transfercount[sizeclass] objects
-// so that it is faster to move those lists between MCaches and MCentrals.
-
-#include "runtime.h"
-#include "arch.h"
-#include "malloc.h"
-
-static bool MCentral_Grow(MCentral *c);
-static void MCentral_Free(MCentral *c, MLink *v);
-static void MCentral_ReturnToHeap(MCentral *c, MSpan *s);
-
-// Initialize a single central free list.
-void
-runtime_MCentral_Init(MCentral *c, int32 sizeclass)
-{
-	c->sizeclass = sizeclass;
-	runtime_MSpanList_Init(&c->nonempty);
-	runtime_MSpanList_Init(&c->mempty);
-}
-
-// Allocate a span to use in an MCache.
-MSpan*
-runtime_MCentral_CacheSpan(MCentral *c)
-{
-	MSpan *s;
-	int32 cap, n;
-	uint32 sg;
-
-	runtime_lock(c);
-	sg = runtime_mheap.sweepgen;
-retry:
-	for(s = c->nonempty.next; s != &c->nonempty; s = s->next) {
-		if(s->sweepgen == sg-2 && runtime_cas(&s->sweepgen, sg-2, sg-1)) {
-			runtime_unlock(c);
-			runtime_MSpan_Sweep(s);
-			runtime_lock(c);
-			// the span could have been moved to heap, retry
-			goto retry;
-		}
-		if(s->sweepgen == sg-1) {
-			// the span is being swept by background sweeper, skip
-			continue;
-		}
-		// we have a nonempty span that does not require sweeping, allocate from it
-		goto havespan;
-	}
-
-	for(s = c->mempty.next; s != &c->mempty; s = s->next) {
-		if(s->sweepgen == sg-2 && runtime_cas(&s->sweepgen, sg-2, sg-1)) {
-			// we have an empty span that requires sweeping,
-			// sweep it and see if we can free some space in it
-			runtime_MSpanList_Remove(s);
-			// swept spans are at the end of the list
-			runtime_MSpanList_InsertBack(&c->mempty, s);
-			runtime_unlock(c);
-			runtime_MSpan_Sweep(s);
-			runtime_lock(c);
-			// the span could be moved to nonempty or heap, retry
-			goto retry;
-		}
-		if(s->sweepgen == sg-1) {
-			// the span is being swept by background sweeper, skip
-			continue;
-		}
-		// already swept empty span,
-		// all subsequent ones must also be either swept or in process of sweeping
-		break;
-	}
-
-	// Replenish central list if empty.
-	if(!MCentral_Grow(c)) {
-		runtime_unlock(c);
-		return nil;
-	}
-	goto retry;
-
-havespan:
-	cap = (s->npages << PageShift) / s->elemsize;
-	n = cap - s->ref;
-	if(n == 0)
-		runtime_throw("empty span");
-	if(s->freelist == nil)
-		runtime_throw("freelist empty");
-	c->nfree -= n;
-	runtime_MSpanList_Remove(s);
-	runtime_MSpanList_InsertBack(&c->mempty, s);
-	s->incache = true;
-	runtime_unlock(c);
-	return s;
-}
-
-// Return span from an MCache.
-void
-runtime_MCentral_UncacheSpan(MCentral *c, MSpan *s)
-{
-	MLink *v;
-	int32 cap, n;
-
-	runtime_lock(c);
-
-	s->incache = false;
-
-	// Move any explicitly freed items from the freebuf to the freelist.
-	while((v = s->freebuf) != nil) {
-		s->freebuf = v->next;
-		runtime_markfreed(v);
-		v->next = s->freelist;
-		s->freelist = v;
-		s->ref--;
-	}
-
-	if(s->ref == 0) {
-		// Free back to heap.  Unlikely, but possible.
-		MCentral_ReturnToHeap(c, s); // unlocks c
-		return;
-	}
-	
-	cap = (s->npages << PageShift) / s->elemsize;
-	n = cap - s->ref;
-	if(n > 0) {
-		c->nfree += n;
-		runtime_MSpanList_Remove(s);
-		runtime_MSpanList_Insert(&c->nonempty, s);
-	}
-	runtime_unlock(c);
-}
-
-// Free the list of objects back into the central free list c.
-// Called from runtime_free.
-void
-runtime_MCentral_FreeList(MCentral *c, MLink *start)
-{
-	MLink *next;
-
-	runtime_lock(c);
-	for(; start != nil; start = next) {
-		next = start->next;
-		MCentral_Free(c, start);
-	}
-	runtime_unlock(c);
-}
-
-// Helper: free one object back into the central free list.
-// Caller must hold lock on c on entry.  Holds lock on exit.
-static void
-MCentral_Free(MCentral *c, MLink *v)
-{
-	MSpan *s;
-
-	// Find span for v.
-	s = runtime_MHeap_Lookup(&runtime_mheap, v);
-	if(s == nil || s->ref == 0)
-		runtime_throw("invalid free");
-	if(s->sweepgen != runtime_mheap.sweepgen)
-		runtime_throw("free into unswept span");
-	
-	// If the span is currently being used unsynchronized by an MCache,
-	// we can't modify the freelist.  Add to the freebuf instead.  The
-	// items will get moved to the freelist when the span is returned
-	// by the MCache.
-	if(s->incache) {
-		v->next = s->freebuf;
-		s->freebuf = v;
-		return;
-	}
-
-	// Move span to nonempty if necessary.
-	if(s->freelist == nil) {
-		runtime_MSpanList_Remove(s);
-		runtime_MSpanList_Insert(&c->nonempty, s);
-	}
-
-	// Add the object to span's free list.
-	runtime_markfreed(v);
-	v->next = s->freelist;
-	s->freelist = v;
-	s->ref--;
-	c->nfree++;
-
-	// If s is completely freed, return it to the heap.
-	if(s->ref == 0) {
-		MCentral_ReturnToHeap(c, s); // unlocks c
-		runtime_lock(c);
-	}
-}
-
-// Free n objects from a span s back into the central free list c.
-// Called during sweep.
-// Returns true if the span was returned to heap.  Sets sweepgen to
-// the latest generation.
-bool
-runtime_MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end)
-{
-	if(s->incache)
-		runtime_throw("freespan into cached span");
-	runtime_lock(c);
-
-	// Move to nonempty if necessary.
-	if(s->freelist == nil) {
-		runtime_MSpanList_Remove(s);
-		runtime_MSpanList_Insert(&c->nonempty, s);
-	}
-
-	// Add the objects back to s's free list.
-	end->next = s->freelist;
-	s->freelist = start;
-	s->ref -= n;
-	c->nfree += n;
-	
-	// delay updating sweepgen until here.  This is the signal that
-	// the span may be used in an MCache, so it must come after the
-	// linked list operations above (actually, just after the
-	// lock of c above.)
-	runtime_atomicstore(&s->sweepgen, runtime_mheap.sweepgen);
-
-	if(s->ref != 0) {
-		runtime_unlock(c);
-		return false;
-	}
-
-	// s is completely freed, return it to the heap.
-	MCentral_ReturnToHeap(c, s); // unlocks c
-	return true;
-}
-
-void
-runtime_MGetSizeClassInfo(int32 sizeclass, uintptr *sizep, int32 *npagesp, int32 *nobj)
-{
-	int32 size;
-	int32 npages;
-
-	npages = runtime_class_to_allocnpages[sizeclass];
-	size = runtime_class_to_size[sizeclass];
-	*npagesp = npages;
-	*sizep = size;
-	*nobj = (npages << PageShift) / size;
-}
-
-// Fetch a new span from the heap and
-// carve it into objects for the free list.
-static bool
-MCentral_Grow(MCentral *c)
-{
-	int32 i, n, npages;
-	uintptr size;
-	MLink **tailp, *v;
-	byte *p;
-	MSpan *s;
-
-	runtime_unlock(c);
-	runtime_MGetSizeClassInfo(c->sizeclass, &size, &npages, &n);
-	s = runtime_MHeap_Alloc(&runtime_mheap, npages, c->sizeclass, 0, 1);
-	if(s == nil) {
-		// TODO(rsc): Log out of memory
-		runtime_lock(c);
-		return false;
-	}
-
-	// Carve span into sequence of blocks.
-	tailp = &s->freelist;
-	p = (byte*)(s->start << PageShift);
-	s->limit = (uintptr)(p + size*n);
-	for(i=0; i<n; i++) {
-		v = (MLink*)p;
-		*tailp = v;
-		tailp = &v->next;
-		p += size;
-	}
-	*tailp = nil;
-	runtime_markspan((byte*)(s->start<<PageShift), size, n, size*n < (s->npages<<PageShift));
-
-	runtime_lock(c);
-	c->nfree += n;
-	runtime_MSpanList_Insert(&c->nonempty, s);
-	return true;
-}
-
-// Return s to the heap.  s must be unused (s->ref == 0).  Unlocks c.
-static void
-MCentral_ReturnToHeap(MCentral *c, MSpan *s)
-{
-	int32 size;
-
-	size = runtime_class_to_size[c->sizeclass];
-	runtime_MSpanList_Remove(s);
-	s->needzero = 1;
-	s->freelist = nil;
-	if(s->ref != 0)
-		runtime_throw("ref wrong");
-	c->nfree -= (s->npages << PageShift) / size;
-	runtime_unlock(c);
-	runtime_unmarkspan((byte*)(s->start<<PageShift), s->npages<<PageShift);
-	runtime_MHeap_Free(&runtime_mheap, s, 0);
-}
diff --git a/libgo/runtime/mem.c b/libgo/runtime/mem.c
deleted file mode 100644
index 0a8e325..0000000
--- a/libgo/runtime/mem.c
+++ /dev/null
@@ -1,236 +0,0 @@
-/* Defining _XOPEN_SOURCE hides the declaration of madvise() on Solaris <
-   11 and the MADV_DONTNEED definition on IRIX 6.5.  */
-#undef _XOPEN_SOURCE
-
-#include <errno.h>
-#include <unistd.h>
-
-#include "runtime.h"
-#include "arch.h"
-#include "malloc.h"
-
-#ifndef MAP_ANON
-#ifdef MAP_ANONYMOUS
-#define MAP_ANON MAP_ANONYMOUS
-#else
-#define USE_DEV_ZERO
-#define MAP_ANON 0
-#endif
-#endif
-
-#ifndef MAP_NORESERVE
-#define MAP_NORESERVE 0
-#endif
-
-#ifdef USE_DEV_ZERO
-static int dev_zero = -1;
-#endif
-
-static int32
-addrspace_free(void *v __attribute__ ((unused)), uintptr n __attribute__ ((unused)))
-{
-#ifdef HAVE_MINCORE
-	size_t page_size = getpagesize();
-	int32 errval;
-	uintptr chunk;
-	uintptr off;
-	
-	// NOTE: vec must be just 1 byte long here.
-	// Mincore returns ENOMEM if any of the pages are unmapped,
-	// but we want to know that all of the pages are unmapped.
-	// To make these the same, we can only ask about one page
-	// at a time. See golang.org/issue/7476.
-	static byte vec[1];
-
-	errno = 0;
-	for(off = 0; off < n; off += chunk) {
-		chunk = page_size * sizeof vec;
-		if(chunk > (n - off))
-			chunk = n - off;
-		errval = mincore((char*)v + off, chunk, (void*)vec);
-		// ENOMEM means unmapped, which is what we want.
-		// Anything else we assume means the pages are mapped.
-		if(errval == 0 || errno != ENOMEM)
-			return 0;
-	}
-#endif
-	return 1;
-}
-
-static void *
-mmap_fixed(byte *v, uintptr n, int32 prot, int32 flags, int32 fd, uint32 offset)
-{
-	void *p;
-
-	p = runtime_mmap((void *)v, n, prot, flags, fd, offset);
-	if(p != v && addrspace_free(v, n)) {
-		// On some systems, mmap ignores v without
-		// MAP_FIXED, so retry if the address space is free.
-		if(p != MAP_FAILED)
-			runtime_munmap(p, n);
-		p = runtime_mmap((void *)v, n, prot, flags|MAP_FIXED, fd, offset);
-	}
-	return p;
-}
-
-void*
-runtime_SysAlloc(uintptr n, uint64 *stat)
-{
-	void *p;
-	int fd = -1;
-
-#ifdef USE_DEV_ZERO
-	if (dev_zero == -1) {
-		dev_zero = open("/dev/zero", O_RDONLY);
-		if (dev_zero < 0) {
-			runtime_printf("open /dev/zero: errno=%d\n", errno);
-			exit(2);
-		}
-	}
-	fd = dev_zero;
-#endif
-
-	p = runtime_mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, fd, 0);
-	if (p == MAP_FAILED) {
-		if(errno == EACCES) {
-			runtime_printf("runtime: mmap: access denied\n");
-			runtime_printf("if you're running SELinux, enable execmem for this process.\n");
-			exit(2);
-		}
-		if(errno == EAGAIN) {
-			runtime_printf("runtime: mmap: too much locked memory (check 'ulimit -l').\n");
-			runtime_exit(2);
-		}
-		return nil;
-	}
-	runtime_xadd64(stat, n);
-	return p;
-}
-
-void
-runtime_SysUnused(void *v __attribute__ ((unused)), uintptr n __attribute__ ((unused)))
-{
-#ifdef MADV_DONTNEED
-	runtime_madvise(v, n, MADV_DONTNEED);
-#endif
-}
-
-void
-runtime_SysUsed(void *v, uintptr n)
-{
-	USED(v);
-	USED(n);
-}
-
-void
-runtime_SysFree(void *v, uintptr n, uint64 *stat)
-{
-	runtime_xadd64(stat, -(uint64)n);
-	runtime_munmap(v, n);
-}
-
-void
-runtime_SysFault(void *v, uintptr n)
-{
-	int fd = -1;
-
-#ifdef USE_DEV_ZERO
-	if (dev_zero == -1) {
-		dev_zero = open("/dev/zero", O_RDONLY);
-		if (dev_zero < 0) {
-			runtime_printf("open /dev/zero: errno=%d\n", errno);
-			exit(2);
-		}
-	}
-	fd = dev_zero;
-#endif
-
-	runtime_mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE|MAP_FIXED, fd, 0);
-}
-
-void*
-runtime_SysReserve(void *v, uintptr n, bool *reserved)
-{
-	int fd = -1;
-	void *p;
-
-#ifdef USE_DEV_ZERO
-	if (dev_zero == -1) {
-		dev_zero = open("/dev/zero", O_RDONLY);
-		if (dev_zero < 0) {
-			runtime_printf("open /dev/zero: errno=%d\n", errno);
-			exit(2);
-		}
-	}
-	fd = dev_zero;
-#endif
-
-	// On 64-bit, people with ulimit -v set complain if we reserve too
-	// much address space.  Instead, assume that the reservation is okay
-	// if we can reserve at least 64K and check the assumption in SysMap.
-	// Only user-mode Linux (UML) rejects these requests.
-	if(sizeof(void*) == 8 && (n >> 16) > 1LLU<<16) {
-		p = mmap_fixed(v, 64<<10, PROT_NONE, MAP_ANON|MAP_PRIVATE, fd, 0);
-		if (p != v) {
-			runtime_munmap(p, 64<<10);
-			return nil;
-		}
-		runtime_munmap(p, 64<<10);
-		*reserved = false;
-		return v;
-	}
-	
-	// Use the MAP_NORESERVE mmap() flag here because typically most of
-	// this reservation will never be used. It does not make sense to
-	// reserve a huge amount of unneeded swap space. This is important on
-	// systems which do not overcommit memory by default.
-	p = runtime_mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE|MAP_NORESERVE, fd, 0);
-	if(p == MAP_FAILED)
-		return nil;
-	*reserved = true;
-	return p;
-}
-
-void
-runtime_SysMap(void *v, uintptr n, bool reserved, uint64 *stat)
-{
-	void *p;
-	int fd = -1;
-	
-	runtime_xadd64(stat, n);
-
-#ifdef USE_DEV_ZERO
-	if (dev_zero == -1) {
-		dev_zero = open("/dev/zero", O_RDONLY);
-		if (dev_zero < 0) {
-			runtime_printf("open /dev/zero: errno=%d\n", errno);
-			exit(2);
-		}
-	}
-	fd = dev_zero;
-#endif
-
-	// On 64-bit, we don't actually have v reserved, so tread carefully.
-	if(!reserved) {
-		p = mmap_fixed(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, fd, 0);
-		if(p == MAP_FAILED && errno == ENOMEM)
-			runtime_throw("runtime: out of memory");
-		if(p != v) {
-			runtime_printf("runtime: address space conflict: map(%p) = %p\n", v, p);
-			runtime_throw("runtime: address space conflict");
-		}
-		return;
-	}
-
-#ifdef _AIX
-	// AIX does not allow mapping a range that is already mapped.
-	// So always unmap first even if it is already unmapped.
-	runtime_munmap(v, n);
-#endif
-
-	p = runtime_mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, fd, 0);
-	if(p == MAP_FAILED && errno == ENOMEM)
-		runtime_throw("runtime: out of memory");
-	if(p != v)
-		runtime_throw("runtime: cannot map pages in arena address space");
-}
diff --git a/libgo/runtime/mem_posix_memalign.c b/libgo/runtime/mem_posix_memalign.c
deleted file mode 100644
index 853b5c7..0000000
--- a/libgo/runtime/mem_posix_memalign.c
+++ /dev/null
@@ -1,48 +0,0 @@
-#include <errno.h>
-
-#include "runtime.h"
-#include "arch.h"
-#include "malloc.h"
-
-void*
-runtime_SysAlloc(uintptr n)
-{
-	void *p;
-
-	mstats()->sys += n;
-	errno = posix_memalign(&p, PageSize, n);
-	if (errno > 0) {
-		perror("posix_memalign");
-		exit(2);
-	}
-	return p;
-}
-
-void
-runtime_SysUnused(void *v, uintptr n)
-{
-	USED(v);
-	USED(n);
-	// TODO(rsc): call madvise MADV_DONTNEED
-}
-
-void
-runtime_SysFree(void *v, uintptr n)
-{
-	mstats()->sys -= n;
-	free(v);
-}
-
-void*
-runtime_SysReserve(void *v, uintptr n)
-{
-	USED(v);
-	return runtime_SysAlloc(n);
-}
-
-void
-runtime_SysMap(void *v, uintptr n)
-{
-	USED(v);
-	USED(n);
-}
diff --git a/libgo/runtime/mfixalloc.c b/libgo/runtime/mfixalloc.c
deleted file mode 100644
index 9d0b3bb..0000000
--- a/libgo/runtime/mfixalloc.c
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Fixed-size object allocator.  Returned memory is not zeroed.
-//
-// See malloc.h for overview.
-
-#include "runtime.h"
-#include "arch.h"
-#include "malloc.h"
-
-// Initialize f to allocate objects of the given size,
-// using the allocator to obtain chunks of memory.
-void
-runtime_FixAlloc_Init(FixAlloc *f, uintptr size, void (*first)(void*, byte*), void *arg, uint64 *stat)
-{
-	f->size = size;
-	f->first = first;
-	f->arg = arg;
-	f->list = nil;
-	f->chunk = nil;
-	f->nchunk = 0;
-	f->inuse = 0;
-	f->stat = stat;
-}
-
-void*
-runtime_FixAlloc_Alloc(FixAlloc *f)
-{
-	void *v;
-	
-	if(f->size == 0) {
-		runtime_printf("runtime: use of FixAlloc_Alloc before FixAlloc_Init\n");
-		runtime_throw("runtime: internal error");
-	}
-
-	if(f->list) {
-		v = f->list;
-		f->list = *(void**)f->list;
-		f->inuse += f->size;
-		return v;
-	}
-	if(f->nchunk < f->size) {
-		f->chunk = runtime_persistentalloc(FixAllocChunk, 0, f->stat);
-		f->nchunk = FixAllocChunk;
-	}
-	v = f->chunk;
-	if(f->first)
-		f->first(f->arg, v);
-	f->chunk += f->size;
-	f->nchunk -= f->size;
-	f->inuse += f->size;
-	return v;
-}
-
-void
-runtime_FixAlloc_Free(FixAlloc *f, void *p)
-{
-	f->inuse -= f->size;
-	*(void**)p = f->list;
-	f->list = p;
-}
-
diff --git a/libgo/runtime/mgc0.c b/libgo/runtime/mgc0.c
deleted file mode 100644
index 9a15a1d..0000000
--- a/libgo/runtime/mgc0.c
+++ /dev/null
@@ -1,2547 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Garbage collector (GC).
-//
-// GC is:
-// - mark&sweep
-// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc)
-// - parallel (up to _MaxGcproc threads)
-// - partially concurrent (mark is stop-the-world, while sweep is concurrent)
-// - non-moving/non-compacting
-// - full (non-partial)
-//
-// GC rate.
-// Next GC is after we've allocated an extra amount of memory proportional to
-// the amount already in use. The proportion is controlled by GOGC environment variable
-// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
-// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
-// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
-// (and also the amount of extra memory used).
-//
-// Concurrent sweep.
-// The sweep phase proceeds concurrently with normal program execution.
-// The heap is swept span-by-span both lazily (when a goroutine needs another span)
-// and concurrently in a background goroutine (this helps programs that are not CPU bound).
-// However, at the end of the stop-the-world GC phase we don't know the size of the live heap,
-// and so next_gc calculation is tricky and happens as follows.
-// At the end of the stop-the-world phase next_gc is conservatively set based on total
-// heap size; all spans are marked as "needs sweeping".
-// Whenever a span is swept, next_gc is decremented by GOGC*newly_freed_memory.
-// The background sweeper goroutine simply sweeps spans one-by-one bringing next_gc
-// closer to the target value. However, this is not enough to avoid over-allocating memory.
-// Consider that a goroutine wants to allocate a new span for a large object and
-// there are no free swept spans, but there are small-object unswept spans.
-// If the goroutine naively allocates a new span, it can surpass the yet-unknown
-// target next_gc value. In order to prevent such cases (1) when a goroutine needs
-// to allocate a new small-object span, it sweeps small-object spans for the same
-// object size until it frees at least one object; (2) when a goroutine needs to
-// allocate large-object span from heap, it sweeps spans until it frees at least
-// that many pages into heap. Together these two measures ensure that we don't surpass
-// target next_gc value by a large margin. There is an exception: if a goroutine sweeps
-// and frees two nonadjacent one-page spans to the heap, it will allocate a new two-page span,
-// but there can still be other one-page unswept spans which could be combined into a two-page span.
-// It's critical to ensure that no operations proceed on unswept spans (that would corrupt
-// mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
-// so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
-// When a goroutine explicitly frees an object or sets a finalizer, it ensures that
-// the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
-// The finalizer goroutine is kicked off only when all spans are swept.
-// When the next GC starts, it sweeps all not-yet-swept spans (if any).
-
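The pacing rule in the "GC rate" paragraph above reduces to a one-line
formula.  A sketch, assuming gcpercent is the parsed $GOGC value (100 by
default, negative for GOGC=off):

#include <stdint.h>

// Heap size at which the next collection should start: the live heap
// plus GOGC percent of it.
static uint64_t
next_gc_target(uint64_t heap_live, int gcpercent)
{
	if(gcpercent < 0)
		return UINT64_MAX;   // GOGC=off: never trigger on heap growth
	return heap_live + heap_live/100*(uint64_t)gcpercent;
}
// Example: GOGC=100 with 4M live gives a target of roughly 8M.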
-#include <unistd.h>
-
-#include "runtime.h"
-#include "arch.h"
-#include "malloc.h"
-#include "mgc0.h"
-#include "go-type.h"
-
-// Map gccgo field names to gc field names.
-// Slice aka __go_open_array.
-#define array __values
-#define cap __capacity
-// Hmap aka __go_map
-typedef struct __go_map Hmap;
-// Type aka __go_type_descriptor
-#define string __reflection
-// PtrType aka __go_ptr_type
-#define elem __element_type
-
-#ifdef USING_SPLIT_STACK
-
-extern void * __splitstack_find (void *, void *, size_t *, void **, void **,
-				 void **);
-
-extern void * __splitstack_find_context (void *context[10], size_t *, void **,
-					 void **, void **);
-
-#endif
-
-enum {
-	Debug = 0,
-	CollectStats = 0,
-	ConcurrentSweep = 1,
-
-	WorkbufSize	= 16*1024,
-
-	handoffThreshold = 4,
-	IntermediateBufferCapacity = 64,
-
-	// Bits in type information
-	PRECISE = 1,
-	LOOP = 2,
-	PC_BITS = PRECISE | LOOP,
-
-	RootData	= 0,
-	RootBss		= 1,
-	RootFinalizers	= 2,
-	RootSpanTypes	= 3,
-	RootFlushCaches = 4,
-	RootCount	= 5,
-};
-
-#define GcpercentUnknown (-2)
-
-// Initialized from $GOGC.  GOGC=off means no gc.
-static int32 gcpercent = GcpercentUnknown;
-
-static FuncVal* poolcleanup;
-
-void sync_runtime_registerPoolCleanup(FuncVal*)
-  __asm__ (GOSYM_PREFIX "sync.runtime_registerPoolCleanup");
-
-void
-sync_runtime_registerPoolCleanup(FuncVal *f)
-{
-	poolcleanup = f;
-}
-
-static void
-clearpools(void)
-{
-	P *p, **pp;
-	MCache *c;
-	Defer *d, *dlink;
-
-	// clear sync.Pool's
-	if(poolcleanup != nil) {
-		__builtin_call_with_static_chain(poolcleanup->fn(),
-						 poolcleanup);
-	}
-
-	for(pp=runtime_getAllP(); (p=*pp) != nil; pp++) {
-		// clear tinyalloc pool
-		c = p->mcache;
-		if(c != nil) {
-			c->tiny = nil;
-			c->tinysize = 0;
-		}
-	}
-
-	// Clear central defer pools.
-	// Leave per-P pools alone, they have strictly bounded size.
-	runtime_lock(&runtime_sched->deferlock);
-	for(d = runtime_sched->deferpool; d != nil; d = dlink) {
-		dlink = d->link;
-		d->link = nil;
-	}
-	runtime_sched->deferpool = nil;
-	runtime_unlock(&runtime_sched->deferlock);
-}
-
-typedef struct Workbuf Workbuf;
-struct Workbuf
-{
-#define SIZE (WorkbufSize-sizeof(LFNode)-sizeof(uintptr))
-	LFNode  node; // must be first
-	uintptr nobj;
-	Obj     obj[SIZE/sizeof(Obj) - 1];
-	uint8   _padding[SIZE%sizeof(Obj) + sizeof(Obj)];
-#undef SIZE
-};
-
-typedef struct Finalizer Finalizer;
-struct Finalizer
-{
-	FuncVal *fn;
-	void *arg;
-	const struct __go_func_type *ft;
-	const PtrType *ot;
-};
-
-typedef struct finblock FinBlock;
-
-extern FinBlock *runtime_getallfin()
-  __asm__(GOSYM_PREFIX "runtime.getallfin");
-
-static Lock	gclock;
-
-static void	bgsweep(void*);
-static Workbuf* getempty(Workbuf*);
-static Workbuf* getfull(Workbuf*);
-static void	putempty(Workbuf*);
-static Workbuf* handoff(Workbuf*);
-static void	gchelperstart(void);
-static void	flushallmcaches(void);
-static void	addstackroots(G *gp, Workbuf **wbufp);
-
-static struct {
-	uint64	full;  // lock-free list of full blocks
-	uint64	wempty; // lock-free list of empty blocks
-	byte	pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait
-	uint32	nproc;
-	int64	tstart;
-	volatile uint32	nwait;
-	volatile uint32	ndone;
-	Note	alldone;
-	ParFor	*markfor;
-
-	Lock;
-	byte	*chunk;
-	uintptr	nchunk;
-} work __attribute__((aligned(8)));
-
-enum {
-	GC_DEFAULT_PTR = GC_NUM_INSTR,
-	GC_CHAN,
-
-	GC_NUM_INSTR2
-};
-
-static struct {
-	struct {
-		uint64 sum;
-		uint64 cnt;
-	} ptr;
-	uint64 nbytes;
-	struct {
-		uint64 sum;
-		uint64 cnt;
-		uint64 notype;
-		uint64 typelookup;
-	} obj;
-	uint64 rescan;
-	uint64 rescanbytes;
-	uint64 instr[GC_NUM_INSTR2];
-	uint64 putempty;
-	uint64 getfull;
-	struct {
-		uint64 foundbit;
-		uint64 foundword;
-		uint64 foundspan;
-	} flushptrbuf;
-	struct {
-		uint64 foundbit;
-		uint64 foundword;
-		uint64 foundspan;
-	} markonly;
-	uint32 nbgsweep;
-	uint32 npausesweep;
-} gcstats;
-
-// markonly marks an object. It returns true if the object
-// has been marked by this function, false otherwise.
-// This function doesn't append the object to any buffer.
-static bool
-markonly(const void *obj)
-{
-	byte *p;
-	uintptr *bitp, bits, shift, x, xbits, off, j;
-	MSpan *s;
-	PageID k;
-
-	// Words outside the arena cannot be pointers.
-	if((const byte*)obj < runtime_mheap.arena_start || (const byte*)obj >= runtime_mheap.arena_used)
-		return false;
-
-	// obj may be a pointer to a live object.
-	// Try to find the beginning of the object.
-
-	// Round down to word boundary.
-	obj = (const void*)((uintptr)obj & ~((uintptr)PtrSize-1));
-
-	// Find bits for this word.
-	off = (const uintptr*)obj - (uintptr*)runtime_mheap.arena_start;
-	bitp = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
-	shift = off % wordsPerBitmapWord;
-	xbits = *bitp;
-	bits = xbits >> shift;
-
-	// Pointing at the beginning of a block?
-	if((bits & (bitAllocated|bitBlockBoundary)) != 0) {
-		if(CollectStats)
-			runtime_xadd64(&gcstats.markonly.foundbit, 1);
-		goto found;
-	}
-
-	// Pointing just past the beginning?
-	// Scan backward a little to find a block boundary.
-	for(j=shift; j-->0; ) {
-		if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) {
-			shift = j;
-			bits = xbits>>shift;
-			if(CollectStats)
-				runtime_xadd64(&gcstats.markonly.foundword, 1);
-			goto found;
-		}
-	}
-
-	// Otherwise consult span table to find beginning.
-	// (Manually inlined copy of MHeap_LookupMaybe.)
-	k = (uintptr)obj>>PageShift;
-	x = k;
-	x -= (uintptr)runtime_mheap.arena_start>>PageShift;
-	s = runtime_mheap.spans[x];
-	if(s == nil || k < s->start || (uintptr)obj >= s->limit || s->state != MSpanInUse)
-		return false;
-	p = (byte*)((uintptr)s->start<<PageShift);
-	if(s->sizeclass == 0) {
-		obj = p;
-	} else {
-		uintptr size = s->elemsize;
-		int32 i = ((const byte*)obj - p)/size;
-		obj = p+i*size;
-	}
-
-	// Now that we know the object header, reload bits.
-	off = (const uintptr*)obj - (uintptr*)runtime_mheap.arena_start;
-	bitp = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
-	shift = off % wordsPerBitmapWord;
-	xbits = *bitp;
-	bits = xbits >> shift;
-	if(CollectStats)
-		runtime_xadd64(&gcstats.markonly.foundspan, 1);
-
-found:
-	// Now we have bits, bitp, and shift correct for
-	// obj pointing at the base of the object.
-	// Only care about allocated and not marked.
-	if((bits & (bitAllocated|bitMarked)) != bitAllocated)
-		return false;
-	if(work.nproc == 1)
-		*bitp |= bitMarked<<shift;
-	else {
-		for(;;) {
-			x = *bitp;
-			if(x & (bitMarked<<shift))
-				return false;
-			if(runtime_casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift))))
-				break;
-		}
-	}
-
-	// The object is now marked
-	return true;
-}
-
-// PtrTarget is a structure used by intermediate buffers.
-// The intermediate buffers hold GC data before it
-// is moved/flushed to the work buffer (Workbuf).
-// The size of an intermediate buffer is very small,
-// such as 32 or 64 elements.
-typedef struct PtrTarget PtrTarget;
-struct PtrTarget
-{
-	void *p;
-	uintptr ti;
-};
-
-typedef	struct Scanbuf Scanbuf;
-struct	Scanbuf
-{
-	struct {
-		PtrTarget *begin;
-		PtrTarget *end;
-		PtrTarget *pos;
-	} ptr;
-	struct {
-		Obj *begin;
-		Obj *end;
-		Obj *pos;
-	} obj;
-	Workbuf *wbuf;
-	Obj *wp;
-	uintptr nobj;
-};
-
-typedef struct BufferList BufferList;
-struct BufferList
-{
-	PtrTarget ptrtarget[IntermediateBufferCapacity];
-	Obj obj[IntermediateBufferCapacity];
-	uint32 busy;
-	byte pad[CacheLineSize];
-};
-static BufferList bufferList[_MaxGcproc];
-
-static void enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj);
-
-// flushptrbuf moves data from the PtrTarget buffer to the work buffer.
-// The PtrTarget buffer contains blocks irrespective of whether the blocks have been marked or scanned,
-// while the work buffer contains blocks which have been marked
-// and are prepared to be scanned by the garbage collector.
-//
-// _wp, _wbuf, _nobj are input/output parameters and are specifying the work buffer.
-//
-// A simplified drawing explaining how the todo-list moves from one structure to another:
-//
-//     scanblock
-//  (find pointers)
-//    Obj ------> PtrTarget (pointer targets)
-//     ↑          |
-//     |          |
-//     `----------'
-//     flushptrbuf
-//  (find block start, mark and enqueue)
-static void
-flushptrbuf(Scanbuf *sbuf)
-{
-	byte *p, *arena_start, *obj;
-	uintptr size, *bitp, bits, shift, j, x, xbits, off, nobj, ti, n;
-	MSpan *s;
-	PageID k;
-	Obj *wp;
-	Workbuf *wbuf;
-	PtrTarget *ptrbuf;
-	PtrTarget *ptrbuf_end;
-
-	arena_start = runtime_mheap.arena_start;
-
-	wp = sbuf->wp;
-	wbuf = sbuf->wbuf;
-	nobj = sbuf->nobj;
-
-	ptrbuf = sbuf->ptr.begin;
-	ptrbuf_end = sbuf->ptr.pos;
-	n = ptrbuf_end - sbuf->ptr.begin;
-	sbuf->ptr.pos = sbuf->ptr.begin;
-
-	if(CollectStats) {
-		runtime_xadd64(&gcstats.ptr.sum, n);
-		runtime_xadd64(&gcstats.ptr.cnt, 1);
-	}
-
-	// If buffer is nearly full, get a new one.
-	if(wbuf == nil || nobj+n >= nelem(wbuf->obj)) {
-		if(wbuf != nil)
-			wbuf->nobj = nobj;
-		wbuf = getempty(wbuf);
-		wp = wbuf->obj;
-		nobj = 0;
-
-		if(n >= nelem(wbuf->obj))
-			runtime_throw("ptrbuf has to be smaller than WorkBuf");
-	}
-
-	while(ptrbuf < ptrbuf_end) {
-		obj = ptrbuf->p;
-		ti = ptrbuf->ti;
-		ptrbuf++;
-
-		// obj belongs to interval [mheap.arena_start, mheap.arena_used).
-		if(Debug > 1) {
-			if(obj < runtime_mheap.arena_start || obj >= runtime_mheap.arena_used)
-				runtime_throw("object is outside of mheap");
-		}
-
-		// obj may be a pointer to a live object.
-		// Try to find the beginning of the object.
-
-		// Round down to word boundary.
-		if(((uintptr)obj & ((uintptr)PtrSize-1)) != 0) {
-			obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
-			ti = 0;
-		}
-
-		// Find bits for this word.
-		off = (uintptr*)obj - (uintptr*)arena_start;
-		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = off % wordsPerBitmapWord;
-		xbits = *bitp;
-		bits = xbits >> shift;
-
-		// Pointing at the beginning of a block?
-		if((bits & (bitAllocated|bitBlockBoundary)) != 0) {
-			if(CollectStats)
-				runtime_xadd64(&gcstats.flushptrbuf.foundbit, 1);
-			goto found;
-		}
-
-		ti = 0;
-
-		// Pointing just past the beginning?
-		// Scan backward a little to find a block boundary.
-		for(j=shift; j-->0; ) {
-			if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) {
-				obj = (byte*)obj - (shift-j)*PtrSize;
-				shift = j;
-				bits = xbits>>shift;
-				if(CollectStats)
-					runtime_xadd64(&gcstats.flushptrbuf.foundword, 1);
-				goto found;
-			}
-		}
-
-		// Otherwise consult span table to find beginning.
-		// (Manually inlined copy of MHeap_LookupMaybe.)
-		k = (uintptr)obj>>PageShift;
-		x = k;
-		x -= (uintptr)arena_start>>PageShift;
-		s = runtime_mheap.spans[x];
-		if(s == nil || k < s->start || (uintptr)obj >= s->limit || s->state != MSpanInUse)
-			continue;
-		p = (byte*)((uintptr)s->start<<PageShift);
-		if(s->sizeclass == 0) {
-			obj = p;
-		} else {
-			size = s->elemsize;
-			int32 i = ((byte*)obj - p)/size;
-			obj = p+i*size;
-		}
-
-		// Now that we know the object header, reload bits.
-		off = (uintptr*)obj - (uintptr*)arena_start;
-		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = off % wordsPerBitmapWord;
-		xbits = *bitp;
-		bits = xbits >> shift;
-		if(CollectStats)
-			runtime_xadd64(&gcstats.flushptrbuf.foundspan, 1);
-
-	found:
-		// Now we have bits, bitp, and shift correct for
-		// obj pointing at the base of the object.
-		// Only care about allocated and not marked.
-		if((bits & (bitAllocated|bitMarked)) != bitAllocated)
-			continue;
-		if(work.nproc == 1)
-			*bitp |= bitMarked<<shift;
-		else {
-			for(;;) {
-				x = *bitp;
-				if(x & (bitMarked<<shift))
-					goto continue_obj;
-				if(runtime_casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift))))
-					break;
-			}
-		}
-
-		// If object has no pointers, don't need to scan further.
-		if((bits & bitScan) == 0)
-			continue;
-
-		// Ask span about size class.
-		// (Manually inlined copy of MHeap_Lookup.)
-		x = (uintptr)obj >> PageShift;
-		x -= (uintptr)arena_start>>PageShift;
-		s = runtime_mheap.spans[x];
-
-		PREFETCH(obj);
-
-		*wp = (Obj){obj, s->elemsize, ti};
-		wp++;
-		nobj++;
-	continue_obj:;
-	}
-
-	// If another proc wants a pointer, give it some.
-	if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
-		wbuf->nobj = nobj;
-		wbuf = handoff(wbuf);
-		nobj = wbuf->nobj;
-		wp = wbuf->obj + nobj;
-	}
-
-	sbuf->wp = wp;
-	sbuf->wbuf = wbuf;
-	sbuf->nobj = nobj;
-}
-
-static void
-flushobjbuf(Scanbuf *sbuf)
-{
-	uintptr nobj, off;
-	Obj *wp, obj;
-	Workbuf *wbuf;
-	Obj *objbuf;
-	Obj *objbuf_end;
-
-	wp = sbuf->wp;
-	wbuf = sbuf->wbuf;
-	nobj = sbuf->nobj;
-
-	objbuf = sbuf->obj.begin;
-	objbuf_end = sbuf->obj.pos;
-	sbuf->obj.pos = sbuf->obj.begin;
-
-	while(objbuf < objbuf_end) {
-		obj = *objbuf++;
-
-		// Align obj.b to a word boundary.
-		off = (uintptr)obj.p & (PtrSize-1);
-		if(off != 0) {
-			obj.p += PtrSize - off;
-			obj.n -= PtrSize - off;
-			obj.ti = 0;
-		}
-
-		if(obj.p == nil || obj.n == 0)
-			continue;
-
-		// If buffer is full, get a new one.
-		if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
-			if(wbuf != nil)
-				wbuf->nobj = nobj;
-			wbuf = getempty(wbuf);
-			wp = wbuf->obj;
-			nobj = 0;
-		}
-
-		*wp = obj;
-		wp++;
-		nobj++;
-	}
-
-	// If another proc wants a pointer, give it some.
-	if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
-		wbuf->nobj = nobj;
-		wbuf = handoff(wbuf);
-		nobj = wbuf->nobj;
-		wp = wbuf->obj + nobj;
-	}
-
-	sbuf->wp = wp;
-	sbuf->wbuf = wbuf;
-	sbuf->nobj = nobj;
-}
-
-// Program that scans the whole block and treats every block element as a potential pointer
-static uintptr defaultProg[2] = {PtrSize, GC_DEFAULT_PTR};
-
-// Hchan program
-static uintptr chanProg[2] = {0, GC_CHAN};
-
-// Local variables of a program fragment or loop
-typedef struct GCFrame GCFrame;
-struct GCFrame {
-	uintptr count, elemsize, b;
-	const uintptr *loop_or_ret;
-};
-
-// Sanity check for the derived type info objti.
-static void
-checkptr(void *obj, uintptr objti)
-{
-	uintptr *pc1, type, tisize, i, j, x;
-	const uintptr *pc2;
-	byte *objstart;
-	Type *t;
-	MSpan *s;
-
-	if(!Debug)
-		runtime_throw("checkptr is debug only");
-
-	if((byte*)obj < runtime_mheap.arena_start || (byte*)obj >= runtime_mheap.arena_used)
-		return;
-	type = runtime_gettype(obj);
-	t = (Type*)(type & ~(uintptr)(PtrSize-1));
-	if(t == nil)
-		return;
-	x = (uintptr)obj >> PageShift;
-	x -= (uintptr)(runtime_mheap.arena_start)>>PageShift;
-	s = runtime_mheap.spans[x];
-	objstart = (byte*)((uintptr)s->start<<PageShift);
-	if(s->sizeclass != 0) {
-		i = ((byte*)obj - objstart)/s->elemsize;
-		objstart += i*s->elemsize;
-	}
-	tisize = *(uintptr*)objti;
-	// Sanity check for object size: it should fit into the memory block.
-	if((byte*)obj + tisize > objstart + s->elemsize) {
-		runtime_printf("object of type '%S' at %p/%p does not fit in block %p/%p\n",
-			       *t->string, obj, tisize, objstart, s->elemsize);
-		runtime_throw("invalid gc type info");
-	}
-	if(obj != objstart)
-		return;
-	// If obj points to the beginning of the memory block,
-	// check type info as well.
-	if(t->string == nil ||
-		// Gob allocates unsafe pointers for indirection.
-		(runtime_strcmp((const char *)t->string->str, (const char*)"unsafe.Pointer") &&
-		// Runtime and gc think differently about closures.
-		 runtime_strstr((const char *)t->string->str, (const char*)"struct { F uintptr") != (const char *)t->string->str)) {
-		pc1 = (uintptr*)objti;
-		pc2 = (const uintptr*)t->__gcdata;
-		// A simple best-effort check until first GC_END.
-		for(j = 1; pc1[j] != GC_END && pc2[j] != GC_END; j++) {
-			if(pc1[j] != pc2[j]) {
-				runtime_printf("invalid gc type info for '%s', type info %p [%d]=%p, block info %p [%d]=%p\n",
-					       t->string ? (const int8*)t->string->str : (const int8*)"?", pc1, (int32)j, pc1[j], pc2, (int32)j, pc2[j]);
-				runtime_throw("invalid gc type info");
-			}
-		}
-	}
-}					
-
-// scanblock scans a block of n bytes starting at pointer b for references
-// to other objects, scanning any it finds recursively until there are no
-// unscanned objects left.  Instead of using an explicit recursion, it keeps
-// a work list in the Workbuf* structures and loops in the main function
-// body.  Keeping an explicit work list is easier on the stack allocator and
-// more efficient.
-static void
-scanblock(Workbuf *wbuf, bool keepworking)
-{
-	byte *b, *arena_start, *arena_used;
-	uintptr n, i, end_b, elemsize, size, ti, objti, count, type, nobj;
-	uintptr precise_type, nominal_size;
-	const uintptr *pc, *chan_ret;
-	uintptr chancap;
-	void *obj;
-	const Type *t, *et;
-	Slice *sliceptr;
-	String *stringptr;
-	GCFrame *stack_ptr, stack_top, stack[GC_STACK_CAPACITY+4];
-	BufferList *scanbuffers;
-	Scanbuf sbuf;
-	Eface *eface;
-	Iface *iface;
-	Hchan *chan;
-	const ChanType *chantype;
-	Obj *wp;
-
-	if(sizeof(Workbuf) % WorkbufSize != 0)
-		runtime_throw("scanblock: size of Workbuf is suboptimal");
-
-	// Memory arena parameters.
-	arena_start = runtime_mheap.arena_start;
-	arena_used = runtime_mheap.arena_used;
-
-	stack_ptr = stack+nelem(stack)-1;
-
-	precise_type = false;
-	nominal_size = 0;
-
-	if(wbuf) {
-		nobj = wbuf->nobj;
-		wp = &wbuf->obj[nobj];
-	} else {
-		nobj = 0;
-		wp = nil;
-	}
-
-	// Initialize sbuf
-	scanbuffers = &bufferList[runtime_m()->helpgc];
-
-	sbuf.ptr.begin = sbuf.ptr.pos = &scanbuffers->ptrtarget[0];
-	sbuf.ptr.end = sbuf.ptr.begin + nelem(scanbuffers->ptrtarget);
-
-	sbuf.obj.begin = sbuf.obj.pos = &scanbuffers->obj[0];
-	sbuf.obj.end = sbuf.obj.begin + nelem(scanbuffers->obj);
-
-	sbuf.wbuf = wbuf;
-	sbuf.wp = wp;
-	sbuf.nobj = nobj;
-
-	// (Silence the compiler)
-	chan = nil;
-	chantype = nil;
-	chan_ret = nil;
-
-	goto next_block;
-
-	for(;;) {
-		// Each iteration scans the block b of length n, queueing pointers in
-		// the work buffer.
-
-		if(CollectStats) {
-			runtime_xadd64(&gcstats.nbytes, n);
-			runtime_xadd64(&gcstats.obj.sum, sbuf.nobj);
-			runtime_xadd64(&gcstats.obj.cnt, 1);
-		}
-
-		if(ti != 0) {
-			if(Debug > 1) {
-				runtime_printf("scanblock %p %D ti %p\n", b, (int64)n, ti);
-			}
-			pc = (uintptr*)(ti & ~(uintptr)PC_BITS);
-			precise_type = (ti & PRECISE);
-			stack_top.elemsize = pc[0];
-			if(!precise_type)
-				nominal_size = pc[0];
-			if(ti & LOOP) {
-				stack_top.count = 0;	// 0 means an infinite number of iterations
-				stack_top.loop_or_ret = pc+1;
-			} else {
-				stack_top.count = 1;
-			}
-			if(Debug) {
-				// Simple sanity check for provided type info ti:
-				// The declared size of the object must not be larger than the actual size
-				// (it can be smaller due to interior pointers).
-				// It's difficult to make a comprehensive check due to interior pointers,
-				// reflection, gob, etc.
-				if(pc[0] > n) {
-					runtime_printf("invalid gc type info: type info size %p, block size %p\n", pc[0], n);
-					runtime_throw("invalid gc type info");
-				}
-			}
-		} else if(UseSpanType) {
-			if(CollectStats)
-				runtime_xadd64(&gcstats.obj.notype, 1);
-
-			type = runtime_gettype(b);
-			if(type != 0) {
-				if(CollectStats)
-					runtime_xadd64(&gcstats.obj.typelookup, 1);
-
-				t = (Type*)(type & ~(uintptr)(PtrSize-1));
-				switch(type & (PtrSize-1)) {
-				case TypeInfo_SingleObject:
-					pc = (const uintptr*)t->__gcdata;
-					precise_type = true;  // type information about 'b' is precise
-					stack_top.count = 1;
-					stack_top.elemsize = pc[0];
-					break;
-				case TypeInfo_Array:
-					pc = (const uintptr*)t->__gcdata;
-					if(pc[0] == 0)
-						goto next_block;
-					precise_type = true;  // type information about 'b' is precise
-					stack_top.count = 0;  // 0 means an infinite number of iterations
-					stack_top.elemsize = pc[0];
-					stack_top.loop_or_ret = pc+1;
-					break;
-				case TypeInfo_Chan:
-					chan = (Hchan*)b;
-					chantype = (const ChanType*)t;
-					chan_ret = nil;
-					pc = chanProg;
-					break;
-				default:
-					if(Debug > 1)
-						runtime_printf("scanblock %p %D type %p %S\n", b, (int64)n, type, *t->string);
-					runtime_throw("scanblock: invalid type");
-					return;
-				}
-				if(Debug > 1)
-					runtime_printf("scanblock %p %D type %p %S pc=%p\n", b, (int64)n, type, *t->string, pc);
-			} else {
-				pc = defaultProg;
-				if(Debug > 1)
-					runtime_printf("scanblock %p %D unknown type\n", b, (int64)n);
-			}
-		} else {
-			pc = defaultProg;
-			if(Debug > 1)
-				runtime_printf("scanblock %p %D no span types\n", b, (int64)n);
-		}
-
-		if(IgnorePreciseGC)
-			pc = defaultProg;
-
-		pc++;
-		stack_top.b = (uintptr)b;
-		end_b = (uintptr)b + n - PtrSize;
-
-	for(;;) {
-		if(CollectStats)
-			runtime_xadd64(&gcstats.instr[pc[0]], 1);
-
-		obj = nil;
-		objti = 0;
-		switch(pc[0]) {
-		case GC_PTR:
-			obj = *(void**)(stack_top.b + pc[1]);
-			objti = pc[2];
-			if(Debug > 2)
-				runtime_printf("gc_ptr @%p: %p ti=%p\n", stack_top.b+pc[1], obj, objti);
-			pc += 3;
-			if(Debug)
-				checkptr(obj, objti);
-			break;
-
-		case GC_SLICE:
-			sliceptr = (Slice*)(stack_top.b + pc[1]);
-			if(Debug > 2)
-				runtime_printf("gc_slice @%p: %p/%D/%D\n", sliceptr, sliceptr->array, (int64)sliceptr->__count, (int64)sliceptr->cap);
-			if(sliceptr->cap != 0) {
-				obj = sliceptr->array;
-				// Can't use slice element type for scanning,
-				// because if it points to an array embedded
-				// in the beginning of a struct,
-				// we will scan the whole struct as the slice.
-				// So just obtain type info from heap.
-			}
-			pc += 3;
-			break;
-
-		case GC_APTR:
-			obj = *(void**)(stack_top.b + pc[1]);
-			if(Debug > 2)
-				runtime_printf("gc_aptr @%p: %p\n", stack_top.b+pc[1], obj);
-			pc += 2;
-			break;
-
-		case GC_STRING:
-			stringptr = (String*)(stack_top.b + pc[1]);
-			if(Debug > 2)
-				runtime_printf("gc_string @%p: %p/%D\n", stack_top.b+pc[1], stringptr->str, (int64)stringptr->len);
-			if(stringptr->len != 0)
-				markonly(stringptr->str);
-			pc += 2;
-			continue;
-
-		case GC_EFACE:
-			eface = (Eface*)(stack_top.b + pc[1]);
-			pc += 2;
-			if(Debug > 2)
-				runtime_printf("gc_eface @%p: %p %p\n", stack_top.b+pc[1], eface->_type, eface->data);
-			if(eface->_type == nil)
-				continue;
-
-			// eface->type
-			t = eface->_type;
-			if((const byte*)t >= arena_start && (const byte*)t < arena_used) {
-				union { const Type *tc; Type *tr; } u;
-				u.tc = t;
-				*sbuf.ptr.pos++ = (PtrTarget){u.tr, 0};
-				if(sbuf.ptr.pos == sbuf.ptr.end)
-					flushptrbuf(&sbuf);
-			}
-
-			// eface->data
-			if((byte*)eface->data >= arena_start && (byte*)eface->data < arena_used) {
-				if(__go_is_pointer_type(t)) {
-					if((t->__code & kindNoPointers))
-						continue;
-
-					obj = eface->data;
-					if((t->__code & kindMask) == kindPtr) {
-						// Only use type information if it is a pointer-containing type.
-						// This matches the GC programs written by cmd/gc/reflect.c's
-						// dgcsym1 in case TPTR32/case TPTR64. See rationale there.
-						et = ((const PtrType*)t)->elem;
-						if(!(et->__code & kindNoPointers))
-							objti = (uintptr)((const PtrType*)t)->elem->__gcdata;
-					}
-				} else {
-					obj = eface->data;
-					objti = (uintptr)t->__gcdata;
-				}
-			}
-			break;
-
-		case GC_IFACE:
-			iface = (Iface*)(stack_top.b + pc[1]);
-			pc += 2;
-			if(Debug > 2)
-				runtime_printf("gc_iface @%p: %p/%p %p\n", stack_top.b+pc[1], *(Type**)iface->tab, nil, iface->data);
-			if(iface->tab == nil)
-				continue;
-			
-			// iface->tab
-			if((byte*)iface->tab >= arena_start && (byte*)iface->tab < arena_used) {
-				*sbuf.ptr.pos++ = (PtrTarget){iface->tab, 0};
-				if(sbuf.ptr.pos == sbuf.ptr.end)
-					flushptrbuf(&sbuf);
-			}
-
-			// iface->data
-			if((byte*)iface->data >= arena_start && (byte*)iface->data < arena_used) {
-				t = *(Type**)iface->tab;
-				if(__go_is_pointer_type(t)) {
-					if((t->__code & kindNoPointers))
-						continue;
-
-					obj = iface->data;
-					if((t->__code & kindMask) == kindPtr) {
-						// Only use type information if it is a pointer-containing type.
-						// This matches the GC programs written by cmd/gc/reflect.c's
-						// dgcsym1 in case TPTR32/case TPTR64. See rationale there.
-						et = ((const PtrType*)t)->elem;
-						if(!(et->__code & kindNoPointers))
-							objti = (uintptr)((const PtrType*)t)->elem->__gcdata;
-					}
-				} else {
-					obj = iface->data;
-					objti = (uintptr)t->__gcdata;
-				}
-			}
-			break;
-
-		case GC_DEFAULT_PTR:
-			while(stack_top.b <= end_b) {
-				obj = *(byte**)stack_top.b;
-				if(Debug > 2)
-					runtime_printf("gc_default_ptr @%p: %p\n", stack_top.b, obj);
-				stack_top.b += PtrSize;
-				if((byte*)obj >= arena_start && (byte*)obj < arena_used) {
-					*sbuf.ptr.pos++ = (PtrTarget){obj, 0};
-					if(sbuf.ptr.pos == sbuf.ptr.end)
-						flushptrbuf(&sbuf);
-				}
-			}
-			goto next_block;
-
-		case GC_END:
-			if(--stack_top.count != 0) {
-				// Next iteration of a loop if possible.
-				stack_top.b += stack_top.elemsize;
-				if(stack_top.b + stack_top.elemsize <= end_b+PtrSize) {
-					pc = stack_top.loop_or_ret;
-					continue;
-				}
-				i = stack_top.b;
-			} else {
-				// Stack pop if possible.
-				if(stack_ptr+1 < stack+nelem(stack)) {
-					pc = stack_top.loop_or_ret;
-					stack_top = *(++stack_ptr);
-					continue;
-				}
-				i = (uintptr)b + nominal_size;
-			}
-			if(!precise_type) {
-				// Quickly scan [b+i,b+n) for possible pointers.
-				for(; i<=end_b; i+=PtrSize) {
-					if(*(byte**)i != nil) {
-						// Found a value that may be a pointer.
-						// Do a rescan of the entire block.
-						enqueue((Obj){b, n, 0}, &sbuf.wbuf, &sbuf.wp, &sbuf.nobj);
-						if(CollectStats) {
-							runtime_xadd64(&gcstats.rescan, 1);
-							runtime_xadd64(&gcstats.rescanbytes, n);
-						}
-						break;
-					}
-				}
-			}
-			goto next_block;
-
-		case GC_ARRAY_START:
-			i = stack_top.b + pc[1];
-			count = pc[2];
-			elemsize = pc[3];
-			pc += 4;
-
-			// Stack push.
-			*stack_ptr-- = stack_top;
-			stack_top = (GCFrame){count, elemsize, i, pc};
-			continue;
-
-		case GC_ARRAY_NEXT:
-			if(--stack_top.count != 0) {
-				stack_top.b += stack_top.elemsize;
-				pc = stack_top.loop_or_ret;
-			} else {
-				// Stack pop.
-				stack_top = *(++stack_ptr);
-				pc += 1;
-			}
-			continue;
-
-		case GC_CALL:
-			// Stack push.
-			*stack_ptr-- = stack_top;
-			stack_top = (GCFrame){1, 0, stack_top.b + pc[1], pc+3 /*return address*/};
-			pc = (const uintptr*)((const byte*)pc + *(const int32*)(pc+2));  // target of the CALL instruction
-			continue;
-
-		case GC_REGION:
-			obj = (void*)(stack_top.b + pc[1]);
-			size = pc[2];
-			objti = pc[3];
-			pc += 4;
-
-			if(Debug > 2)
-				runtime_printf("gc_region @%p: %D %p\n", stack_top.b+pc[1], (int64)size, objti);
-			*sbuf.obj.pos++ = (Obj){obj, size, objti};
-			if(sbuf.obj.pos == sbuf.obj.end)
-				flushobjbuf(&sbuf);
-			continue;
-
-		case GC_CHAN_PTR:
-			chan = *(Hchan**)(stack_top.b + pc[1]);
-			if(Debug > 2 && chan != nil)
-				runtime_printf("gc_chan_ptr @%p: %p/%D/%D %p\n", stack_top.b+pc[1], chan, (int64)chan->qcount, (int64)chan->dataqsiz, pc[2]);
-			if(chan == nil) {
-				pc += 3;
-				continue;
-			}
-			if(markonly(chan)) {
-				chantype = (ChanType*)pc[2];
-				if(!(chantype->elem->__code & kindNoPointers)) {
-					// Start chanProg.
-					chan_ret = pc+3;
-					pc = chanProg+1;
-					continue;
-				}
-			}
-			pc += 3;
-			continue;
-
-		case GC_CHAN:
-			// There are no heap pointers in struct Hchan,
-			// so we can ignore the leading sizeof(Hchan) bytes.
-			if(!(chantype->elem->__code & kindNoPointers)) {
-				chancap = chan->dataqsiz;
-				if(chancap > 0 && markonly(chan->buf)) {
-					// TODO(atom): split into two chunks so that only the
-					// in-use part of the circular buffer is scanned.
-					// (Channel routines zero the unused part, so the current
-					// code does not lead to leaks, it's just a little inefficient.)
-					*sbuf.obj.pos++ = (Obj){chan->buf, chancap*chantype->elem->__size,
-						(uintptr)chantype->elem->__gcdata | PRECISE | LOOP};
-					if(sbuf.obj.pos == sbuf.obj.end)
-						flushobjbuf(&sbuf);
-				}
-			}
-			if(chan_ret == nil)
-				goto next_block;
-			pc = chan_ret;
-			continue;
-
-		default:
-			runtime_printf("runtime: invalid GC instruction %p at %p\n", pc[0], pc);
-			runtime_throw("scanblock: invalid GC instruction");
-			return;
-		}
-
-		if((byte*)obj >= arena_start && (byte*)obj < arena_used) {
-			*sbuf.ptr.pos++ = (PtrTarget){obj, objti};
-			if(sbuf.ptr.pos == sbuf.ptr.end)
-				flushptrbuf(&sbuf);
-		}
-	}
-
-	next_block:
-		// Done scanning [b, b+n).  Prepare for the next iteration of
-		// the loop by setting b, n, ti to the parameters for the next block.
-
-		if(sbuf.nobj == 0) {
-			flushptrbuf(&sbuf);
-			flushobjbuf(&sbuf);
-
-			if(sbuf.nobj == 0) {
-				if(!keepworking) {
-					if(sbuf.wbuf)
-						putempty(sbuf.wbuf);
-					return;
-				}
-				// Emptied our buffer: refill.
-				sbuf.wbuf = getfull(sbuf.wbuf);
-				if(sbuf.wbuf == nil)
-					return;
-				sbuf.nobj = sbuf.wbuf->nobj;
-				sbuf.wp = sbuf.wbuf->obj + sbuf.wbuf->nobj;
-			}
-		}
-
-		// Fetch b from the work buffer.
-		--sbuf.wp;
-		b = sbuf.wp->p;
-		n = sbuf.wp->n;
-		ti = sbuf.wp->ti;
-		sbuf.nobj--;
-	}
-}
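The GC_DEFAULT_PTR case above is the conservative fallback: every pointer-aligned word of the block whose value lands inside the heap arena is queued as a possible pointer. A minimal stand-alone sketch of that word scan, with invented arena bounds and a print standing in for flushptrbuf:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uintptr_t arena_start, arena_used;	/* stand-ins for the real arena bounds */

/* Stand-in for queueing a candidate through flushptrbuf(). */
static void enqueue_candidate(void *p)
{
	printf("candidate pointer %p\n", p);
}

/* Scan every pointer-aligned word in [b, b+n); any value that falls
   inside the arena is treated as a possible pointer. */
static void scan_conservatively(void *b, size_t n)
{
	uintptr_t w;

	for(w = (uintptr_t)b; w + sizeof(void*) <= (uintptr_t)b + n; w += sizeof(void*)) {
		void *obj = *(void**)w;
		if((uintptr_t)obj >= arena_start && (uintptr_t)obj < arena_used)
			enqueue_candidate(obj);
	}
}

int main(void)
{
	void *heapobj = malloc(64);
	struct { long x; void *p; double d; } block = { 1, heapobj, 2.0 };

	/* Pretend the arena covers just this one allocation. */
	arena_start = (uintptr_t)heapobj;
	arena_used = arena_start + 64;

	scan_conservatively(&block, sizeof block);
	free(heapobj);
	return 0;
}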
-
-static struct gcRootList* roots;
-
-void
-__go_register_gc_roots (struct gcRootList* r)
-{
-	// FIXME: This needs locking if multiple goroutines can call
-	// dlopen simultaneously.
-	r->next = roots;
-	roots = r;
-}
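For illustration only, here is a self-contained sketch of this registration scheme together with the walk that markroot's RootData case performs below. The struct layouts (next, count, roots[].decl, roots[].ptrdata) are guessed from how the fields are used in this file, not taken from the compiler's real definitions:

#include <stddef.h>
#include <stdio.h>

struct gcRoot {
	void *decl;	/* address of the global variable */
	size_t ptrdata;	/* bytes of it that may hold pointers */
};

struct gcRootList {
	struct gcRootList *next;
	int count;
	struct gcRoot roots[2];
};

static struct gcRootList *roots;	/* head of all registered lists */

/* Same prepend as __go_register_gc_roots above. */
static void register_gc_roots(struct gcRootList *r)
{
	r->next = roots;
	roots = r;
}

static void *g_table[16];
static char *g_name;

/* One list per object file, describing its pointer-holding globals. */
static struct gcRootList this_unit = {
	NULL, 2,
	{ { g_table, sizeof g_table }, { &g_name, sizeof g_name } }
};

int main(void)
{
	register_gc_roots(&this_unit);
	for(struct gcRootList *pl = roots; pl != NULL; pl = pl->next)
		for(int i = 0; i < pl->count; i++)
			printf("root %p, %zu pointer bytes\n",
			       pl->roots[i].decl, pl->roots[i].ptrdata);
	return 0;
}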
-
-// Append obj to the work buffer.
-// _wbuf, _wp, and _nobj are input/output parameters specifying the work buffer.
-static void
-enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj)
-{
-	uintptr nobj, off;
-	Obj *wp;
-	Workbuf *wbuf;
-
-	if(Debug > 1)
-		runtime_printf("append obj(%p %D %p)\n", obj.p, (int64)obj.n, obj.ti);
-
-	// Align obj.p to a word boundary.
-	off = (uintptr)obj.p & (PtrSize-1);
-	if(off != 0) {
-		obj.p += PtrSize - off;
-		obj.n -= PtrSize - off;
-		obj.ti = 0;
-	}
-
-	if(obj.p == nil || obj.n == 0)
-		return;
-
-	// Load work buffer state
-	wp = *_wp;
-	wbuf = *_wbuf;
-	nobj = *_nobj;
-
-	// If another proc wants a pointer, give it some.
-	if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
-		wbuf->nobj = nobj;
-		wbuf = handoff(wbuf);
-		nobj = wbuf->nobj;
-		wp = wbuf->obj + nobj;
-	}
-
-	// If buffer is full, get a new one.
-	if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
-		if(wbuf != nil)
-			wbuf->nobj = nobj;
-		wbuf = getempty(wbuf);
-		wp = wbuf->obj;
-		nobj = 0;
-	}
-
-	*wp = obj;
-	wp++;
-	nobj++;
-
-	// Save work buffer state
-	*_wp = wp;
-	*_wbuf = wbuf;
-	*_nobj = nobj;
-}
-
-static void
-enqueue1(Workbuf **wbufp, Obj obj)
-{
-	Workbuf *wbuf;
-
-	wbuf = *wbufp;
-	if(wbuf->nobj >= nelem(wbuf->obj))
-		*wbufp = wbuf = getempty(wbuf);
-	wbuf->obj[wbuf->nobj++] = obj;
-}
-
-static void
-markroot(ParFor *desc, uint32 i)
-{
-	Workbuf *wbuf;
-	FinBlock *fb;
-	MHeap *h;
-	MSpan **allspans, *s;
-	uint32 spanidx, sg;
-	G *gp;
-	void *p;
-
-	USED(&desc);
-	wbuf = getempty(nil);
-	// Note: if you add a case here, please also update heapdump.c:dumproots.
-	switch(i) {
-	case RootData:
-		// For gccgo this is both data and bss.
-		{
-			struct gcRootList *pl;
-
-			for(pl = roots; pl != nil; pl = pl->next) {
-				struct gcRoot *pr = &pl->roots[0];
-				intgo count = pl->count;
-				intgo i;
-
-				for (i = 0; i < count; i++) {
-					void *decl = pr->decl;
-					enqueue1(&wbuf, (Obj){decl, pr->ptrdata, 0});
-					pr++;
-				}
-			}
-		}
-		break;
-
-	case RootBss:
-		// For gccgo we use this for all the other global roots.
-		enqueue1(&wbuf, (Obj){(byte*)runtime_m0(), sizeof(M), 0});
-		enqueue1(&wbuf, (Obj){(byte*)runtime_g0(), sizeof(G), 0});
-		enqueue1(&wbuf, (Obj){(byte*)runtime_getAllP(), _MaxGomaxprocs * sizeof(P*), 0});
-		enqueue1(&wbuf, (Obj){(byte*)&work, sizeof work, 0});
-		break;
-
-	case RootFinalizers:
-		for(fb=runtime_getallfin(); fb; fb=fb->alllink)
-			enqueue1(&wbuf, (Obj){(byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]), 0});
-		break;
-
-	case RootSpanTypes:
-		// mark span types and MSpan.specials (to walk spans only once)
-		h = &runtime_mheap;
-		sg = h->sweepgen;
-		allspans = h->allspans;
-		for(spanidx=0; spanidx<runtime_mheap.nspan; spanidx++) {
-			Special *sp;
-			SpecialFinalizer *spf;
-
-			s = allspans[spanidx];
-			if(s->sweepgen != sg) {
-				runtime_printf("sweep %d %d\n", s->sweepgen, sg);
-				runtime_throw("gc: unswept span");
-			}
-			if(s->state != MSpanInUse)
-				continue;
-			// The garbage collector ignores type pointers stored in MSpan.types:
-			//  - Compiler-generated types are stored outside of heap.
-			//  - The reflect package has runtime-generated types cached in its data structures.
-			//    The garbage collector relies on finding the references via that cache.
-			if(s->types.compression == MTypes_Words || s->types.compression == MTypes_Bytes)
-				markonly((byte*)s->types.data);
-			for(sp = s->specials; sp != nil; sp = sp->next) {
-				if(sp->kind != KindSpecialFinalizer)
-					continue;
-				// don't mark finalized object, but scan it so we
-				// retain everything it points to.
-				spf = (SpecialFinalizer*)sp;
-				// A finalizer can be set for an inner byte of an object, find object beginning.
-				p = (void*)((s->start << PageShift) + spf->offset/s->elemsize*s->elemsize);
-				enqueue1(&wbuf, (Obj){p, s->elemsize, 0});
-				enqueue1(&wbuf, (Obj){(void*)&spf->fn, PtrSize, 0});
-				enqueue1(&wbuf, (Obj){(void*)&spf->ft, PtrSize, 0});
-				enqueue1(&wbuf, (Obj){(void*)&spf->ot, PtrSize, 0});
-			}
-		}
-		break;
-
-	case RootFlushCaches:
-		flushallmcaches();
-		break;
-
-	default:
-		// the rest is scanning goroutine stacks
-		if(i - RootCount >= runtime_getallglen())
-			runtime_throw("markroot: bad index");
-		gp = runtime_getallg(i - RootCount);
-		// Remember when we first observed the G blocked;
-		// needed only for traceback output.
-		if((gp->atomicstatus == _Gwaiting || gp->atomicstatus == _Gsyscall) && gp->waitsince == 0)
-			gp->waitsince = work.tstart;
-		addstackroots(gp, &wbuf);
-		break;
-		
-	}
-
-	if(wbuf)
-		scanblock(wbuf, false);
-}
-
-static const FuncVal markroot_funcval = { (void *) markroot };
-
-// Get an empty work buffer off the work.empty list,
-// allocating new buffers as needed.
-static Workbuf*
-getempty(Workbuf *b)
-{
-	if(b != nil)
-		runtime_lfstackpush(&work.full, &b->node);
-	b = (Workbuf*)runtime_lfstackpop(&work.wempty);
-	if(b == nil) {
-		// Need to allocate.
-		runtime_lock(&work);
-		if(work.nchunk < sizeof *b) {
-			work.nchunk = 1<<20;
-			work.chunk = runtime_SysAlloc(work.nchunk, &mstats()->gc_sys);
-			if(work.chunk == nil)
-				runtime_throw("runtime: cannot allocate memory");
-		}
-		b = (Workbuf*)work.chunk;
-		work.chunk += sizeof *b;
-		work.nchunk -= sizeof *b;
-		runtime_unlock(&work);
-	}
-	b->nobj = 0;
-	return b;
-}
-
-static void
-putempty(Workbuf *b)
-{
-	if(CollectStats)
-		runtime_xadd64(&gcstats.putempty, 1);
-
-	runtime_lfstackpush(&work.wempty, &b->node);
-}
-
-// Get a full work buffer off the work.full list, or return nil.
-static Workbuf*
-getfull(Workbuf *b)
-{
-	M *m;
-	int32 i;
-
-	if(CollectStats)
-		runtime_xadd64(&gcstats.getfull, 1);
-
-	if(b != nil)
-		runtime_lfstackpush(&work.wempty, &b->node);
-	b = (Workbuf*)runtime_lfstackpop(&work.full);
-	if(b != nil || work.nproc == 1)
-		return b;
-
-	m = runtime_m();
-	runtime_xadd(&work.nwait, +1);
-	for(i=0;; i++) {
-		if(work.full != 0) {
-			runtime_xadd(&work.nwait, -1);
-			b = (Workbuf*)runtime_lfstackpop(&work.full);
-			if(b != nil)
-				return b;
-			runtime_xadd(&work.nwait, +1);
-		}
-		if(work.nwait == work.nproc)
-			return nil;
-		if(i < 10) {
-			m->gcstats.nprocyield++;
-			runtime_procyield(20);
-		} else if(i < 20) {
-			m->gcstats.nosyield++;
-			runtime_osyield();
-		} else {
-			m->gcstats.nsleep++;
-			runtime_usleep(100);
-		}
-	}
-}
-
-static Workbuf*
-handoff(Workbuf *b)
-{
-	M *m;
-	int32 n;
-	Workbuf *b1;
-
-	m = runtime_m();
-
-	// Make new buffer with half of b's pointers.
-	b1 = getempty(nil);
-	n = b->nobj/2;
-	b->nobj -= n;
-	b1->nobj = n;
-	runtime_memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]);
-	m->gcstats.nhandoff++;
-	m->gcstats.nhandoffcnt += n;
-
-	// Put b on full list - let first half of b get stolen.
-	runtime_lfstackpush(&work.full, &b->node);
-	return b1;
-}
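A rough, self-contained model of this handoff: half of the queued objects move to a fresh buffer that the current worker keeps scanning, while the old buffer is published for an idle worker to steal. The fixed-size toy buffer and the publish step stand in for Workbuf and the work.full lock-free stack:

#include <stdio.h>
#include <string.h>

#define NOBJ 8

typedef struct { void *p; } Obj;
typedef struct { Obj obj[NOBJ]; int nobj; } Workbuf;

static Workbuf *published;	/* stand-in for pushing onto work.full */

static void publish(Workbuf *b)
{
	published = b;
}

/* Move the second half of b's objects into b1, publish b for an idle
   worker to steal, and continue working from b1 -- the same split as
   handoff() above. */
static Workbuf* split_for_handoff(Workbuf *b, Workbuf *b1)
{
	int n = b->nobj/2;

	b->nobj -= n;
	b1->nobj = n;
	memcpy(b1->obj, b->obj + b->nobj, n*sizeof b1->obj[0]);
	publish(b);
	return b1;
}

int main(void)
{
	static Workbuf full, fresh;
	static int objs[NOBJ];
	int i;

	for(i = 0; i < NOBJ; i++)
		full.obj[full.nobj++].p = &objs[i];

	Workbuf *mine = split_for_handoff(&full, &fresh);
	printf("kept %d objects, published %d for stealing\n",
	       mine->nobj, published->nobj);
	return 0;
}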
-
-static void
-addstackroots(G *gp, Workbuf **wbufp)
-{
-	switch(gp->atomicstatus){
-	default:
-		runtime_printf("unexpected G.status %d (goroutine %p %D)\n", gp->atomicstatus, gp, gp->goid);
-		runtime_throw("mark - bad status");
-	case _Gdead:
-		return;
-	case _Grunning:
-		runtime_throw("mark - world not stopped");
-	case _Grunnable:
-	case _Gsyscall:
-	case _Gwaiting:
-		break;
-	}
-
-	// Explicitly scan the saved contexts.
-	// We have to pass defaultProg to prevent scanblock from looking
-	// up the pointer to get the type.
-	enqueue1(wbufp, (Obj){(byte*)(&gp->gcregs[0]), sizeof(gp->gcregs), (uintptr)(&defaultProg[0])});
-	enqueue1(wbufp, (Obj){(byte*)(&gp->context[0]), sizeof(gp->context), (uintptr)(&defaultProg[0])});
-
-#ifdef USING_SPLIT_STACK
-	M *mp;
-	void* sp;
-	size_t spsize;
-	void* next_segment;
-	void* next_sp;
-	void* initial_sp;
-
-	if(gp == runtime_g()) {
-		// Scanning our own stack.
-		sp = __splitstack_find(nil, nil, &spsize, &next_segment,
-				       &next_sp, &initial_sp);
-	} else if((mp = gp->m) != nil && mp->helpgc) {
-		// gchelper's stack is in active use and has no interesting pointers.
-		return;
-	} else {
-		// Scanning another goroutine's stack.
-		// The goroutine is usually asleep (the world is stopped).
-
-		// The exception is that if the goroutine is about to enter or might
-		// have just exited a system call, it may be executing code such
-		// as schedlock and may have needed to start a new stack segment.
-		// Use the stack segment and stack pointer at the time of
-		// the system call instead, since that won't change underfoot.
-		if(gp->gcstack != nil) {
-			sp = gp->gcstack;
-			spsize = gp->gcstacksize;
-			next_segment = gp->gcnextsegment;
-			next_sp = gp->gcnextsp;
-			initial_sp = gp->gcinitialsp;
-		} else {
-			sp = __splitstack_find_context((void*)(&gp->stackcontext[0]),
-						       &spsize, &next_segment,
-						       &next_sp, &initial_sp);
-		}
-	}
-	if(sp != nil) {
-		enqueue1(wbufp, (Obj){sp, spsize, 0});
-		while((sp = __splitstack_find(next_segment, next_sp,
-					      &spsize, &next_segment,
-					      &next_sp, &initial_sp)) != nil)
-			enqueue1(wbufp, (Obj){sp, spsize, 0});
-	}
-#else
-	M *mp;
-	byte* bottom;
-	byte* top;
-
-	if(gp == runtime_g()) {
-		// Scanning our own stack.
-		bottom = (byte*)&gp;
-	} else if((mp = gp->m) != nil && mp->helpgc) {
-		// gchelper's stack is in active use and has no interesting pointers.
-		return;
-	} else {
-		// Scanning another goroutine's stack.
-		// The goroutine is usually asleep (the world is stopped).
-		bottom = (byte*)gp->gcnextsp;
-		if(bottom == nil)
-			return;
-	}
-	top = (byte*)gp->gcinitialsp + gp->gcstacksize;
-	if(top > bottom)
-		enqueue1(wbufp, (Obj){bottom, top - bottom, 0});
-	else
-		enqueue1(wbufp, (Obj){top, bottom - top, 0});
-#endif
-}
-
-void
-runtime_MSpan_EnsureSwept(MSpan *s)
-{
-	M *m = runtime_m();
-	G *g = runtime_g();
-	uint32 sg;
-
-	// Caller must disable preemption.
-	// Otherwise when this function returns the span can become unswept again
-	// (if GC is triggered on another goroutine).
-	if(m->locks == 0 && m->mallocing == 0 && g != m->g0)
-		runtime_throw("MSpan_EnsureSwept: m is not locked");
-
-	sg = runtime_mheap.sweepgen;
-	if(runtime_atomicload(&s->sweepgen) == sg)
-		return;
-	if(runtime_cas(&s->sweepgen, sg-2, sg-1)) {
-		runtime_MSpan_Sweep(s);
-		return;
-	}
-	// Unfortunate case: we have no efficient means to wait, so spin while yielding.
-	while(runtime_atomicload(&s->sweepgen) != sg)
-		runtime_osyield();  
-}
-
-// Sweep frees or collects finalizers for blocks not marked in the mark phase.
-// It clears the mark bits in preparation for the next GC round.
-// Returns true if the span was returned to heap.
-bool
-runtime_MSpan_Sweep(MSpan *s)
-{
-	M *m;
-	int32 cl, n, npages, nfree;
-	uintptr size, off, *bitp, shift, bits;
-	uint32 sweepgen;
-	byte *p;
-	MCache *c;
-	byte *arena_start;
-	MLink head, *end;
-	byte *type_data;
-	byte compression;
-	uintptr type_data_inc;
-	MLink *x;
-	Special *special, **specialp, *y;
-	bool res, sweepgenset;
-
-	m = runtime_m();
-
-	// It's critical that we enter this function with preemption disabled;
-	// GC must not start while we are in the middle of this function.
-	if(m->locks == 0 && m->mallocing == 0 && runtime_g() != m->g0)
-		runtime_throw("MSpan_Sweep: m is not locked");
-	sweepgen = runtime_mheap.sweepgen;
-	if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) {
-		runtime_printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n",
-			s->state, s->sweepgen, sweepgen);
-		runtime_throw("MSpan_Sweep: bad span state");
-	}
-	arena_start = runtime_mheap.arena_start;
-	cl = s->sizeclass;
-	size = s->elemsize;
-	if(cl == 0) {
-		n = 1;
-	} else {
-		// Chunk full of small blocks.
-		npages = runtime_class_to_allocnpages[cl];
-		n = (npages << PageShift) / size;
-	}
-	res = false;
-	nfree = 0;
-	end = &head;
-	c = m->mcache;
-	sweepgenset = false;
-
-	// mark any free objects in this span so we don't collect them
-	for(x = s->freelist; x != nil; x = x->next) {
-		// This is markonly(x) but faster because we don't need
-		// atomic access and we're guaranteed to be pointing at
-		// the head of a valid object.
-		off = (uintptr*)x - (uintptr*)runtime_mheap.arena_start;
-		bitp = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
-		shift = off % wordsPerBitmapWord;
-		*bitp |= bitMarked<<shift;
-	}
-
-	// Unlink & free special records for any objects we're about to free.
-	specialp = &s->specials;
-	special = *specialp;
-	while(special != nil) {
-		// A finalizer can be set for an inner byte of an object, find object beginning.
-		p = (byte*)(s->start << PageShift) + special->offset/size*size;
-		off = (uintptr*)p - (uintptr*)arena_start;
-		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = off % wordsPerBitmapWord;
-		bits = *bitp>>shift;
-		if((bits & (bitAllocated|bitMarked)) == bitAllocated) {
-			// Find the exact byte for which the special was setup
-			// (as opposed to object beginning).
-			p = (byte*)(s->start << PageShift) + special->offset;
-			// about to free object: splice out special record
-			y = special;
-			special = special->next;
-			*specialp = special;
-			if(!runtime_freespecial(y, p, size, false)) {
-				// stop freeing of object if it has a finalizer
-				*bitp |= bitMarked << shift;
-			}
-		} else {
-			// object is still live: keep special record
-			specialp = &special->next;
-			special = *specialp;
-		}
-	}
-
-	type_data = (byte*)s->types.data;
-	type_data_inc = sizeof(uintptr);
-	compression = s->types.compression;
-	switch(compression) {
-	case MTypes_Bytes:
-		type_data += 8*sizeof(uintptr);
-		type_data_inc = 1;
-		break;
-	}
-
-	// Sweep through n objects of given size starting at p.
-	// This thread owns the span now, so it can manipulate
-	// the block bitmap without atomic operations.
-	p = (byte*)(s->start << PageShift);
-	for(; n > 0; n--, p += size, type_data+=type_data_inc) {
-		off = (uintptr*)p - (uintptr*)arena_start;
-		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = off % wordsPerBitmapWord;
-		bits = *bitp>>shift;
-
-		if((bits & bitAllocated) == 0)
-			continue;
-
-		if((bits & bitMarked) != 0) {
-			*bitp &= ~(bitMarked<<shift);
-			continue;
-		}
-
-		if(runtime_debug.allocfreetrace)
-			runtime_tracefree(p, size);
-
-		// Clear mark and scan bits.
-		*bitp &= ~((bitScan|bitMarked)<<shift);
-
-		if(cl == 0) {
-			// Free large span.
-			runtime_unmarkspan(p, 1<<PageShift);
-			s->needzero = 1;
-			// important to set sweepgen before returning it to heap
-			runtime_atomicstore(&s->sweepgen, sweepgen);
-			sweepgenset = true;
-			// See note about SysFault vs SysFree in malloc.goc.
-			if(runtime_debug.efence)
-				runtime_SysFault(p, size);
-			else
-				runtime_MHeap_Free(&runtime_mheap, s, 1);
-			c->local_nlargefree++;
-			c->local_largefree += size;
-			runtime_xadd64(&mstats()->next_gc, -(uint64)(size * (gcpercent + 100)/100));
-			res = true;
-		} else {
-			// Free small object.
-			switch(compression) {
-			case MTypes_Words:
-				*(uintptr*)type_data = 0;
-				break;
-			case MTypes_Bytes:
-				*(byte*)type_data = 0;
-				break;
-			}
-			if(size > 2*sizeof(uintptr))
-				((uintptr*)p)[1] = (uintptr)0xdeaddeaddeaddeadll;	// mark as "needs to be zeroed"
-			else if(size > sizeof(uintptr))
-				((uintptr*)p)[1] = 0;
-
-			end->next = (MLink*)p;
-			end = (MLink*)p;
-			nfree++;
-		}
-	}
-
-	// We need to set s->sweepgen = h->sweepgen only when all blocks are swept,
-	// because of the potential for a concurrent free/SetFinalizer.
-	// But we need to set it before we make the span available for allocation
-	// (return it to heap or mcentral), because allocation code assumes that a
-	// span is already swept if available for allocation.
-
-	if(!sweepgenset && nfree == 0) {
-		// The span must be in our exclusive ownership until we update sweepgen,
-		// check for potential races.
-		if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) {
-			runtime_printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n",
-				s->state, s->sweepgen, sweepgen);
-			runtime_throw("MSpan_Sweep: bad span state after sweep");
-		}
-		runtime_atomicstore(&s->sweepgen, sweepgen);
-	}
-	if(nfree > 0) {
-		c->local_nsmallfree[cl] += nfree;
-		c->local_cachealloc -= nfree * size;
-		runtime_xadd64(&mstats()->next_gc, -(uint64)(nfree * size * (gcpercent + 100)/100));
-		res = runtime_MCentral_FreeSpan(&runtime_mheap.central[cl], s, nfree, head.next, end);
-		//MCentral_FreeSpan updates sweepgen
-	}
-	return res;
-}
-
-// State of background sweep.
-// Protected by gclock.
-static struct
-{
-	G*	g;
-	bool	parked;
-
-	MSpan**	spans;
-	uint32	nspan;
-	uint32	spanidx;
-} sweep;
-
-// background sweeping goroutine
-static void
-bgsweep(void* dummy __attribute__ ((unused)))
-{
-	runtime_g()->issystem = 1;
-	for(;;) {
-		while(runtime_sweepone() != (uintptr)-1) {
-			gcstats.nbgsweep++;
-			runtime_gosched();
-		}
-		runtime_lock(&gclock);
-		if(!runtime_mheap.sweepdone) {
-			// This can happen if a GC ran between sweepone
-			// returning -1 and our acquiring gclock.
-			runtime_unlock(&gclock);
-			continue;
-		}
-		sweep.parked = true;
-		runtime_g()->isbackground = true;
-		runtime_goparkunlock(&gclock, runtime_gostringnocopy((const byte*)"GC sweep wait"), traceEvGoBlock, 1);
-		runtime_g()->isbackground = false;
-	}
-}
-
-// sweeps one span
-// returns number of pages returned to heap, or -1 if there is nothing to sweep
-uintptr
-runtime_sweepone(void)
-{
-	M *m = runtime_m();
-	MSpan *s;
-	uint32 idx, sg;
-	uintptr npages;
-
-	// Increment locks to ensure that the goroutine is not preempted
-	// in the middle of the sweep, which would leave the span in an inconsistent state for the next GC.
-	m->locks++;
-	sg = runtime_mheap.sweepgen;
-	for(;;) {
-		idx = runtime_xadd(&sweep.spanidx, 1) - 1;
-		if(idx >= sweep.nspan) {
-			runtime_mheap.sweepdone = true;
-			m->locks--;
-			return (uintptr)-1;
-		}
-		s = sweep.spans[idx];
-		if(s->state != MSpanInUse) {
-			s->sweepgen = sg;
-			continue;
-		}
-		if(s->sweepgen != sg-2 || !runtime_cas(&s->sweepgen, sg-2, sg-1))
-			continue;
-		if(s->incache)
-			runtime_throw("sweep of incache span");
-		npages = s->npages;
-		if(!runtime_MSpan_Sweep(s))
-			npages = 0;
-		m->locks--;
-		return npages;
-	}
-}
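The sweepgen handshake used here (and in runtime_MSpan_EnsureSwept above) boils down to: the heap generation advances by 2 each GC; a span at sg-2 still needs sweeping, sg-1 means someone is sweeping it, and sg means swept; a worker claims a span with a single compare-and-swap. A toy single-span model of that claim, not the runtime's data structures:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct span {
	_Atomic unsigned sweepgen;
};

/* Try to claim the span for sweeping: sg-2 -> sg-1 via CAS, then mark
   it swept by storing sg.  Returns false if another worker got there
   first (or the span was already swept). */
static bool claim_and_sweep(struct span *s, unsigned sg)
{
	unsigned want = sg - 2;

	if(!atomic_compare_exchange_strong(&s->sweepgen, &want, sg - 1))
		return false;
	/* ... the actual sweep would happen here ... */
	atomic_store(&s->sweepgen, sg);
	return true;
}

int main(void)
{
	unsigned heap_sweepgen = 4;	/* after two GC cycles */
	struct span s;

	atomic_init(&s.sweepgen, heap_sweepgen - 2);	/* "needs sweeping" */
	printf("first claim:  %s\n", claim_and_sweep(&s, heap_sweepgen) ? "swept" : "skipped");
	printf("second claim: %s\n", claim_and_sweep(&s, heap_sweepgen) ? "swept" : "skipped");
	return 0;
}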
-
-static void
-dumpspan(uint32 idx)
-{
-	int32 sizeclass, n, npages, i, column;
-	uintptr size;
-	byte *p;
-	byte *arena_start;
-	MSpan *s;
-	bool allocated;
-
-	s = runtime_mheap.allspans[idx];
-	if(s->state != MSpanInUse)
-		return;
-	arena_start = runtime_mheap.arena_start;
-	p = (byte*)(s->start << PageShift);
-	sizeclass = s->sizeclass;
-	size = s->elemsize;
-	if(sizeclass == 0) {
-		n = 1;
-	} else {
-		npages = runtime_class_to_allocnpages[sizeclass];
-		n = (npages << PageShift) / size;
-	}
-	
-	runtime_printf("%p .. %p:\n", p, p+n*size);
-	column = 0;
-	for(; n>0; n--, p+=size) {
-		uintptr off, *bitp, shift, bits;
-
-		off = (uintptr*)p - (uintptr*)arena_start;
-		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-		shift = off % wordsPerBitmapWord;
-		bits = *bitp>>shift;
-
-		allocated = ((bits & bitAllocated) != 0);
-
-		for(i=0; (uint32)i<size; i+=sizeof(void*)) {
-			if(column == 0) {
-				runtime_printf("\t");
-			}
-			if(i == 0) {
-				runtime_printf(allocated ? "(" : "[");
-				runtime_printf("%p: ", p+i);
-			} else {
-				runtime_printf(" ");
-			}
-
-			runtime_printf("%p", *(void**)(p+i));
-
-			if(i+sizeof(void*) >= size) {
-				runtime_printf(allocated ? ") " : "] ");
-			}
-
-			column++;
-			if(column == 8) {
-				runtime_printf("\n");
-				column = 0;
-			}
-		}
-	}
-	runtime_printf("\n");
-}
-
-// A debugging function to dump the contents of memory
-void
-runtime_memorydump(void)
-{
-	uint32 spanidx;
-
-	for(spanidx=0; spanidx<runtime_mheap.nspan; spanidx++) {
-		dumpspan(spanidx);
-	}
-}
-
-void
-runtime_gchelper(void)
-{
-	uint32 nproc;
-
-	runtime_m()->traceback = 2;
-	gchelperstart();
-
-	// parallel mark over GC roots
-	runtime_parfordo(work.markfor);
-
-	// help other threads scan secondary blocks
-	scanblock(nil, true);
-
-	bufferList[runtime_m()->helpgc].busy = 0;
-	nproc = work.nproc;  // work.nproc can change right after we increment work.ndone
-	if(runtime_xadd(&work.ndone, +1) == nproc-1)
-		runtime_notewakeup(&work.alldone);
-	runtime_m()->traceback = 0;
-}
-
-static void
-cachestats(void)
-{
-	MCache *c;
-	P *p, **pp;
-
-	for(pp=runtime_getAllP(); (p=*pp) != nil; pp++) {
-		c = p->mcache;
-		if(c==nil)
-			continue;
-		runtime_purgecachedstats(c);
-	}
-}
-
-static void
-flushallmcaches(void)
-{
-	P *p, **pp;
-	MCache *c;
-
-	// Flush MCache's to MCentral.
-	for(pp=runtime_getAllP(); (p=*pp) != nil; pp++) {
-		c = p->mcache;
-		if(c==nil)
-			continue;
-		runtime_MCache_ReleaseAll(c);
-	}
-}
-
-void
-runtime_updatememstats(GCStats *stats)
-{
-	M *mp;
-	MSpan *s;
-	uint32 i;
-	uint64 stacks_inuse, smallfree;
-	uint64 *src, *dst;
-	MStats *pmstats;
-
-	if(stats)
-		runtime_memclr((byte*)stats, sizeof(*stats));
-	stacks_inuse = 0;
-	for(mp=runtime_getallm(); mp; mp=mp->alllink) {
-		//stacks_inuse += mp->stackinuse*FixedStack;
-		if(stats) {
-			src = (uint64*)&mp->gcstats;
-			dst = (uint64*)stats;
-			for(i=0; i<sizeof(*stats)/sizeof(uint64); i++)
-				dst[i] += src[i];
-			runtime_memclr((byte*)&mp->gcstats, sizeof(mp->gcstats));
-		}
-	}
-	pmstats = mstats();
-	pmstats->stacks_inuse = stacks_inuse;
-	pmstats->mcache_inuse = runtime_mheap.cachealloc.inuse;
-	pmstats->mspan_inuse = runtime_mheap.spanalloc.inuse;
-	pmstats->sys = pmstats->heap_sys + pmstats->stacks_sys + pmstats->mspan_sys +
-		pmstats->mcache_sys + pmstats->buckhash_sys + pmstats->gc_sys + pmstats->other_sys;
-	
-	// Calculate memory allocator stats.
-	// During program execution we only count number of frees and amount of freed memory.
-	// The current number of alive objects in the heap and the amount of alive heap memory
-	// are calculated by scanning all spans.
-	// Total number of mallocs is calculated as number of frees plus number of alive objects.
-	// Similarly, total amount of allocated memory is calculated as amount of freed memory
-	// plus amount of alive heap memory.
-	pmstats->alloc = 0;
-	pmstats->total_alloc = 0;
-	pmstats->nmalloc = 0;
-	pmstats->nfree = 0;
-	for(i = 0; i < nelem(pmstats->by_size); i++) {
-		pmstats->by_size[i].nmalloc = 0;
-		pmstats->by_size[i].nfree = 0;
-	}
-
-	// Flush MCache's to MCentral.
-	flushallmcaches();
-
-	// Aggregate local stats.
-	cachestats();
-
-	// Scan all spans and count number of alive objects.
-	for(i = 0; i < runtime_mheap.nspan; i++) {
-		s = runtime_mheap.allspans[i];
-		if(s->state != MSpanInUse)
-			continue;
-		if(s->sizeclass == 0) {
-			pmstats->nmalloc++;
-			pmstats->alloc += s->elemsize;
-		} else {
-			pmstats->nmalloc += s->ref;
-			pmstats->by_size[s->sizeclass].nmalloc += s->ref;
-			pmstats->alloc += s->ref*s->elemsize;
-		}
-	}
-
-	// Aggregate by size class.
-	smallfree = 0;
-	pmstats->nfree = runtime_mheap.nlargefree;
-	for(i = 0; i < nelem(pmstats->by_size); i++) {
-		pmstats->nfree += runtime_mheap.nsmallfree[i];
-		pmstats->by_size[i].nfree = runtime_mheap.nsmallfree[i];
-		pmstats->by_size[i].nmalloc += runtime_mheap.nsmallfree[i];
-		smallfree += runtime_mheap.nsmallfree[i] * runtime_class_to_size[i];
-	}
-	pmstats->nmalloc += pmstats->nfree;
-
-	// Calculate derived stats.
-	pmstats->total_alloc = pmstats->alloc + runtime_mheap.largefree + smallfree;
-	pmstats->heap_alloc = pmstats->alloc;
-	pmstats->heap_objects = pmstats->nmalloc - pmstats->nfree;
-}
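A tiny numeric check of the accounting identities described in the comments above (mallocs recovered as frees plus live objects, total allocation as freed bytes plus live bytes); all numbers are invented:

#include <stdio.h>

int main(void)
{
	/* Tracked continuously during execution: */
	unsigned long long nfree = 900, freed_bytes = 90000;
	/* Found by scanning all spans at GC time: */
	unsigned long long alive_objs = 100, alive_bytes = 16000;

	/* The identities used above. */
	unsigned long long nmalloc = nfree + alive_objs;
	unsigned long long total_alloc = freed_bytes + alive_bytes;

	printf("nmalloc=%llu total_alloc=%llu heap_objects=%llu\n",
	       nmalloc, total_alloc, nmalloc - nfree);
	return 0;
}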
-
-// Structure of arguments passed to function gc().
-// This allows the arguments to be passed via runtime_mcall.
-struct gc_args
-{
-	int64 start_time; // start time of GC in ns (just before stoptheworld)
-	bool  eagersweep;
-};
-
-static void gc(struct gc_args *args);
-static void mgc(G *gp);
-static FuncVal mgcGo = { (void(*)(void))mgc };
-
-static int32
-readgogc(void)
-{
-	String s;
-	const byte *p;
-
-	s = runtime_getenv("GOGC");
-	if(s.len == 0)
-		return 100;
-	p = s.str;
-	if(s.len == 3 && runtime_strcmp((const char *)p, "off") == 0)
-		return -1;
-	return runtime_atoi(p, s.len);
-}
-
-// force = 1 - do GC regardless of current heap usage
-// force = 2 - do GC and eager sweep
-void
-runtime_gc(int32 force)
-{
-	M *m;
-	G *g;
-	struct gc_args a;
-	int32 i;
-	MStats *pmstats;
-
-	// The atomic operations are not atomic if the uint64s
-	// are not aligned on uint64 boundaries. This has been
-	// a problem in the past.
-	if((((uintptr)&work.wempty) & 7) != 0)
-		runtime_throw("runtime: gc work buffer is misaligned");
-	if((((uintptr)&work.full) & 7) != 0)
-		runtime_throw("runtime: gc work buffer is misaligned");
-
-	// Make sure all registers are saved on stack so that
-	// scanstack sees them.
-	__builtin_unwind_init();
-
-	// The gc is turned off (via enablegc) until
-	// the bootstrap has completed.
-	// Also, malloc gets called in the guts
-	// of a number of libraries that might be
-	// holding locks.  To avoid priority inversion
-	// problems, don't bother trying to run gc
-	// while holding a lock.  The next mallocgc
-	// without a lock will do the gc instead.
-	m = runtime_m();
-	pmstats = mstats();
-	if(!pmstats->enablegc || runtime_g() == m->g0 || m->locks > 0 || runtime_panicking() || m->preemptoff.len > 0)
-		return;
-
-	if(gcpercent == GcpercentUnknown) {	// first time through
-		runtime_lock(&runtime_mheap);
-		if(gcpercent == GcpercentUnknown)
-			gcpercent = readgogc();
-		runtime_unlock(&runtime_mheap);
-	}
-	if(gcpercent < 0)
-		return;
-
-	runtime_acquireWorldsema();
-	if(force==0 && pmstats->heap_alloc < pmstats->next_gc) {
-		// typically threads which lost the race to grab
-		// worldsema exit here when gc is done.
-		runtime_releaseWorldsema();
-		return;
-	}
-
-	// Ok, we're doing it!  Stop everybody else
-	a.start_time = runtime_nanotime();
-	a.eagersweep = force >= 2;
-	m->gcing = 1;
-	runtime_callStopTheWorldWithSema();
-	
-	clearpools();
-
-	// Run gc on the g0 stack.  We do this so that the g stack
-	// we're currently running on will no longer change.  Cuts
-	// the root set down a bit (g0 stacks are not scanned, and
-	// we don't need to scan gc's internal state).  Also an
-	// enabler for copyable stacks.
-	for(i = 0; i < (runtime_debug.gctrace > 1 ? 2 : 1); i++) {
-		if(i > 0)
-			a.start_time = runtime_nanotime();
-		// switch to g0, call gc(&a), then switch back
-		g = runtime_g();
-		g->param = &a;
-		g->atomicstatus = _Gwaiting;
-		g->waitreason = runtime_gostringnocopy((const byte*)"garbage collection");
-		runtime_mcall(&mgcGo);
-		m = runtime_m();
-	}
-
-	// all done
-	m->gcing = 0;
-	m->locks++;
-	runtime_releaseWorldsema();
-	runtime_callStartTheWorldWithSema();
-	m->locks--;
-
-	// now that gc is done, kick off finalizer thread if needed
-	if(!ConcurrentSweep) {
-		// give the queued finalizers, if any, a chance to run
-		runtime_gosched();
-	} else {
-		// For gccgo, let other goroutines run.
-		runtime_gosched();
-	}
-}
-
-static void
-mgc(G *gp)
-{
-	gc(gp->param);
-	gp->param = nil;
-	gp->atomicstatus = _Grunning;
-	runtime_gogo(gp);
-}
-
-static void
-gc(struct gc_args *args)
-{
-	M *m;
-	int64 tm0, tm1, tm2, tm3, tm4;
-	uint64 heap0, heap1, obj, ninstr;
-	GCStats stats;
-	uint32 i;
-	MStats *pmstats;
-	// Eface eface;
-
-	m = runtime_m();
-
-	if(runtime_debug.allocfreetrace)
-		runtime_tracegc();
-
-	m->traceback = 2;
-	tm0 = args->start_time;
-	work.tstart = args->start_time; 
-
-	if(CollectStats)
-		runtime_memclr((byte*)&gcstats, sizeof(gcstats));
-
-	m->locks++;	// disable gc during mallocs in parforalloc
-	if(work.markfor == nil)
-		work.markfor = runtime_parforalloc(_MaxGcproc);
-	m->locks--;
-
-	tm1 = 0;
-	if(runtime_debug.gctrace)
-		tm1 = runtime_nanotime();
-
-	// Sweep whatever was not swept by bgsweep.
-	while(runtime_sweepone() != (uintptr)-1)
-		gcstats.npausesweep++;
-
-	work.nwait = 0;
-	work.ndone = 0;
-	work.nproc = runtime_gcprocs();
-	runtime_parforsetup(work.markfor, work.nproc, RootCount + runtime_getallglen(), false, &markroot_funcval);
-	if(work.nproc > 1) {
-		runtime_noteclear(&work.alldone);
-		runtime_helpgc(work.nproc);
-	}
-
-	tm2 = 0;
-	if(runtime_debug.gctrace)
-		tm2 = runtime_nanotime();
-
-	gchelperstart();
-	runtime_parfordo(work.markfor);
-	scanblock(nil, true);
-
-	tm3 = 0;
-	if(runtime_debug.gctrace)
-		tm3 = runtime_nanotime();
-
-	bufferList[m->helpgc].busy = 0;
-	if(work.nproc > 1)
-		runtime_notesleep(&work.alldone);
-
-	cachestats();
-	// The next_gc calculation is tricky with concurrent sweep since we don't know the size of the live heap;
-	// estimate what the live heap size was after the previous GC (for tracing only).
-	pmstats = mstats();
-	heap0 = pmstats->next_gc*100/(gcpercent+100);
-	// Conservatively set next_gc to a high value, assuming that everything is live;
-	// concurrent/lazy sweep will reduce this number as it discovers new garbage.
-	pmstats->next_gc = pmstats->heap_alloc+(pmstats->heap_alloc-runtime_stacks_sys)*gcpercent/100;
-
-	tm4 = runtime_nanotime();
-	pmstats->last_gc = runtime_unixnanotime();  // must be Unix time to make sense to user
-	pmstats->pause_ns[pmstats->numgc%nelem(pmstats->pause_ns)] = tm4 - tm0;
-	pmstats->pause_end[pmstats->numgc%nelem(pmstats->pause_end)] = pmstats->last_gc;
-	pmstats->pause_total_ns += tm4 - tm0;
-	pmstats->numgc++;
-	if(pmstats->debuggc)
-		runtime_printf("pause %D\n", tm4-tm0);
-
-	if(runtime_debug.gctrace) {
-		heap1 = pmstats->heap_alloc;
-		runtime_updatememstats(&stats);
-		if(heap1 != pmstats->heap_alloc) {
-			runtime_printf("runtime: mstats skew: heap=%D/%D\n", heap1, pmstats->heap_alloc);
-			runtime_throw("mstats skew");
-		}
-		obj = pmstats->nmalloc - pmstats->nfree;
-
-		stats.nprocyield += work.markfor->nprocyield;
-		stats.nosyield += work.markfor->nosyield;
-		stats.nsleep += work.markfor->nsleep;
-
-		runtime_printf("gc%d(%d): %D+%D+%D+%D us, %D -> %D MB, %D (%D-%D) objects,"
-				" %d/%d/%d sweeps,"
-				" %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n",
-			pmstats->numgc, work.nproc, (tm1-tm0)/1000, (tm2-tm1)/1000, (tm3-tm2)/1000, (tm4-tm3)/1000,
-			heap0>>20, heap1>>20, obj,
-			pmstats->nmalloc, pmstats->nfree,
-			sweep.nspan, gcstats.nbgsweep, gcstats.npausesweep,
-			stats.nhandoff, stats.nhandoffcnt,
-			work.markfor->nsteal, work.markfor->nstealcnt,
-			stats.nprocyield, stats.nosyield, stats.nsleep);
-		gcstats.nbgsweep = gcstats.npausesweep = 0;
-		if(CollectStats) {
-			runtime_printf("scan: %D bytes, %D objects, %D untyped, %D types from MSpan\n",
-				gcstats.nbytes, gcstats.obj.cnt, gcstats.obj.notype, gcstats.obj.typelookup);
-			if(gcstats.ptr.cnt != 0)
-				runtime_printf("avg ptrbufsize: %D (%D/%D)\n",
-					gcstats.ptr.sum/gcstats.ptr.cnt, gcstats.ptr.sum, gcstats.ptr.cnt);
-			if(gcstats.obj.cnt != 0)
-				runtime_printf("avg nobj: %D (%D/%D)\n",
-					gcstats.obj.sum/gcstats.obj.cnt, gcstats.obj.sum, gcstats.obj.cnt);
-			runtime_printf("rescans: %D, %D bytes\n", gcstats.rescan, gcstats.rescanbytes);
-
-			runtime_printf("instruction counts:\n");
-			ninstr = 0;
-			for(i=0; i<nelem(gcstats.instr); i++) {
-				runtime_printf("\t%d:\t%D\n", i, gcstats.instr[i]);
-				ninstr += gcstats.instr[i];
-			}
-			runtime_printf("\ttotal:\t%D\n", ninstr);
-
-			runtime_printf("putempty: %D, getfull: %D\n", gcstats.putempty, gcstats.getfull);
-
-			runtime_printf("markonly base lookup: bit %D word %D span %D\n", gcstats.markonly.foundbit, gcstats.markonly.foundword, gcstats.markonly.foundspan);
-			runtime_printf("flushptrbuf base lookup: bit %D word %D span %D\n", gcstats.flushptrbuf.foundbit, gcstats.flushptrbuf.foundword, gcstats.flushptrbuf.foundspan);
-		}
-	}
-
-	// We cache the current runtime_mheap.allspans array in sweep.spans
-	// because the former can be resized and freed.
-	// Otherwise we would need to take the heap lock every time
-	// we want to convert a span index to a span pointer.
-
-	// Free the old cached array if necessary.
-	if(sweep.spans && sweep.spans != runtime_mheap.allspans)
-		runtime_SysFree(sweep.spans, sweep.nspan*sizeof(sweep.spans[0]), &pmstats->other_sys);
-	// Cache the current array.
-	runtime_mheap.sweepspans = runtime_mheap.allspans;
-	runtime_mheap.sweepgen += 2;
-	runtime_mheap.sweepdone = false;
-	sweep.spans = runtime_mheap.allspans;
-	sweep.nspan = runtime_mheap.nspan;
-	sweep.spanidx = 0;
-
-	// Temporarily disable concurrent sweep, because we see failures on builders.
-	if(ConcurrentSweep && !args->eagersweep) {
-		runtime_lock(&gclock);
-		if(sweep.g == nil)
-			sweep.g = __go_go(bgsweep, nil);
-		else if(sweep.parked) {
-			sweep.parked = false;
-			runtime_ready(sweep.g, 0, true);
-		}
-		runtime_unlock(&gclock);
-	} else {
-		// Sweep all spans eagerly.
-		while(runtime_sweepone() != (uintptr)-1)
-			gcstats.npausesweep++;
-		// Do an additional mProf_GC, because all 'free' events are now real as well.
-		runtime_MProf_GC();
-	}
-
-	runtime_MProf_GC();
-	m->traceback = 0;
-}
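As a worked example of the next_gc formula set near the end of gc() above: with gcpercent=100, negligible stack memory, and about 4 MB of live heap, the next collection triggers around 8 MB. A trivial sketch with illustrative numbers only:

#include <stdio.h>

/* Same formula as in gc() above: grow the goal by gcpercent% of the
   live (non-stack) heap. */
static unsigned long long
next_gc_goal(unsigned long long heap_alloc, unsigned long long stacks_sys, int gcpercent)
{
	return heap_alloc + (heap_alloc - stacks_sys)*gcpercent/100;
}

int main(void)
{
	unsigned long long goal = next_gc_goal(4ULL<<20, 0, 100);

	printf("next_gc ~= %llu bytes (%.1f MB)\n", goal, goal/1048576.0);
	return 0;
}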
-
-void runtime_debug_readGCStats(Slice*)
-  __asm__("runtime_debug.readGCStats");
-
-void
-runtime_debug_readGCStats(Slice *pauses)
-{
-	uint64 *p;
-	uint32 i, n;
-	MStats *pmstats;
-
-	// Calling code in runtime/debug should make the slice large enough.
-	pmstats = mstats();
-	if((size_t)pauses->cap < nelem(pmstats->pause_ns)+3)
-		runtime_throw("runtime: short slice passed to readGCStats");
-
-	// Pass back: pauses, last gc (absolute time), number of gc, total pause ns.
-	p = (uint64*)pauses->array;
-	runtime_lock(&runtime_mheap);
-	n = pmstats->numgc;
-	if(n > nelem(pmstats->pause_ns))
-		n = nelem(pmstats->pause_ns);
-	
-	// The pause buffer is circular. The most recent pause is at
-	// pause_ns[(numgc-1)%nelem(pause_ns)]; earlier pauses are found
-	// by stepping backward from there. We deliver the times most
-	// recent first (in p[0]).
-	for(i=0; i<n; i++) {
-		p[i] = pmstats->pause_ns[(pmstats->numgc-1-i)%nelem(pmstats->pause_ns)];
-		p[n+i] = pmstats->pause_end[(pmstats->numgc-1-i)%nelem(pmstats->pause_ns)];
-	}
-
-	p[n+n] = pmstats->last_gc;
-	p[n+n+1] = pmstats->numgc;
-	p[n+n+2] = pmstats->pause_total_ns;
-	runtime_unlock(&runtime_mheap);
-	pauses->__count = n+n+3;
-}
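A small self-contained illustration of the circular pause log described above; the ring size and pause values are invented:

#include <stdio.h>

#define N 8	/* toy ring size; the real pause_ns ring is larger */

int main(void)
{
	unsigned long long pause_ns[N] = {0};
	unsigned numgc = 11;	/* pretend 11 collections have run */
	unsigned k, i, n;

	/* Fill the ring the way gc() does: collection k writes slot k%N. */
	for(k = 0; k < numgc; k++)
		pause_ns[k%N] = 1000ULL*(k+1);	/* fake pause times */

	/* Deliver most recent first, as readGCStats does. */
	n = numgc < N ? numgc : N;
	for(i = 0; i < n; i++)
		printf("pause[%u] = %llu ns\n", i, pause_ns[(numgc-1-i)%N]);
	return 0;
}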
-
-int32
-runtime_setgcpercent(int32 in) {
-	int32 out;
-
-	runtime_lock(&runtime_mheap);
-	if(gcpercent == GcpercentUnknown)
-		gcpercent = readgogc();
-	out = gcpercent;
-	if(in < 0)
-		in = -1;
-	gcpercent = in;
-	runtime_unlock(&runtime_mheap);
-	return out;
-}
-
-static void
-gchelperstart(void)
-{
-	M *m;
-
-	m = runtime_m();
-	if(m->helpgc < 0 || m->helpgc >= _MaxGcproc)
-		runtime_throw("gchelperstart: bad m->helpgc");
-	if(runtime_xchg(&bufferList[m->helpgc].busy, 1))
-		runtime_throw("gchelperstart: already busy");
-	if(runtime_g() != m->g0)
-		runtime_throw("gchelper not running on g0 stack");
-}
-
-void
-runtime_marknogc(void *v)
-{
-	uintptr *b, off, shift;
-
-	off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start;  // word offset
-	b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
-	shift = off % wordsPerBitmapWord;
-	*b = (*b & ~(bitAllocated<<shift)) | bitBlockBoundary<<shift;
-}
-
-void
-runtime_markscan(void *v)
-{
-	uintptr *b, off, shift;
-
-	off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start;  // word offset
-	b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
-	shift = off % wordsPerBitmapWord;
-	*b |= bitScan<<shift;
-}
-
-// mark the block at v as freed.
-void
-runtime_markfreed(void *v)
-{
-	uintptr *b, off, shift;
-
-	if(0)
-		runtime_printf("markfreed %p\n", v);
-
-	if((byte*)v > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start)
-		runtime_throw("markfreed: bad pointer");
-
-	off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start;  // word offset
-	b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
-	shift = off % wordsPerBitmapWord;
-	*b = (*b & ~(bitMask<<shift)) | (bitAllocated<<shift);
-}
-
-// check that the block at v of size n is marked freed.
-void
-runtime_checkfreed(void *v, uintptr n)
-{
-	uintptr *b, bits, off, shift;
-
-	if(!runtime_checking)
-		return;
-
-	if((byte*)v+n > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start)
-		return;	// not allocated, so okay
-
-	off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start;  // word offset
-	b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
-	shift = off % wordsPerBitmapWord;
-
-	bits = *b>>shift;
-	if((bits & bitAllocated) != 0) {
-		runtime_printf("checkfreed %p+%p: off=%p have=%p\n",
-			v, n, off, bits & bitMask);
-		runtime_throw("checkfreed: not freed");
-	}
-}
-
-// mark the span of memory at v as having n blocks of the given size.
-// if leftover is true, there is left over space at the end of the span.
-void
-runtime_markspan(void *v, uintptr size, uintptr n, bool leftover)
-{
-	uintptr *b, *b0, off, shift, i, x;
-	byte *p;
-
-	if((byte*)v+size*n > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start)
-		runtime_throw("markspan: bad pointer");
-
-	if(runtime_checking) {
-		// bits should be all zero at the start
-		off = (byte*)v + size - runtime_mheap.arena_start;
-		b = (uintptr*)(runtime_mheap.arena_start - off/wordsPerBitmapWord);
-		for(i = 0; i < size/PtrSize/wordsPerBitmapWord; i++) {
-			if(b[i] != 0)
-				runtime_throw("markspan: span bits not zero");
-		}
-	}
-
-	p = v;
-	if(leftover)	// mark a boundary just past end of last block too
-		n++;
-
-	b0 = nil;
-	x = 0;
-	for(; n-- > 0; p += size) {
-		// Okay to use non-atomic ops here, because we control
-		// the entire span, and each bitmap word has bits for only
-		// one span, so no other goroutines are changing these
-		// bitmap words.
-		off = (uintptr*)p - (uintptr*)runtime_mheap.arena_start;  // word offset
-		b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
-		shift = off % wordsPerBitmapWord;
-		if(b0 != b) {
-			if(b0 != nil)
-				*b0 = x;
-			b0 = b;
-			x = 0;
-		}
-		x |= bitAllocated<<shift;
-	}
-	*b0 = x;
-}
-
-// unmark the span of memory at v of length n bytes.
-void
-runtime_unmarkspan(void *v, uintptr n)
-{
-	uintptr *p, *b, off;
-
-	if((byte*)v+n > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start)
-		runtime_throw("unmarkspan: bad pointer");
-
-	p = v;
-	off = p - (uintptr*)runtime_mheap.arena_start;  // word offset
-	if(off % wordsPerBitmapWord != 0)
-		runtime_throw("unmarkspan: unaligned pointer");
-	b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1;
-	n /= PtrSize;
-	if(n%wordsPerBitmapWord != 0)
-		runtime_throw("unmarkspan: unaligned length");
-	// Okay to use non-atomic ops here, because we control
-	// the entire span, and each bitmap word has bits for only
-	// one span, so no other goroutines are changing these
-	// bitmap words.
-	n /= wordsPerBitmapWord;
-	while(n-- > 0)
-		*b-- = 0;
-}
-
-void
-runtime_MHeap_MapBits(MHeap *h)
-{
-	size_t page_size;
-
-	// Caller has added extra mappings to the arena.
-	// Add extra mappings of bitmap words as needed.
-	// We allocate extra bitmap pieces in chunks of bitmapChunk.
-	enum {
-		bitmapChunk = 8192
-	};
-	uintptr n;
-
-	n = (h->arena_used - h->arena_start) / wordsPerBitmapWord;
-	n = ROUND(n, bitmapChunk);
-	n = ROUND(n, PageSize);
-	page_size = getpagesize();
-	n = ROUND(n, page_size);
-	if(h->bitmap_mapped >= n)
-		return;
-
-	runtime_SysMap(h->arena_start - n, n - h->bitmap_mapped, h->arena_reserved, &mstats()->gc_sys);
-	h->bitmap_mapped = n;
-}
diff --git a/libgo/runtime/mgc0.h b/libgo/runtime/mgc0.h
deleted file mode 100644
index 16000d1..0000000
--- a/libgo/runtime/mgc0.h
+++ /dev/null
@@ -1,87 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Garbage collector (GC)
-
-// GC instruction opcodes.
-//
-// The opcode of an instruction is followed by zero or more
-// arguments to the instruction.
-//
-// Meaning of arguments:
-//   off      Offset (in bytes) from the start of the current object
-//   objgc    Pointer to GC info of an object
-//   objgcrel Offset to GC info of an object
-//   len      Length of an array
-//   elemsize Size (in bytes) of an element
-//   size     Size (in bytes)
-//
-// NOTE: There is a copy of these in ../reflect/type.go.
-// They must be kept in sync.
-enum {
-	GC_END,         // End of object, loop or subroutine. Args: none
-	GC_PTR,         // A typed pointer. Args: (off, objgc)
-	GC_APTR,        // Pointer to an arbitrary object. Args: (off)
-	GC_ARRAY_START, // Start an array with a fixed length. Args: (off, len, elemsize)
-	GC_ARRAY_NEXT,  // The next element of an array. Args: none
-	GC_CALL,        // Call a subroutine. Args: (off, objgcrel)
-	GC_CHAN_PTR,    // Go channel. Args: (off, ChanType*)
-	GC_STRING,      // Go string. Args: (off)
-	GC_EFACE,       // interface{}. Args: (off)
-	GC_IFACE,       // interface{...}. Args: (off)
-	GC_SLICE,       // Go slice. Args: (off, objgc)
-	GC_REGION,      // A region/part of the current object. Args: (off, size, objgc)
-
-	GC_NUM_INSTR,   // Number of instruction opcodes
-};
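As an illustration of this format, a hand-built descriptor laid out the way the scanblock interpreter above reads it: the size of the object first, then opcodes with their arguments, terminated by GC_END. The struct and its offsets are invented for the example:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum { GC_END, GC_PTR, GC_APTR };	/* first few opcode values from the enum above */

struct example {
	long tag;
	void *payload;	/* the only pointer field */
};

/* Word 0 is the object size; then opcodes with their arguments;
   GC_END terminates the program. */
static const uintptr_t example_gcdata[] = {
	sizeof(struct example),
	GC_APTR, offsetof(struct example, payload),
	GC_END,
};

int main(void)
{
	printf("object size %zu, pointer at offset %zu\n",
	       (size_t)example_gcdata[0], (size_t)example_gcdata[2]);
	return 0;
}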
-
-enum {
-	// Size of GC's fixed stack.
-	//
-	// The current GC implementation permits:
-	//  - at most 1 stack allocation because of GC_CALL
-	//  - at most GC_STACK_CAPACITY allocations because of GC_ARRAY_START
-	GC_STACK_CAPACITY = 8,	
-};
-
-enum {
-	ScanStackByFrames = 1,
-	IgnorePreciseGC = 0,
-
-	// Four bits per word (see #defines below).
-	wordsPerBitmapWord = sizeof(void*)*8/4,
-	bitShift = sizeof(void*)*8/4,
-};
-
-// Bits in per-word bitmap.
-// #defines because enum might not be able to hold the values.
-//
-// Each word in the bitmap describes wordsPerBitmapWord words
-// of heap memory.  There are 4 bitmap bits dedicated to each heap word,
-// so on a 64-bit system there is one bitmap word per 16 heap words.
-// The bits in the word are packed together by type first, then by
-// heap location, so each 64-bit bitmap word consists of, from top to bottom,
-// the 16 bitMarked bits for the corresponding heap words,
-// then the 16 bitScan/bitBlockBoundary bits, then the 16 bitAllocated bits.
-// This layout makes it easier to iterate over the bits of a given type.
-//
-// The bitmap starts at mheap.arena_start and extends *backward* from
-// there.  On a 64-bit system the off'th word in the arena is tracked by
-// the off/16+1'th word before mheap.arena_start.  (On a 32-bit system,
-// the only difference is that the divisor is 8.)
-//
-// To pull out the bits corresponding to a given pointer p, we use:
-//
-//	off = p - (uintptr*)mheap.arena_start;  // word offset
-//	b = (uintptr*)mheap.arena_start - off/wordsPerBitmapWord - 1;
-//	shift = off % wordsPerBitmapWord
-//	bits = *b >> shift;
-//	/* then test bits & bitAllocated, bits & bitMarked, etc. */
-//
-#define bitAllocated		((uintptr)1<<(bitShift*0))	/* block start; eligible for garbage collection */
-#define bitScan			((uintptr)1<<(bitShift*1))	/* when bitAllocated is set */
-#define bitMarked		((uintptr)1<<(bitShift*2))	/* when bitAllocated is set */
-#define bitBlockBoundary	((uintptr)1<<(bitShift*1))	/* when bitAllocated is NOT set - mark for FlagNoGC objects */
-
-#define bitMask (bitAllocated | bitScan | bitMarked)
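The lookup described above can be checked in isolation. This sketch uses a fake arena and a locally allocated bitmap in place of the real mheap; the sizes and the chosen offset are arbitrary:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define WORDS_PER_BITMAP_WORD (sizeof(void*)*8/4)
#define BIT_SHIFT             (sizeof(void*)*8/4)
#define BIT_ALLOCATED         ((uintptr_t)1<<(BIT_SHIFT*0))
#define BIT_MARKED            ((uintptr_t)1<<(BIT_SHIFT*2))

int main(void)
{
	enum { ARENA_WORDS = 512 };	/* a tiny pretend arena */
	size_t bitmap_words = ARENA_WORDS/WORDS_PER_BITMAP_WORD;

	/* The bitmap sits immediately below arena_start and grows downward,
	   so allocate it and the arena as one block with the bitmap first. */
	uintptr_t *block = calloc(bitmap_words + ARENA_WORDS, sizeof(uintptr_t));
	if(block == NULL)
		return 1;
	uintptr_t *arena_start = block + bitmap_words;
	uintptr_t *p = arena_start + 37;	/* some word-aligned object */

	/* The lookup from the comment above. */
	uintptr_t off = p - arena_start;
	uintptr_t *b = arena_start - off/WORDS_PER_BITMAP_WORD - 1;
	uintptr_t shift = off % WORDS_PER_BITMAP_WORD;
	uintptr_t bits;

	*b |= (BIT_ALLOCATED|BIT_MARKED)<<shift;	/* mark the word */
	bits = *b>>shift;
	printf("allocated=%d marked=%d\n",
	       (bits & BIT_ALLOCATED) != 0, (bits & BIT_MARKED) != 0);

	free(block);
	return 0;
}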
diff --git a/libgo/runtime/mheap.c b/libgo/runtime/mheap.c
deleted file mode 100644
index 80b9c8f..0000000
--- a/libgo/runtime/mheap.c
+++ /dev/null
@@ -1,895 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Page heap.
-//
-// See malloc.h for overview.
-//
-// When a MSpan is in the heap free list, state == MSpanFree
-// and heapmap(s->start) == span, heapmap(s->start+s->npages-1) == span.
-//
-// When a MSpan is allocated, state == MSpanInUse
-// and heapmap(i) == span for all s->start <= i < s->start+s->npages.
-
-#include "runtime.h"
-#include "arch.h"
-#include "malloc.h"
-
-static MSpan *MHeap_AllocLocked(MHeap*, uintptr, int32);
-static bool MHeap_Grow(MHeap*, uintptr);
-static void MHeap_FreeLocked(MHeap*, MSpan*);
-static MSpan *MHeap_AllocLarge(MHeap*, uintptr);
-static MSpan *BestFit(MSpan*, uintptr, MSpan*);
-
-static void
-RecordSpan(void *vh, byte *p)
-{
-	MHeap *h;
-	MSpan *s;
-	MSpan **all;
-	uint32 cap;
-
-	h = vh;
-	s = (MSpan*)p;
-	if(h->nspan >= h->nspancap) {
-		cap = 64*1024/sizeof(all[0]);
-		if(cap < h->nspancap*3/2)
-			cap = h->nspancap*3/2;
-		all = (MSpan**)runtime_SysAlloc(cap*sizeof(all[0]), &mstats()->other_sys);
-		if(all == nil)
-			runtime_throw("runtime: cannot allocate memory");
-		if(h->allspans) {
-			runtime_memmove(all, h->allspans, h->nspancap*sizeof(all[0]));
-			// Don't free the old array if it's referenced by sweep.
-			// See the comment in mgc0.c.
-			if(h->allspans != runtime_mheap.sweepspans)
-				runtime_SysFree(h->allspans, h->nspancap*sizeof(all[0]), &mstats()->other_sys);
-		}
-		h->allspans = all;
-		h->nspancap = cap;
-	}
-	h->allspans[h->nspan++] = s;
-}
-
-// Initialize the heap; fetch memory using alloc.
-void
-runtime_MHeap_Init(MHeap *h)
-{
-	MStats *pmstats;
-	uint32 i;
-
-	pmstats = mstats();
-	runtime_FixAlloc_Init(&h->spanalloc, sizeof(MSpan), RecordSpan, h, &pmstats->mspan_sys);
-	runtime_FixAlloc_Init(&h->cachealloc, sizeof(MCache), nil, nil, &pmstats->mcache_sys);
-	runtime_FixAlloc_Init(&h->specialfinalizeralloc, sizeof(SpecialFinalizer), nil, nil, &pmstats->other_sys);
-	runtime_FixAlloc_Init(&h->specialprofilealloc, sizeof(SpecialProfile), nil, nil, &pmstats->other_sys);
-	// h->mapcache needs no init
-	for(i=0; i<nelem(h->free); i++) {
-		runtime_MSpanList_Init(&h->free[i]);
-		runtime_MSpanList_Init(&h->busy[i]);
-	}
-	runtime_MSpanList_Init(&h->freelarge);
-	runtime_MSpanList_Init(&h->busylarge);
-	for(i=0; i<nelem(h->central); i++)
-		runtime_MCentral_Init(&h->central[i], i);
-}
-
-void
-runtime_MHeap_MapSpans(MHeap *h)
-{
-	uintptr pagesize;
-	uintptr n;
-
-	// Map spans array, PageSize at a time.
-	n = (uintptr)h->arena_used;
-	n -= (uintptr)h->arena_start;
-	n = n / PageSize * sizeof(h->spans[0]);
-	n = ROUND(n, PageSize);
-	pagesize = getpagesize();
-	n = ROUND(n, pagesize);
-	if(h->spans_mapped >= n)
-		return;
-	runtime_SysMap((byte*)h->spans + h->spans_mapped, n - h->spans_mapped, h->arena_reserved, &mstats()->other_sys);
-	h->spans_mapped = n;
-}
-
-// Sweeps spans in list until it reclaims at least npages pages into the heap.
-// Returns the actual number of pages reclaimed.
-static uintptr
-MHeap_ReclaimList(MHeap *h, MSpan *list, uintptr npages)
-{
-	MSpan *s;
-	uintptr n;
-	uint32 sg;
-
-	n = 0;
-	sg = runtime_mheap.sweepgen;
-retry:
-	for(s = list->next; s != list; s = s->next) {
-		if(s->sweepgen == sg-2 && runtime_cas(&s->sweepgen, sg-2, sg-1)) {
-			runtime_MSpanList_Remove(s);
-			// swept spans are at the end of the list
-			runtime_MSpanList_InsertBack(list, s);
-			runtime_unlock(h);
-			n += runtime_MSpan_Sweep(s);
-			runtime_lock(h);
-			if(n >= npages)
-				return n;
-			// the span could have been moved elsewhere
-			goto retry;
-		}
-		if(s->sweepgen == sg-1) {
-			// the span is being swept by the background sweeper, skip it
-			continue;
-		}
-		// Already-swept empty span;
-		// all subsequent ones must also be either swept or in the process of being swept.
-		break;
-	}
-	return n;
-}
-
-// Sweeps and reclaims at least npage pages into heap.
-// Called before allocating npage pages.
-static void
-MHeap_Reclaim(MHeap *h, uintptr npage)
-{
-	uintptr reclaimed, n;
-
-	// First try to sweep busy spans with large objects of size >= npage;
-	// this has a good chance of reclaiming the necessary space.
-	for(n=npage; n < nelem(h->busy); n++) {
-		if(MHeap_ReclaimList(h, &h->busy[n], npage))
-			return;  // Bingo!
-	}
-
-	// Then -- even larger objects.
-	if(MHeap_ReclaimList(h, &h->busylarge, npage))
-		return;  // Bingo!
-
-	// Now try smaller objects.
-	// One such object is not enough, so we need to reclaim several of them.
-	reclaimed = 0;
-	for(n=0; n < npage && n < nelem(h->busy); n++) {
-		reclaimed += MHeap_ReclaimList(h, &h->busy[n], npage-reclaimed);
-		if(reclaimed >= npage)
-			return;
-	}
-
-	// Now sweep everything that is not yet swept.
-	runtime_unlock(h);
-	for(;;) {
-		n = runtime_sweepone();
-		if(n == (uintptr)-1)  // all spans are swept
-			break;
-		reclaimed += n;
-		if(reclaimed >= npage)
-			break;
-	}
-	runtime_lock(h);
-}
-
-// Allocate a new span of npage pages from the heap
-// and record its size class in the HeapMap and HeapMapCache.
-MSpan*
-runtime_MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, bool large, bool needzero)
-{
-	MStats *pmstats;
-	MSpan *s;
-
-	runtime_lock(h);
-	pmstats = mstats();
-	pmstats->heap_alloc += (intptr)runtime_m()->mcache->local_cachealloc;
-	runtime_m()->mcache->local_cachealloc = 0;
-	s = MHeap_AllocLocked(h, npage, sizeclass);
-	if(s != nil) {
-		pmstats->heap_inuse += npage<<PageShift;
-		if(large) {
-			pmstats->heap_objects++;
-			pmstats->heap_alloc += npage<<PageShift;
-			// Swept spans are at the end of lists.
-			if(s->npages < nelem(h->free))
-				runtime_MSpanList_InsertBack(&h->busy[s->npages], s);
-			else
-				runtime_MSpanList_InsertBack(&h->busylarge, s);
-		}
-	}
-	runtime_unlock(h);
-	if(s != nil) {
-		if(needzero && s->needzero)
-			runtime_memclr((byte*)(s->start<<PageShift), s->npages<<PageShift);
-		s->needzero = 0;
-	}
-	return s;
-}
-
-static MSpan*
-MHeap_AllocLocked(MHeap *h, uintptr npage, int32 sizeclass)
-{
-	uintptr n;
-	MSpan *s, *t;
-	PageID p;
-
-	// To prevent excessive heap growth, before allocating n pages
-	// we need to sweep and reclaim at least n pages.
-	if(!h->sweepdone)
-		MHeap_Reclaim(h, npage);
-
-	// Try in fixed-size lists up to max.
-	for(n=npage; n < nelem(h->free); n++) {
-		if(!runtime_MSpanList_IsEmpty(&h->free[n])) {
-			s = h->free[n].next;
-			goto HaveSpan;
-		}
-	}
-
-	// Best fit in list of large spans.
-	if((s = MHeap_AllocLarge(h, npage)) == nil) {
-		if(!MHeap_Grow(h, npage))
-			return nil;
-		if((s = MHeap_AllocLarge(h, npage)) == nil)
-			return nil;
-	}
-
-HaveSpan:
-	// Mark span in use.
-	if(s->state != MSpanFree)
-		runtime_throw("MHeap_AllocLocked - MSpan not free");
-	if(s->npages < npage)
-		runtime_throw("MHeap_AllocLocked - bad npages");
-	runtime_MSpanList_Remove(s);
-	runtime_atomicstore(&s->sweepgen, h->sweepgen);
-	s->state = MSpanInUse;
-	mstats()->heap_idle -= s->npages<<PageShift;
-	mstats()->heap_released -= s->npreleased<<PageShift;
-	if(s->npreleased > 0)
-		runtime_SysUsed((void*)(s->start<<PageShift), s->npages<<PageShift);
-	s->npreleased = 0;
-
-	if(s->npages > npage) {
-		// Trim extra and put it back in the heap.
-		t = runtime_FixAlloc_Alloc(&h->spanalloc);
-		runtime_MSpan_Init(t, s->start + npage, s->npages - npage);
-		s->npages = npage;
-		p = t->start;
-		p -= ((uintptr)h->arena_start>>PageShift);
-		if(p > 0)
-			h->spans[p-1] = s;
-		h->spans[p] = t;
-		h->spans[p+t->npages-1] = t;
-		t->needzero = s->needzero;
-		runtime_atomicstore(&t->sweepgen, h->sweepgen);
-		t->state = MSpanInUse;
-		MHeap_FreeLocked(h, t);
-		t->unusedsince = s->unusedsince; // preserve age
-	}
-	s->unusedsince = 0;
-
-	// Record span info, because gc needs to be
-	// able to map interior pointer to containing span.
-	s->sizeclass = sizeclass;
-	s->elemsize = (sizeclass==0 ? s->npages<<PageShift : (uintptr)runtime_class_to_size[sizeclass]);
-	s->types.compression = MTypes_Empty;
-	p = s->start;
-	p -= ((uintptr)h->arena_start>>PageShift);
-	for(n=0; n<npage; n++)
-		h->spans[p+n] = s;
-	return s;
-}
-
-// Allocate a span of exactly npage pages from the list of large spans.
-static MSpan*
-MHeap_AllocLarge(MHeap *h, uintptr npage)
-{
-	return BestFit(&h->freelarge, npage, nil);
-}
-
-// Search list for smallest span with >= npage pages.
-// If there are multiple smallest spans, take the one
-// with the earliest starting address.
-static MSpan*
-BestFit(MSpan *list, uintptr npage, MSpan *best)
-{
-	MSpan *s;
-
-	for(s=list->next; s != list; s=s->next) {
-		if(s->npages < npage)
-			continue;
-		if(best == nil
-		|| s->npages < best->npages
-		|| (s->npages == best->npages && s->start < best->start))
-			best = s;
-	}
-	return best;
-}
-
-// Try to add at least npage pages of memory to the heap,
-// returning whether it worked.
-static bool
-MHeap_Grow(MHeap *h, uintptr npage)
-{
-	uintptr ask;
-	void *v;
-	MSpan *s;
-	PageID p;
-
-	// Ask for a big chunk, to reduce the number of mappings
-	// the operating system needs to track; also amortizes
-	// the overhead of an operating system mapping.
-	// Allocate a multiple of 64kB (16 pages).
-	npage = (npage+15)&~15;
-	ask = npage<<PageShift;
-	if(ask < HeapAllocChunk)
-		ask = HeapAllocChunk;
-
-	v = runtime_MHeap_SysAlloc(h, ask);
-	if(v == nil) {
-		if(ask > (npage<<PageShift)) {
-			ask = npage<<PageShift;
-			v = runtime_MHeap_SysAlloc(h, ask);
-		}
-		if(v == nil) {
-			runtime_printf("runtime: out of memory: cannot allocate %D-byte block (%D in use)\n", (uint64)ask, mstats()->heap_sys);
-			return false;
-		}
-	}
-
-	// Create a fake "in use" span and free it, so that the
-	// right coalescing happens.
-	s = runtime_FixAlloc_Alloc(&h->spanalloc);
-	runtime_MSpan_Init(s, (uintptr)v>>PageShift, ask>>PageShift);
-	p = s->start;
-	p -= ((uintptr)h->arena_start>>PageShift);
-	h->spans[p] = s;
-	h->spans[p + s->npages - 1] = s;
-	runtime_atomicstore(&s->sweepgen, h->sweepgen);
-	s->state = MSpanInUse;
-	MHeap_FreeLocked(h, s);
-	return true;
-}
-
-// Look up the span at the given address.
-// Address is guaranteed to be in map
-// and is guaranteed to be start or end of span.
-MSpan*
-runtime_MHeap_Lookup(MHeap *h, void *v)
-{
-	uintptr p;
-	
-	p = (uintptr)v;
-	p -= (uintptr)h->arena_start;
-	return h->spans[p >> PageShift];
-}
-
-// Look up the span at the given address.
-// Address is *not* guaranteed to be in map
-// and may be anywhere in the span.
-// Map entries for the middle of a span are only
-// valid for allocated spans.  Free spans may have
-// other garbage in their middles, so we have to
-// check for that.
-MSpan*
-runtime_MHeap_LookupMaybe(MHeap *h, void *v)
-{
-	MSpan *s;
-	PageID p, q;
-
-	if((byte*)v < h->arena_start || (byte*)v >= h->arena_used)
-		return nil;
-	p = (uintptr)v>>PageShift;
-	q = p;
-	q -= (uintptr)h->arena_start >> PageShift;
-	s = h->spans[q];
-	if(s == nil || p < s->start || (uintptr)v >= s->limit || s->state != MSpanInUse)
-		return nil;
-	return s;
-}
-
-// Free the span back into the heap.
-void
-runtime_MHeap_Free(MHeap *h, MSpan *s, int32 acct)
-{
-	MStats *pmstats;
-
-	runtime_lock(h);
-	pmstats = mstats();
-	pmstats->heap_alloc += (intptr)runtime_m()->mcache->local_cachealloc;
-	runtime_m()->mcache->local_cachealloc = 0;
-	pmstats->heap_inuse -= s->npages<<PageShift;
-	if(acct) {
-		pmstats->heap_alloc -= s->npages<<PageShift;
-		pmstats->heap_objects--;
-	}
-	MHeap_FreeLocked(h, s);
-	runtime_unlock(h);
-}
-
-static void
-MHeap_FreeLocked(MHeap *h, MSpan *s)
-{
-	MSpan *t;
-	PageID p;
-
-	s->types.compression = MTypes_Empty;
-
-	if(s->state != MSpanInUse || s->ref != 0 || s->sweepgen != h->sweepgen) {
-		runtime_printf("MHeap_FreeLocked - span %p ptr %p state %d ref %d sweepgen %d/%d\n",
-			s, s->start<<PageShift, s->state, s->ref, s->sweepgen, h->sweepgen);
-		runtime_throw("MHeap_FreeLocked - invalid free");
-	}
-	mstats()->heap_idle += s->npages<<PageShift;
-	s->state = MSpanFree;
-	runtime_MSpanList_Remove(s);
-	// Stamp newly unused spans. The scavenger will use that
-	// info to potentially give back some pages to the OS.
-	s->unusedsince = runtime_nanotime();
-	s->npreleased = 0;
-
-	// Coalesce with earlier, later spans.
-	p = s->start;
-	p -= (uintptr)h->arena_start >> PageShift;
-	if(p > 0 && (t = h->spans[p-1]) != nil && t->state != MSpanInUse) {
-		s->start = t->start;
-		s->npages += t->npages;
-		s->npreleased = t->npreleased; // absorb released pages
-		s->needzero |= t->needzero;
-		p -= t->npages;
-		h->spans[p] = s;
-		runtime_MSpanList_Remove(t);
-		t->state = MSpanDead;
-		runtime_FixAlloc_Free(&h->spanalloc, t);
-	}
-	if((p+s->npages)*sizeof(h->spans[0]) < h->spans_mapped && (t = h->spans[p+s->npages]) != nil && t->state != MSpanInUse) {
-		s->npages += t->npages;
-		s->npreleased += t->npreleased;
-		s->needzero |= t->needzero;
-		h->spans[p + s->npages - 1] = s;
-		runtime_MSpanList_Remove(t);
-		t->state = MSpanDead;
-		runtime_FixAlloc_Free(&h->spanalloc, t);
-	}
-
-	// Insert s into appropriate list.
-	if(s->npages < nelem(h->free))
-		runtime_MSpanList_Insert(&h->free[s->npages], s);
-	else
-		runtime_MSpanList_Insert(&h->freelarge, s);
-}
-
-static uintptr
-scavengelist(MSpan *list, uint64 now, uint64 limit)
-{
-	uintptr released, sumreleased, start, end, pagesize;
-	MSpan *s;
-
-	if(runtime_MSpanList_IsEmpty(list))
-		return 0;
-
-	sumreleased = 0;
-	for(s=list->next; s != list; s=s->next) {
-		if((now - s->unusedsince) > limit && s->npreleased != s->npages) {
-			released = (s->npages - s->npreleased) << PageShift;
-			mstats()->heap_released += released;
-			sumreleased += released;
-			s->npreleased = s->npages;
-
-			start = s->start << PageShift;
-			end = start + (s->npages << PageShift);
-
-			// Round start up and end down to ensure we
-			// are acting on entire pages.
-			pagesize = getpagesize();
-			start = ROUND(start, pagesize);
-			end &= ~(pagesize - 1);
-			if(end > start)
-				runtime_SysUnused((void*)start, end - start);
-		}
-	}
-	return sumreleased;
-}
-
-void scavenge(int32, uint64, uint64)
-  __asm__ (GOSYM_PREFIX "runtime.scavenge");
-
-void
-scavenge(int32 k, uint64 now, uint64 limit)
-{
-	uint32 i;
-	uintptr sumreleased;
-	MHeap *h;
-	
-	h = &runtime_mheap;
-	sumreleased = 0;
-	for(i=0; i < nelem(h->free); i++)
-		sumreleased += scavengelist(&h->free[i], now, limit);
-	sumreleased += scavengelist(&h->freelarge, now, limit);
-
-	if(runtime_debug.gctrace > 0) {
-		if(sumreleased > 0)
-			runtime_printf("scvg%d: %D MB released\n", k, (uint64)sumreleased>>20);
-		runtime_printf("scvg%d: inuse: %D, idle: %D, sys: %D, released: %D, consumed: %D (MB)\n",
-			k, mstats()->heap_inuse>>20, mstats()->heap_idle>>20, mstats()->heap_sys>>20,
-			mstats()->heap_released>>20, (mstats()->heap_sys - mstats()->heap_released)>>20);
-	}
-}
-
-void runtime_debug_freeOSMemory(void) __asm__("runtime_debug.freeOSMemory");
-
-void
-runtime_debug_freeOSMemory(void)
-{
-	runtime_gc(2);  // force GC and do eager sweep
-	runtime_lock(&runtime_mheap);
-	scavenge(-1, ~(uintptr)0, 0);
-	runtime_unlock(&runtime_mheap);
-}
-
-// Initialize a new span with the given start and npages.
-void
-runtime_MSpan_Init(MSpan *span, PageID start, uintptr npages)
-{
-	span->next = nil;
-	span->prev = nil;
-	span->start = start;
-	span->npages = npages;
-	span->freelist = nil;
-	span->ref = 0;
-	span->sizeclass = 0;
-	span->incache = false;
-	span->elemsize = 0;
-	span->state = MSpanDead;
-	span->unusedsince = 0;
-	span->npreleased = 0;
-	span->types.compression = MTypes_Empty;
-	span->speciallock.key = 0;
-	span->specials = nil;
-	span->needzero = 0;
-	span->freebuf = nil;
-}
-
-// Initialize an empty doubly-linked list.
-void
-runtime_MSpanList_Init(MSpan *list)
-{
-	list->state = MSpanListHead;
-	list->next = list;
-	list->prev = list;
-}
-
-void
-runtime_MSpanList_Remove(MSpan *span)
-{
-	if(span->prev == nil && span->next == nil)
-		return;
-	span->prev->next = span->next;
-	span->next->prev = span->prev;
-	span->prev = nil;
-	span->next = nil;
-}
-
-bool
-runtime_MSpanList_IsEmpty(MSpan *list)
-{
-	return list->next == list;
-}
-
-void
-runtime_MSpanList_Insert(MSpan *list, MSpan *span)
-{
-	if(span->next != nil || span->prev != nil) {
-		runtime_printf("failed MSpanList_Insert %p %p %p\n", span, span->next, span->prev);
-		runtime_throw("MSpanList_Insert");
-	}
-	span->next = list->next;
-	span->prev = list;
-	span->next->prev = span;
-	span->prev->next = span;
-}
-
-void
-runtime_MSpanList_InsertBack(MSpan *list, MSpan *span)
-{
-	if(span->next != nil || span->prev != nil) {
-		runtime_printf("failed MSpanList_Insert %p %p %p\n", span, span->next, span->prev);
-		runtime_throw("MSpanList_Insert");
-	}
-	span->next = list;
-	span->prev = list->prev;
-	span->next->prev = span;
-	span->prev->next = span;
-}
-
-// Adds the special record s to the list of special records for
-// the object p.  All fields of s should be filled in except for
-// offset & next, which this routine will fill in.
-// Returns true if the special was successfully added, false otherwise.
-// (The add will fail only if a record with the same p and s->kind
-//  already exists.)
-static bool
-addspecial(void *p, Special *s)
-{
-	MSpan *span;
-	Special **t, *x;
-	uintptr offset;
-	byte kind;
-
-	span = runtime_MHeap_LookupMaybe(&runtime_mheap, p);
-	if(span == nil)
-		runtime_throw("addspecial on invalid pointer");
-
-	// Ensure that the span is swept.
-	// GC accesses specials list w/o locks. And it's just much safer.
-	runtime_m()->locks++;
-	runtime_MSpan_EnsureSwept(span);
-
-	offset = (uintptr)p - (span->start << PageShift);
-	kind = s->kind;
-
-	runtime_lock(&span->speciallock);
-
-	// Find splice point, check for existing record.
-	t = &span->specials;
-	while((x = *t) != nil) {
-		if(offset == x->offset && kind == x->kind) {
-			runtime_unlock(&span->speciallock);
-			runtime_m()->locks--;
-			return false; // already exists
-		}
-		if(offset < x->offset || (offset == x->offset && kind < x->kind))
-			break;
-		t = &x->next;
-	}
-	// Splice in record, fill in offset.
-	s->offset = offset;
-	s->next = x;
-	*t = s;
-	runtime_unlock(&span->speciallock);
-	runtime_m()->locks--;
-	return true;
-}
-
-// Removes the Special record of the given kind for the object p.
-// Returns the record if the record existed, nil otherwise.
-// The caller must FixAlloc_Free the result.
-static Special*
-removespecial(void *p, byte kind)
-{
-	MSpan *span;
-	Special *s, **t;
-	uintptr offset;
-
-	span = runtime_MHeap_LookupMaybe(&runtime_mheap, p);
-	if(span == nil)
-		runtime_throw("removespecial on invalid pointer");
-
-	// Ensure that the span is swept.
-	// GC accesses specials list w/o locks. And it's just much safer.
-	runtime_m()->locks++;
-	runtime_MSpan_EnsureSwept(span);
-
-	offset = (uintptr)p - (span->start << PageShift);
-
-	runtime_lock(&span->speciallock);
-	t = &span->specials;
-	while((s = *t) != nil) {
-		// This function is used for finalizers only, so we don't check for
-		// "interior" specials (p must be exactly equal to s->offset).
-		if(offset == s->offset && kind == s->kind) {
-			*t = s->next;
-			runtime_unlock(&span->speciallock);
-			runtime_m()->locks--;
-			return s;
-		}
-		t = &s->next;
-	}
-	runtime_unlock(&span->speciallock);
-	runtime_m()->locks--;
-	return nil;
-}
-
-// Adds a finalizer to the object p.  Returns true if it succeeded.
-bool
-runtime_addfinalizer(void *p, FuncVal *f, const FuncType *ft, const PtrType *ot)
-{
-	SpecialFinalizer *s;
-
-	runtime_lock(&runtime_mheap.speciallock);
-	s = runtime_FixAlloc_Alloc(&runtime_mheap.specialfinalizeralloc);
-	runtime_unlock(&runtime_mheap.speciallock);
-	s->kind = KindSpecialFinalizer;
-	s->fn = f;
-	s->ft = ft;
-	s->ot = ot;
-	if(addspecial(p, s))
-		return true;
-
-	// There was an old finalizer
-	runtime_lock(&runtime_mheap.speciallock);
-	runtime_FixAlloc_Free(&runtime_mheap.specialfinalizeralloc, s);
-	runtime_unlock(&runtime_mheap.speciallock);
-	return false;
-}
-
-// Removes the finalizer (if any) from the object p.
-void
-runtime_removefinalizer(void *p)
-{
-	SpecialFinalizer *s;
-
-	s = (SpecialFinalizer*)removespecial(p, KindSpecialFinalizer);
-	if(s == nil)
-		return; // there wasn't a finalizer to remove
-	runtime_lock(&runtime_mheap.speciallock);
-	runtime_FixAlloc_Free(&runtime_mheap.specialfinalizeralloc, s);
-	runtime_unlock(&runtime_mheap.speciallock);
-}
-
-// Set the heap profile bucket associated with addr to b.
-void
-runtime_setprofilebucket(void *p, Bucket *b)
-{
-	SpecialProfile *s;
-
-	runtime_lock(&runtime_mheap.speciallock);
-	s = runtime_FixAlloc_Alloc(&runtime_mheap.specialprofilealloc);
-	runtime_unlock(&runtime_mheap.speciallock);
-	s->kind = KindSpecialProfile;
-	s->b = b;
-	if(!addspecial(p, s))
-		runtime_throw("setprofilebucket: profile already set");
-}
-
-// Do whatever cleanup needs to be done to deallocate s.  It has
-// already been unlinked from the MSpan specials list.
-// Returns true if we should keep working on deallocating p.
-bool
-runtime_freespecial(Special *s, void *p, uintptr size, bool freed)
-{
-	SpecialFinalizer *sf;
-	SpecialProfile *sp;
-
-	switch(s->kind) {
-	case KindSpecialFinalizer:
-		sf = (SpecialFinalizer*)s;
-		runtime_queuefinalizer(p, sf->fn, sf->ft, sf->ot);
-		runtime_lock(&runtime_mheap.speciallock);
-		runtime_FixAlloc_Free(&runtime_mheap.specialfinalizeralloc, sf);
-		runtime_unlock(&runtime_mheap.speciallock);
-		return false; // don't free p until finalizer is done
-	case KindSpecialProfile:
-		sp = (SpecialProfile*)s;
-		runtime_MProf_Free(sp->b, size, freed);
-		runtime_lock(&runtime_mheap.speciallock);
-		runtime_FixAlloc_Free(&runtime_mheap.specialprofilealloc, sp);
-		runtime_unlock(&runtime_mheap.speciallock);
-		return true;
-	default:
-		runtime_throw("bad special kind");
-		return true;
-	}
-}
-
-// Free all special records for p.
-void
-runtime_freeallspecials(MSpan *span, void *p, uintptr size)
-{
-	Special *s, **t, *list;
-	uintptr offset;
-
-	if(span->sweepgen != runtime_mheap.sweepgen)
-		runtime_throw("runtime: freeallspecials: unswept span");
-	// first, collect all specials into the list; then, free them
-	// this is required to not cause deadlock between span->specialLock and proflock
-	list = nil;
-	offset = (uintptr)p - (span->start << PageShift);
-	runtime_lock(&span->speciallock);
-	t = &span->specials;
-	while((s = *t) != nil) {
-		if(offset + size <= s->offset)
-			break;
-		if(offset <= s->offset) {
-			*t = s->next;
-			s->next = list;
-			list = s;
-		} else
-			t = &s->next;
-	}
-	runtime_unlock(&span->speciallock);
-
-	while(list != nil) {
-		s = list;
-		list = s->next;
-		if(!runtime_freespecial(s, p, size, true))
-			runtime_throw("can't explicitly free an object with a finalizer");
-	}
-}
-
-// Split an allocated span into two equal parts.
-void
-runtime_MHeap_SplitSpan(MHeap *h, MSpan *s)
-{
-	MSpan *t;
-	MCentral *c;
-	uintptr i;
-	uintptr npages;
-	PageID p;
-
-	if(s->state != MSpanInUse)
-		runtime_throw("MHeap_SplitSpan on a free span");
-	if(s->sizeclass != 0 && s->ref != 1)
-		runtime_throw("MHeap_SplitSpan doesn't have an allocated object");
-	npages = s->npages;
-
-	// remove the span from whatever list it is in now
-	if(s->sizeclass > 0) {
-		// must be in h->central[x].mempty
-		c = &h->central[s->sizeclass];
-		runtime_lock(c);
-		runtime_MSpanList_Remove(s);
-		runtime_unlock(c);
-		runtime_lock(h);
-	} else {
-		// must be in h->busy/busylarge
-		runtime_lock(h);
-		runtime_MSpanList_Remove(s);
-	}
-	// heap is locked now
-
-	if(npages == 1) {
-		// convert span of 1 PageSize object to a span of 2 PageSize/2 objects.
-		s->ref = 2;
-		s->sizeclass = runtime_SizeToClass(PageSize/2);
-		s->elemsize = PageSize/2;
-	} else {
-		// convert span of n>1 pages into two spans of n/2 pages each.
-		if((s->npages & 1) != 0)
-			runtime_throw("MHeap_SplitSpan on an odd size span");
-
-		// compute position in h->spans
-		p = s->start;
-		p -= (uintptr)h->arena_start >> PageShift;
-
-		// Allocate a new span for the first half.
-		t = runtime_FixAlloc_Alloc(&h->spanalloc);
-		runtime_MSpan_Init(t, s->start, npages/2);
-		t->limit = (uintptr)((t->start + npages/2) << PageShift);
-		t->state = MSpanInUse;
-		t->elemsize = npages << (PageShift - 1);
-		t->sweepgen = s->sweepgen;
-		if(t->elemsize <= MaxSmallSize) {
-			t->sizeclass = runtime_SizeToClass(t->elemsize);
-			t->ref = 1;
-		}
-
-		// the old span holds the second half.
-		s->start += npages/2;
-		s->npages = npages/2;
-		s->elemsize = npages << (PageShift - 1);
-		if(s->elemsize <= MaxSmallSize) {
-			s->sizeclass = runtime_SizeToClass(s->elemsize);
-			s->ref = 1;
-		}
-
-		// update span lookup table
-		for(i = p; i < p + npages/2; i++)
-			h->spans[i] = t;
-	}
-
-	// place the span into a new list
-	if(s->sizeclass > 0) {
-		runtime_unlock(h);
-		c = &h->central[s->sizeclass];
-		runtime_lock(c);
-		// swept spans are at the end of the list
-		runtime_MSpanList_InsertBack(&c->mempty, s);
-		runtime_unlock(c);
-	} else {
-		// Swept spans are at the end of lists.
-		if(s->npages < nelem(h->free))
-			runtime_MSpanList_InsertBack(&h->busy[s->npages], s);
-		else
-			runtime_MSpanList_InsertBack(&h->busylarge, s);
-		runtime_unlock(h);
-	}
-}
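
One note on the large-object policy in the deleted mheap.c above:
MHeap_AllocLarge best-fits against the freelarge list, taking the
smallest span with at least npage pages and breaking ties by the lowest
start address; MHeap_AllocLocked then trims any excess pages back into
the heap.  The following self-contained sketch reproduces just that
selection step over a simplified span struct (not the runtime's real
MSpan):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-in for MSpan: only the fields best-fit needs. */
    struct span {
        struct span *next, *prev;
        uintptr_t    start;    /* first page number */
        uintptr_t    npages;   /* length in pages   */
    };

    /* Smallest span on the circular list with at least npage pages;
       ties go to the lowest start address. */
    static struct span *
    best_fit(struct span *list, uintptr_t npage)
    {
        struct span *best = NULL;
        for (struct span *s = list->next; s != list; s = s->next) {
            if (s->npages < npage)
                continue;
            if (best == NULL
                || s->npages < best->npages
                || (s->npages == best->npages && s->start < best->start))
                best = s;
        }
        return best;
    }

    int main(void)
    {
        struct span head, a = {0, 0, 100, 8}, b = {0, 0, 40, 6}, c = {0, 0, 10, 6};

        /* Circular list: head <-> a <-> b <-> c. */
        head.next = &a; a.prev = &head;
        a.next = &b;    b.prev = &a;
        b.next = &c;    c.prev = &b;
        c.next = &head; head.prev = &c;

        struct span *s = best_fit(&head, 5);
        printf("best fit for 5 pages: start=%lu npages=%lu\n",
               (unsigned long)s->start, (unsigned long)s->npages);
        /* Prints start=10 npages=6: both 6-page spans fit, the lower address wins. */
        return 0;
    }
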
diff --git a/libgo/runtime/msize.c b/libgo/runtime/msize.c
deleted file mode 100644
index b82c709..0000000
--- a/libgo/runtime/msize.c
+++ /dev/null
@@ -1,177 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Malloc small size classes.
-//
-// See malloc.h for overview.
-//
-// The size classes are chosen so that rounding an allocation
-// request up to the next size class wastes at most 12.5% (1.125x).
-//
-// Each size class has its own page count that gets allocated
-// and chopped up when new objects of the size class are needed.
-// That page count is chosen so that chopping up the run of
-// pages into objects of the given size wastes at most 12.5% (1.125x)
-// of the memory.  It is not necessary that the cutoff here be
-// the same as above.
-//
-// The two sources of waste multiply, so the worst possible case
-// for the above constraints would be that allocations of some
-// size might have a 26.6% (1.266x) overhead.
-// In practice, only one of the wastes comes into play for a
-// given size (sizes < 512 waste mainly on the round-up,
-// sizes > 512 waste mainly on the page chopping).
-//
-// TODO(rsc): Compute max waste for any given size.
-
-#include "runtime.h"
-#include "arch.h"
-#include "malloc.h"
-
-int32 runtime_class_to_size[_NumSizeClasses];
-int32 runtime_class_to_allocnpages[_NumSizeClasses];
-
-// The SizeToClass lookup is implemented using two arrays,
-// one mapping sizes <= 1024 to their class and one mapping
-// sizes >= 1024 and <= MaxSmallSize to their class.
-// All objects are 8-aligned, so the first array is indexed by
-// the size divided by 8 (rounded up).  Objects >= 1024 bytes
-// are 128-aligned, so the second array is indexed by the
-// size divided by 128 (rounded up).  The arrays are filled in
-// by InitSizes.
-
-int8 runtime_size_to_class8[1024/8 + 1];
-int8 runtime_size_to_class128[(MaxSmallSize-1024)/128 + 1];
-
-int32
-runtime_SizeToClass(int32 size)
-{
-	if(size > MaxSmallSize)
-		runtime_throw("SizeToClass - invalid size");
-	if(size > 1024-8)
-		return runtime_size_to_class128[(size-1024+127) >> 7];
-	return runtime_size_to_class8[(size+7)>>3];
-}
-
-void
-runtime_InitSizes(void)
-{
-	int32 align, sizeclass, size, nextsize, n;
-	uint32 i;
-	uintptr allocsize, npages;
-	MStats *pmstats;
-
-	// Initialize the runtime_class_to_size table (and choose class sizes in the process).
-	runtime_class_to_size[0] = 0;
-	sizeclass = 1;	// 0 means no class
-	align = 8;
-	for(size = align; size <= MaxSmallSize; size += align) {
-		if((size&(size-1)) == 0) {	// bump alignment once in a while
-			if(size >= 2048)
-				align = 256;
-			else if(size >= 128)
-				align = size / 8;
-			else if(size >= 16)
-				align = 16;	// required for x86 SSE instructions, if we want to use them
-		}
-		if((align&(align-1)) != 0)
-			runtime_throw("InitSizes - bug");
-
-		// Make the allocnpages big enough that
-		// the leftover is less than 1/8 of the total,
-		// so wasted space is at most 12.5%.
-		allocsize = PageSize;
-		while(allocsize%size > allocsize/8)
-			allocsize += PageSize;
-		npages = allocsize >> PageShift;
-
-		// If the previous sizeclass chose the same
-		// allocation size and fit the same number of
-		// objects into the page, we might as well
-		// use just this size instead of having two
-		// different sizes.
-		if(sizeclass > 1 &&
-			(int32)npages == runtime_class_to_allocnpages[sizeclass-1] &&
-			allocsize/size == allocsize/runtime_class_to_size[sizeclass-1]) {
-			runtime_class_to_size[sizeclass-1] = size;
-			continue;
-		}
-
-		runtime_class_to_allocnpages[sizeclass] = npages;
-		runtime_class_to_size[sizeclass] = size;
-		sizeclass++;
-	}
-	if(sizeclass != _NumSizeClasses) {
-		runtime_printf("sizeclass=%d _NumSizeClasses=%d\n", sizeclass, _NumSizeClasses);
-		runtime_throw("InitSizes - bad _NumSizeClasses");
-	}
-
-	// Initialize the size_to_class tables.
-	nextsize = 0;
-	for (sizeclass = 1; sizeclass < _NumSizeClasses; sizeclass++) {
-		for(; nextsize < 1024 && nextsize <= runtime_class_to_size[sizeclass]; nextsize+=8)
-			runtime_size_to_class8[nextsize/8] = sizeclass;
-		if(nextsize >= 1024)
-			for(; nextsize <= runtime_class_to_size[sizeclass]; nextsize += 128)
-				runtime_size_to_class128[(nextsize-1024)/128] = sizeclass;
-	}
-
-	// Double-check SizeToClass.
-	if(0) {
-		for(n=0; n < MaxSmallSize; n++) {
-			sizeclass = runtime_SizeToClass(n);
-			if(sizeclass < 1 || sizeclass >= _NumSizeClasses || runtime_class_to_size[sizeclass] < n) {
-				runtime_printf("size=%d sizeclass=%d runtime_class_to_size=%d\n", n, sizeclass, runtime_class_to_size[sizeclass]);
-				runtime_printf("incorrect SizeToClass");
-				goto dump;
-			}
-			if(sizeclass > 1 && runtime_class_to_size[sizeclass-1] >= n) {
-				runtime_printf("size=%d sizeclass=%d runtime_class_to_size=%d\n", n, sizeclass, runtime_class_to_size[sizeclass]);
-				runtime_printf("SizeToClass too big");
-				goto dump;
-			}
-		}
-	}
-
-	// Copy out for statistics table.
-	pmstats = mstats();
-	for(i=0; i<nelem(runtime_class_to_size); i++)
-		pmstats->by_size[i].size = runtime_class_to_size[i];
-	return;
-
-dump:
-	if(1){
-		runtime_printf("NumSizeClasses=%d\n", _NumSizeClasses);
-		runtime_printf("runtime_class_to_size:");
-		for(sizeclass=0; sizeclass<_NumSizeClasses; sizeclass++)
-			runtime_printf(" %d", runtime_class_to_size[sizeclass]);
-		runtime_printf("\n\n");
-		runtime_printf("size_to_class8:");
-		for(i=0; i<nelem(runtime_size_to_class8); i++)
-			runtime_printf(" %d=>%d(%d)\n", i*8, runtime_size_to_class8[i],
-				runtime_class_to_size[runtime_size_to_class8[i]]);
-		runtime_printf("\n");
-		runtime_printf("size_to_class128:");
-		for(i=0; i<nelem(runtime_size_to_class128); i++)
-			runtime_printf(" %d=>%d(%d)\n", i*128, runtime_size_to_class128[i],
-				runtime_class_to_size[runtime_size_to_class128[i]]);
-		runtime_printf("\n");
-	}
-	runtime_throw("InitSizes failed");
-}
-
-// Returns size of the memory block that mallocgc will allocate if you ask for the size.
-uintptr
-runtime_roundupsize(uintptr size)
-{
-	if(size < MaxSmallSize) {
-		if(size <= 1024-8)
-			return runtime_class_to_size[runtime_size_to_class8[(size+7)>>3]];
-		else
-			return runtime_class_to_size[runtime_size_to_class128[(size-1024+127) >> 7]];
-	}
-	if(size + PageSize < size)
-		return size;
-	return ROUND(size, PageSize);
-}
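
To make the waste bound in the deleted msize.c header concrete: the two
sources of waste multiply, 1.125 * 1.125 = 1.265625, hence the quoted
26.6% worst case.  The toy program below rebuilds the two-array lookup
with an invented set of class sizes (they are not the runtime's tables
and do not themselves honor the 12.5% rule; they only exercise the
indexing), showing how the (size+7)>>3 and (size-1024+127)>>7 indices
land on the right class:

    #include <stdint.h>
    #include <stdio.h>

    enum { MaxSmall = 2048 };   /* toy limit, not the runtime's MaxSmallSize */

    /* Invented class sizes for the example; InitSizes computes the real ones. */
    static const int class_to_size[] =
        { 0, 8, 16, 32, 48, 64, 96, 128, 256, 512, 1024, 1152, 1408, 2048 };
    enum { NumClasses = sizeof(class_to_size) / sizeof(class_to_size[0]) };

    static int8_t size_to_class8[1024/8 + 1];
    static int8_t size_to_class128[(MaxSmall - 1024)/128 + 1];

    /* Mirrors the table-filling loop in the deleted InitSizes. */
    static void init_tables(void)
    {
        int nextsize = 0;
        for (int c = 1; c < NumClasses; c++) {
            for (; nextsize < 1024 && nextsize <= class_to_size[c]; nextsize += 8)
                size_to_class8[nextsize/8] = c;
            if (nextsize >= 1024)
                for (; nextsize <= class_to_size[c]; nextsize += 128)
                    size_to_class128[(nextsize - 1024)/128] = c;
        }
    }

    /* Same indexing as the deleted runtime_roundupsize, over the toy tables. */
    static int roundup(int size)
    {
        if (size > MaxSmall)
            return -1;              /* large object: handled by the page allocator */
        if (size <= 1024 - 8)
            return class_to_size[size_to_class8[(size + 7) >> 3]];
        return class_to_size[size_to_class128[(size - 1024 + 127) >> 7]];
    }

    int main(void)
    {
        static const int req[] = { 1, 24, 100, 1000, 1500 };

        init_tables();
        for (unsigned i = 0; i < sizeof(req)/sizeof(req[0]); i++)
            printf("request %4d -> class size %4d\n", req[i], roundup(req[i]));
        return 0;
    }
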
diff --git a/libgo/runtime/parfor.c b/libgo/runtime/parfor.c
deleted file mode 100644
index d64d74c..0000000
--- a/libgo/runtime/parfor.c
+++ /dev/null
@@ -1,202 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Parallel for algorithm.
-
-#include "runtime.h"
-#include "malloc.h"
-#include "arch.h"
-
-struct ParForThread
-{
-	// the thread's iteration space [32lsb, 32msb)
-	uint64 pos __attribute__((aligned(8)));
-	// stats
-	uint64 nsteal;
-	uint64 nstealcnt;
-	uint64 nprocyield;
-	uint64 nosyield;
-	uint64 nsleep;
-	byte pad[CacheLineSize];
-};
-
-ParFor*
-runtime_parforalloc(uint32 nthrmax)
-{
-	ParFor *desc;
-
-	// The ParFor object is followed by CacheLineSize padding
-	// and then nthrmax ParForThread.
-	desc = (ParFor*)runtime_mallocgc(sizeof(ParFor) + CacheLineSize + nthrmax * sizeof(ParForThread), 0, FlagNoInvokeGC);
-	desc->thr = (ParForThread*)((byte*)(desc+1) + CacheLineSize);
-	desc->nthrmax = nthrmax;
-	return desc;
-}
-
-void
-runtime_parforsetup(ParFor *desc, uint32 nthr, uint32 n, bool wait, const FuncVal *body)
-{
-	uint32 i, begin, end;
-	uint64 *pos;
-
-	if(desc == nil || nthr == 0 || nthr > desc->nthrmax || body == nil) {
-		runtime_printf("desc=%p nthr=%d count=%d body=%p\n", desc, nthr, n, body);
-		runtime_throw("parfor: invalid args");
-	}
-
-	desc->body = body;
-	desc->done = 0;
-	desc->nthr = nthr;
-	desc->thrseq = 0;
-	desc->cnt = n;
-	desc->wait = wait;
-	desc->nsteal = 0;
-	desc->nstealcnt = 0;
-	desc->nprocyield = 0;
-	desc->nosyield = 0;
-	desc->nsleep = 0;
-	for(i=0; i<nthr; i++) {
-		begin = (uint64)n*i / nthr;
-		end = (uint64)n*(i+1) / nthr;
-		pos = &desc->thr[i].pos;
-		if(((uintptr)pos & 7) != 0)
-			runtime_throw("parforsetup: pos is not aligned");
-		*pos = (uint64)begin | (((uint64)end)<<32);
-	}
-}
-
-void
-runtime_parfordo(ParFor *desc)
-{
-	ParForThread *me;
-	uint32 tid, begin, end, begin2, try, victim, i;
-	uint64 *mypos, *victimpos, pos, newpos;
-	const FuncVal *body;
-	void (*bodyfn)(ParFor*, uint32);
-	bool idle;
-
-	// Obtain 0-based thread index.
-	tid = runtime_xadd(&desc->thrseq, 1) - 1;
-	if(tid >= desc->nthr) {
-		runtime_printf("tid=%d nthr=%d\n", tid, desc->nthr);
-		runtime_throw("parfor: invalid tid");
-	}
-
-	body = desc->body;
-	bodyfn = (void (*)(ParFor*, uint32))(void*)body->fn;
-
-	// If single-threaded, just execute the for serially.
-	if(desc->nthr==1) {
-		for(i=0; i<desc->cnt; i++)
-		  __builtin_call_with_static_chain (bodyfn(desc, i), body);
-		return;
-	}
-
-	me = &desc->thr[tid];
-	mypos = &me->pos;
-	for(;;) {
-		for(;;) {
-			// While there is local work,
-			// bump low index and execute the iteration.
-			pos = runtime_xadd64(mypos, 1);
-			begin = (uint32)pos-1;
-			end = (uint32)(pos>>32);
-			if(begin < end) {
-				__builtin_call_with_static_chain(bodyfn(desc, begin), body);
-				continue;
-			}
-			break;
-		}
-
-		// Out of work, need to steal something.
-		idle = false;
-		for(try=0;; try++) {
-			// If we don't see any work for long enough,
-			// increment the done counter...
-			if(try > desc->nthr*4 && !idle) {
-				idle = true;
-				runtime_xadd(&desc->done, 1);
-			}
-			// ...if all threads have incremented the counter,
-			// we are done.
-			if(desc->done + !idle == desc->nthr) {
-				if(!idle)
-					runtime_xadd(&desc->done, 1);
-				goto exit;
-			}
-			// Choose a random victim for stealing.
-			victim = runtime_fastrand() % (desc->nthr-1);
-			if(victim >= tid)
-				victim++;
-			victimpos = &desc->thr[victim].pos;
-			for(;;) {
-				// See if it has any work.
-				pos = runtime_atomicload64(victimpos);
-				begin = (uint32)pos;
-				end = (uint32)(pos>>32);
-				if(begin+1 >= end) {
-					begin = end = 0;
-					break;
-				}
-				if(idle) {
-					runtime_xadd(&desc->done, -1);
-					idle = false;
-				}
-				begin2 = begin + (end-begin)/2;
-				newpos = (uint64)begin | (uint64)begin2<<32;
-				if(runtime_cas64(victimpos, pos, newpos)) {
-					begin = begin2;
-					break;
-				}
-			}
-			if(begin < end) {
-				// Has successfully stolen some work.
-				if(idle)
-					runtime_throw("parfor: should not be idle");
-				runtime_atomicstore64(mypos, (uint64)begin | (uint64)end<<32);
-				me->nsteal++;
-				me->nstealcnt += end-begin;
-				break;
-			}
-			// Backoff.
-			if(try < desc->nthr) {
-				// nothing
-			} else if (try < 4*desc->nthr) {
-				me->nprocyield++;
-				runtime_procyield(20);
-			// If a caller asked not to wait for the others, exit now
-			// (assume that most work is already done at this point).
-			} else if (!desc->wait) {
-				if(!idle)
-					runtime_xadd(&desc->done, 1);
-				goto exit;
-			} else if (try < 6*desc->nthr) {
-				me->nosyield++;
-				runtime_osyield();
-			} else {
-				me->nsleep++;
-				runtime_usleep(1);
-			}
-		}
-	}
-exit:
-	runtime_xadd64(&desc->nsteal, me->nsteal);
-	runtime_xadd64(&desc->nstealcnt, me->nstealcnt);
-	runtime_xadd64(&desc->nprocyield, me->nprocyield);
-	runtime_xadd64(&desc->nosyield, me->nosyield);
-	runtime_xadd64(&desc->nsleep, me->nsleep);
-	me->nsteal = 0;
-	me->nstealcnt = 0;
-	me->nprocyield = 0;
-	me->nosyield = 0;
-	me->nsleep = 0;
-}
-
-// For testing from Go.
-void
-runtime_parforiters(ParFor *desc, uintptr tid, uintptr *start, uintptr *end)
-{
-	*start = (uint32)desc->thr[tid].pos;
-	*end = (uint32)(desc->thr[tid].pos>>32);
-}
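
The work distribution in the deleted parfor.c hinges on packing each
thread's remaining iteration range [begin, end) into a single 64-bit
word (low half = begin, high half = end), so the owner claims one
iteration with an atomic add and a thief takes the upper half of a
victim's range with one compare-and-swap.  A minimal sketch of the
stealing step, written with C11 atomics instead of the runtime_cas64
and runtime_atomicload64 wrappers used above:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Pack a [begin, end) range into one word, low 32 bits = begin. */
    static inline uint64_t pack(uint32_t begin, uint32_t end)
    {
        return (uint64_t)begin | ((uint64_t)end << 32);
    }

    /* Try to steal the upper half of the victim's remaining range.
       On success *begin/*end describe the stolen range. */
    static bool steal_half(_Atomic uint64_t *victimpos, uint32_t *begin, uint32_t *end)
    {
        for (;;) {
            uint64_t pos = atomic_load(victimpos);
            uint32_t b = (uint32_t)pos;
            uint32_t e = (uint32_t)(pos >> 32);

            if (b + 1 >= e)
                return false;                 /* nothing worth stealing */

            uint32_t mid = b + (e - b) / 2;
            /* Leave [b, mid) with the victim, try to take [mid, e). */
            if (atomic_compare_exchange_weak(victimpos, &pos, pack(b, mid))) {
                *begin = mid;
                *end = e;
                return true;
            }
            /* The victim or another thief moved pos; reload and retry. */
        }
    }

On success the caller stores pack(*begin, *end) into its own position
word and returns to the owner loop, which is what runtime_parfordo does
after a successful runtime_cas64.
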
diff --git a/libgo/runtime/proc.c b/libgo/runtime/proc.c
index 55a8d23..cc56642 100644
--- a/libgo/runtime/proc.c
+++ b/libgo/runtime/proc.c
@@ -18,7 +18,6 @@
 #include "runtime.h"
 #include "arch.h"
 #include "defs.h"
-#include "malloc.h"
 #include "go-type.h"
 
 #ifdef USING_SPLIT_STACK
@@ -727,12 +726,12 @@
                 // 32-bit mode, the Go allocation space is all of
                 // memory anyhow.
 		if(sizeof(void*) == 8) {
-			void *p = runtime_SysAlloc(stacksize, &mstats()->other_sys);
+			void *p = runtime_sysAlloc(stacksize, &mstats()->other_sys);
 			if(p == nil)
 				runtime_throw("runtime: cannot allocate memory for goroutine stack");
 			*ret_stack = (byte*)p;
 		} else {
-			*ret_stack = runtime_mallocgc(stacksize, 0, FlagNoProfiling|FlagNoGC);
+			*ret_stack = runtime_mallocgc(stacksize, nil, false);
 			runtime_xadd(&runtime_stacks_sys, stacksize);
 		}
 		*ret_stacksize = (uintptr)stacksize;
diff --git a/libgo/runtime/runtime.h b/libgo/runtime/runtime.h
index b3bc03a..9f84f52 100644
--- a/libgo/runtime/runtime.h
+++ b/libgo/runtime/runtime.h
@@ -61,14 +61,9 @@
 typedef	struct	String		String;
 typedef	struct	FuncVal		FuncVal;
 typedef	struct	SigTab		SigTab;
-typedef	struct	mcache		MCache;
-typedef struct	FixAlloc	FixAlloc;
 typedef	struct	hchan		Hchan;
 typedef	struct	timer		Timer;
-typedef	struct	gcstats		GCStats;
 typedef	struct	lfnode		LFNode;
-typedef	struct	ParFor		ParFor;
-typedef	struct	ParForThread	ParForThread;
 typedef	struct	cgoMal		CgoMal;
 typedef	struct	PollDesc	PollDesc;
 typedef	struct	sudog		SudoG;
@@ -180,26 +175,6 @@
 };
 #endif
 
-// Parallel for descriptor.
-struct ParFor
-{
-	const FuncVal *body;		// executed for each element
-	uint32 done;			// number of idle threads
-	uint32 nthr;			// total number of threads
-	uint32 nthrmax;			// maximum number of threads
-	uint32 thrseq;			// thread id sequencer
-	uint32 cnt;			// iteration space [0, cnt)
-	bool wait;			// if true, wait while all threads finish processing,
-					// otherwise parfor may return while other threads are still working
-	ParForThread *thr;		// array of thread descriptors
-	// stats
-	uint64 nsteal __attribute__((aligned(8))); // force alignment for m68k
-	uint64 nstealcnt;
-	uint64 nprocyield;
-	uint64 nosyield;
-	uint64 nsleep;
-};
-
 extern bool runtime_copystack;
 
 /*
@@ -274,7 +249,6 @@
 int32	runtime_snprintf(byte*, int32, const char*, ...);
 #define runtime_mcmp(a, b, s) __builtin_memcmp((a), (b), (s))
 #define runtime_memmove(a, b, s) __builtin_memmove((a), (b), (s))
-void*	runtime_mal(uintptr);
 String	runtime_gostringnocopy(const byte*)
   __asm__ (GOSYM_PREFIX "runtime.gostringnocopy");
 void	runtime_schedinit(void)
@@ -300,12 +274,12 @@
   __asm__ (GOSYM_PREFIX "runtime.minit");
 void	runtime_signalstack(byte*, uintptr)
   __asm__ (GOSYM_PREFIX "runtime.signalstack");
-MCache*	runtime_allocmcache(void)
-  __asm__ (GOSYM_PREFIX "runtime.allocmcache");
-void	runtime_freemcache(MCache*)
-  __asm__ (GOSYM_PREFIX "runtime.freemcache");
 void	runtime_mallocinit(void)
   __asm__ (GOSYM_PREFIX "runtime.mallocinit");
+void*	runtime_mallocgc(uintptr, const Type*, bool)
+  __asm__ (GOSYM_PREFIX "runtime.mallocgc");
+void*	runtime_sysAlloc(uintptr, uint64*)
+  __asm__ (GOSYM_PREFIX "runtime.sysAlloc");
 void	runtime_mprofinit(void);
 #define runtime_getcallersp(p) __builtin_frame_address(0)
 void	runtime_mcall(FuncVal*)
@@ -345,8 +319,6 @@
 void	runtime_goparkunlock(Lock*, String, byte, intgo)
   __asm__ (GOSYM_PREFIX "runtime.goparkunlock");
 void	runtime_tsleep(int64, const char*);
-void	runtime_goexit1(void)
-  __asm__ (GOSYM_PREFIX "runtime.goexit1");
 void	runtime_entersyscall(int32)
   __asm__ (GOSYM_PREFIX "runtime.entersyscall");
 void	runtime_entersyscallblock(int32)
@@ -379,8 +351,6 @@
   __asm__(GOSYM_PREFIX "runtime.parsedebugvars");
 void	_rt0_go(void);
 G*	runtime_timejump(void);
-void	runtime_iterate_finq(void (*callback)(FuncVal*, void*, const FuncType*, const PtrType*))
-  __asm__(GOSYM_PREFIX "runtime.iterate_finq");
 
 void	runtime_callStopTheWorldWithSema(void)
   __asm__(GOSYM_PREFIX "runtime.callStopTheWorldWithSema");
@@ -447,18 +417,6 @@
   __asm__ (GOSYM_PREFIX "runtime.lfstackpop");
 
 /*
- * Parallel for over [0, n).
- * body() is executed for each iteration.
- * nthr - total number of worker threads.
- * if wait=true, threads return from parfor() when all work is done;
- * otherwise, threads can return while other threads are still finishing processing.
- */
-ParFor*	runtime_parforalloc(uint32 nthrmax);
-void	runtime_parforsetup(ParFor *desc, uint32 nthr, uint32 n, bool wait, const FuncVal *body);
-void	runtime_parfordo(ParFor *desc);
-void	runtime_parforiters(ParFor*, uintptr, uintptr*, uintptr*);
-
-/*
  * low level C-called
  */
 #define runtime_mmap mmap
@@ -515,22 +473,6 @@
 void	runtime_check(void)
   __asm__ (GOSYM_PREFIX "runtime.check");
 
-// A list of global variables that the garbage collector must scan.
-struct gcRoot {
-	void*   decl;
-	uintptr size;
-	uintptr ptrdata;
-	void*   gcdata; // Not yet used.
-};
-
-struct gcRootList {
-	struct gcRootList* next;
-	intgo              count;
-	struct gcRoot      roots[];
-};
-
-void	__go_register_gc_roots(struct gcRootList*);
-
 // Size of stack space allocated using Go's allocator.
 // This will be 0 when using split stacks, as in that case
 // the stacks are allocated by the splitstack library.
diff --git a/libgo/runtime/runtime_c.c b/libgo/runtime/runtime_c.c
index 63eb141..336b261 100644
--- a/libgo/runtime/runtime_c.c
+++ b/libgo/runtime/runtime_c.c
@@ -133,6 +133,22 @@
   return (intgo)errno;
 }
 
+uintptr getEnd(void)
+  __asm__ (GOSYM_PREFIX "runtime.getEnd");
+
+uintptr
+getEnd()
+{
+  uintptr end = 0;
+  uintptr *pend;
+
+  pend = &__go_end;
+  if (pend != nil) {
+    end = *pend;
+  }
+  return end;
+}
+
 // CPU-specific initialization.
 // Fetch CPUID info on x86.
 
@@ -151,3 +167,14 @@
 #endif
 #endif
 }
+
+// A publication barrier: a store/store barrier.
+
+void publicationBarrier(void)
+  __asm__ (GOSYM_PREFIX "runtime.publicationBarrier");
+
+void
+publicationBarrier()
+{
+  __atomic_thread_fence(__ATOMIC_RELEASE);
+}
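
The new publicationBarrier above is a plain release (store/store)
fence, the kind the allocator relies on when publishing a newly
initialized object: the stores that set up the object must become
visible before the store that publishes its pointer.  A generic
illustration of that pattern (not runtime code) using C11 atomics,
where atomic_thread_fence(memory_order_release) plays the role of the
__atomic_thread_fence(__ATOMIC_RELEASE) call in the patch:

    #include <stdatomic.h>
    #include <stddef.h>

    struct node { int payload; };

    static struct node pool[1];
    static struct node *_Atomic published;    /* read by other threads */

    /* Writer: initialize the object, then publish its pointer. */
    void publish(int v)
    {
        struct node *n = &pool[0];
        n->payload = v;                               /* plain initializing store */
        atomic_thread_fence(memory_order_release);    /* store/store barrier      */
        atomic_store_explicit(&published, n, memory_order_relaxed);
    }

    /* Reader: pair the release fence with an acquire fence after the load,
       so the initializing store above is guaranteed to be visible. */
    int read_latest(void)
    {
        struct node *n = atomic_load_explicit(&published, memory_order_relaxed);
        if (n == NULL)
            return -1;
        atomic_thread_fence(memory_order_acquire);
        return n->payload;
    }
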
diff --git a/libgo/runtime/stack.c b/libgo/runtime/stack.c
new file mode 100644
index 0000000..98f98fe
--- /dev/null
+++ b/libgo/runtime/stack.c
@@ -0,0 +1,103 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Stack scanning code for the garbage collector.
+
+#include "runtime.h"
+
+#ifdef USING_SPLIT_STACK
+
+extern void * __splitstack_find (void *, void *, size_t *, void **, void **,
+				 void **);
+
+extern void * __splitstack_find_context (void *context[10], size_t *, void **,
+					 void **, void **);
+
+#endif
+
+// Calling unwind_init in doscanstack only works if it does not do a
+// tail call to doscanstack1.
+#pragma GCC optimize ("-fno-optimize-sibling-calls")
+
+extern void scanstackblock(void *addr, uintptr size, void *gcw)
+  __asm__("runtime.scanstackblock");
+
+void doscanstack(G*, void*)
+  __asm__("runtime.doscanstack");
+
+static void doscanstack1(G*, void*)
+  __attribute__ ((noinline));
+
+// Scan gp's stack, passing stack chunks to scanstackblock.
+void doscanstack(G *gp, void* gcw) {
+	// Save registers on the stack, so that if we are scanning our
+	// own stack we will see them.
+	__builtin_unwind_init();
+
+	doscanstack1(gp, gcw);
+}
+
+// Scan gp's stack after saving registers.
+static void doscanstack1(G *gp, void *gcw) {
+#ifdef USING_SPLIT_STACK
+	void* sp;
+	size_t spsize;
+	void* next_segment;
+	void* next_sp;
+	void* initial_sp;
+
+	if (gp == runtime_g()) {
+		// Scanning our own stack.
+		sp = __splitstack_find(nil, nil, &spsize, &next_segment,
+				       &next_sp, &initial_sp);
+	} else {
+		// Scanning another goroutine's stack.
+		// The goroutine is usually asleep (the world is stopped).
+
+		// The exception is that if the goroutine is about to enter or might
+		// have just exited a system call, it may be executing code such
+		// as schedlock and may have needed to start a new stack segment.
+		// Use the stack segment and stack pointer at the time of
+		// the system call instead, since that won't change underfoot.
+		if(gp->gcstack != nil) {
+			sp = gp->gcstack;
+			spsize = gp->gcstacksize;
+			next_segment = gp->gcnextsegment;
+			next_sp = gp->gcnextsp;
+			initial_sp = gp->gcinitialsp;
+		} else {
+			sp = __splitstack_find_context((void**)(&gp->stackcontext[0]),
+						       &spsize, &next_segment,
+						       &next_sp, &initial_sp);
+		}
+	}
+	if(sp != nil) {
+		scanstackblock(sp, (uintptr)(spsize), gcw);
+		while((sp = __splitstack_find(next_segment, next_sp,
+					      &spsize, &next_segment,
+					      &next_sp, &initial_sp)) != nil)
+			scanstackblock(sp, (uintptr)(spsize), gcw);
+	}
+#else
+	M *mp;
+	byte* bottom;
+	byte* top;
+
+	if(gp == runtime_g()) {
+		// Scanning our own stack.
+		bottom = (byte*)&gp;
+	} else {
+		// Scanning another goroutine's stack.
+		// The goroutine is usually asleep (the world is stopped).
+		bottom = (byte*)gp->gcnextsp;
+		if(bottom == nil)
+			return;
+	}
+	top = (byte*)gp->gcinitialsp + gp->gcstacksize;
+	if(top > bottom)
+		scanstackblock(bottom, (uintptr)(top - bottom), gcw);
+	else
+		scanstackblock(top, (uintptr)(bottom - top), gcw);
+#endif
+}
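
The new stack.c only locates the stack chunks; each [sp, sp+size) block
is handed to runtime.scanstackblock, and those blocks are scanned
conservatively: any pointer-aligned word whose value could be a heap
pointer keeps its target alive.  The sketch below shows roughly what
such a conservative pass looks like; the fake_heap bounds check is a
made-up placeholder for the runtime's real span lookup, and in the real
collector a hit is recorded through the gcw argument rather than
printed:

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Fake heap bounds for the demo; the runtime consults its span table instead. */
    static char fake_heap[4096];

    static int points_into_heap(uintptr_t p)
    {
        return p >= (uintptr_t)fake_heap
            && p < (uintptr_t)(fake_heap + sizeof fake_heap);
    }

    /* Conservatively scan [addr, addr+size): report every pointer-aligned word
       whose value falls inside the heap as a potential pointer. */
    static void scan_block_conservative(void *addr, size_t size)
    {
        uintptr_t *wp = addr;

        for (size_t i = 0; i < size / sizeof(uintptr_t); i++)
            if (points_into_heap(wp[i]))
                printf("word %zu looks like a heap pointer: %p\n",
                       i, (void *)wp[i]);
    }

    int main(void)
    {
        void *frame[4] = { fake_heap + 128, (void *)0x1, NULL, fake_heap + 2048 };

        scan_block_conservative(frame, sizeof frame);
        return 0;
    }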