diff --git a/data/loader.py b/data/loader.py
index 31663c9c..2e36c800 100644
--- a/data/loader.py
+++ b/data/loader.py
@@ -18,7 +18,7 @@ class PrefetchLoader:
     def __init__(self,
             loader,
             rand_erase_prob=0.,
-            rand_erase_pp=False,
+            rand_erase_mode='const',
             mean=IMAGENET_DEFAULT_MEAN,
             std=IMAGENET_DEFAULT_STD):
         self.loader = loader
@@ -26,7 +26,7 @@ class PrefetchLoader:
         self.std = torch.tensor([x * 255 for x in std]).cuda().view(1, 3, 1, 1)
         if rand_erase_prob > 0.:
             self.random_erasing = RandomErasing(
-                probability=rand_erase_prob, per_pixel=rand_erase_pp)
+                probability=rand_erase_prob, mode=rand_erase_mode)
         else:
             self.random_erasing = None
 
@@ -68,7 +68,7 @@ def create_loader(
         is_training=False,
         use_prefetcher=True,
         rand_erase_prob=0.,
-        rand_erase_pp=False,
+        rand_erase_mode='const',
         interpolation='bilinear',
         mean=IMAGENET_DEFAULT_MEAN,
         std=IMAGENET_DEFAULT_STD,
@@ -121,7 +121,7 @@ def create_loader(
         loader = PrefetchLoader(
             loader,
             rand_erase_prob=rand_erase_prob if is_training else 0.,
-            rand_erase_pp=rand_erase_pp,
+            rand_erase_mode=rand_erase_mode,
             mean=mean,
             std=std)
 
diff --git a/data/random_erasing.py b/data/random_erasing.py
index 8434179c..43f5f57e 100644
--- a/data/random_erasing.py
+++ b/data/random_erasing.py
@@ -5,7 +5,10 @@ import math
 import torch
 
 
-def _get_patch(per_pixel, rand_color, patch_size, dtype=torch.float32, device='cuda'):
+def _get_pixels(per_pixel, rand_color, patch_size, dtype=torch.float32, device='cuda'):
+    # NOTE: CUDA illegal memory access errors have been seen coming from the normal_()
+    # paths. If this becomes a problem, flip the order so normal_() runs on the CPU,
+    # i.e. torch.empty(patch_size, dtype=dtype).normal_().to(device=device)
     if per_pixel:
         return torch.empty(
             patch_size, dtype=dtype, device=device).normal_()
@@ -27,20 +30,29 @@ class RandomErasing:
          sl: Minimum proportion of erased area against input image.
          sh: Maximum proportion of erased area against input image.
          min_aspect: Minimum aspect ratio of erased area.
-         per_pixel: random value for each pixel in the erase region, precedence over rand_color
-         rand_color: random color for whole erase region, 0 if neither this or per_pixel set
+         mode: pixel color mode, one of 'const', 'rand', or 'pixel'
+            'const' - erase block is constant color of 0 for all channels
+            'rand'  - erase block is same per-channel random (normal) color
+            'pixel' - erase block is per-pixel random (normal) color
     """
 
     def __init__(
             self,
             probability=0.5, sl=0.02, sh=1/3, min_aspect=0.3,
-            per_pixel=False, rand_color=False, device='cuda'):
+            mode='const', device='cuda'):
         self.probability = probability
         self.sl = sl
         self.sh = sh
         self.min_aspect = min_aspect
-        self.per_pixel = per_pixel  # per pixel random, bounded by [pl, ph]
-        self.rand_color = rand_color  # per block random, bounded by [pl, ph]
+        mode = mode.lower()
+        self.rand_color = False
+        self.per_pixel = False
+        if mode == 'rand':
+            self.rand_color = True  # per block random normal
+        elif mode == 'pixel':
+            self.per_pixel = True  # per pixel random normal
+        else:
+            assert not mode or mode == 'const'
         self.device = device
 
     def _erase(self, img, chan, img_h, img_w, dtype):
@@ -55,8 +67,9 @@ class RandomErasing:
             if w < img_w and h < img_h:
                 top = random.randint(0, img_h - h)
                 left = random.randint(0, img_w - w)
-                img[:, top:top + h, left:left + w] = _get_patch(
-                    self.per_pixel, self.rand_color, (chan, h, w), dtype=dtype, device=self.device)
+                img[:, top:top + h, left:left + w] = _get_pixels(
+                    self.per_pixel, self.rand_color, (chan, h, w),
+                    dtype=dtype, device=self.device)
                 break
 
     def __call__(self, input):
diff --git a/train.py b/train.py
index d3f7aaf1..644db18b 100644
--- a/train.py
+++ b/train.py
@@ -69,8 +69,8 @@ parser.add_argument('--drop', type=float, default=0.0, metavar='DROP',
                     help='Dropout rate (default: 0.1)')
 parser.add_argument('--reprob', type=float, default=0.4, metavar='PCT',
                     help='Random erase prob (default: 0.4)')
-parser.add_argument('--repp', action='store_true', default=False,
-                    help='Random erase per-pixel (default: False)')
+parser.add_argument('--remode', type=str, default='const',
+                    help='Random erase mode (default: "const")')
 parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                     help='learning rate (default: 0.01)')
 parser.add_argument('--warmup-lr', type=float, default=0.0001, metavar='LR',
@@ -223,7 +223,7 @@ def main():
         is_training=True,
         use_prefetcher=True,
         rand_erase_prob=args.reprob,
-        rand_erase_pp=args.repp,
+        rand_erase_mode=args.remode,
         interpolation='random',  # FIXME cleanly resolve this? data_config['interpolation'],
         mean=data_config['mean'],
         std=data_config['std'],