/*     */ package com.dlmu.wisdomST.operation;
/*     */ 
/*     */ import com.dlmu.wisdomST.core.ResultCollector;
/*     */ import com.dlmu.wisdomST.io.Text2;
/*     */ import com.dlmu.wisdomST.util.OperationsParams;
/*     */ import com.dlmu.wisdomST.util.Parallel;
/*     */ import java.io.IOException;
/*     */ import java.io.InputStream;
/*     */ import java.util.Arrays;
/*     */ import java.util.Iterator;
/*     */ import java.util.List;
/*     */ import java.util.Random;
/*     */ import java.util.Vector;
/*     */ import org.apache.hadoop.conf.Configuration;
/*     */ import org.apache.hadoop.fs.FSDataInputStream;
/*     */ import org.apache.hadoop.fs.FileStatus;
/*     */ import org.apache.hadoop.fs.FileSystem;
/*     */ import org.apache.hadoop.fs.Path;
/*     */ import org.apache.hadoop.io.Text;
/*     */ import org.apache.hadoop.io.compress.CodecPool;
/*     */ import org.apache.hadoop.io.compress.CompressionCodec;
/*     */ import org.apache.hadoop.io.compress.CompressionCodecFactory;
/*     */ import org.apache.hadoop.io.compress.CompressionInputStream;
/*     */ import org.apache.hadoop.io.compress.Decompressor;
/*     */ import org.apache.hadoop.io.compress.SplitCompressionInputStream;
/*     */ import org.apache.hadoop.io.compress.SplittableCompressionCodec;
/*     */ import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/*     */ import org.apache.hadoop.util.GenericOptionsParser;
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ public class LocalSampler
/*     */ {
/**
 * Turns a list of paths into whole-file splits and delegates to the
 * split-based {@code sampleLocal} overload.
 * <p>
 * A path for which the file system reports {@code isFile} contributes one
 * split that covers the entire file. Any other path is listed, and every
 * direct entry that is not a directory contributes one split; nested
 * directories are not descended into.
 *
 * @param files        files and/or directories to read from
 * @param ratioOrCount forwarded unchanged to the split-based overload
 * @param output       forwarded unchanged to the split-based overload
 * @param conf         configuration used to resolve the file system of each path
 * @return whatever the split-based overload returns
 * @throws IOException          if a file system call fails
 * @throws InterruptedException propagated from the split-based overload
 */
public static long sampleLocal(Path[] files, float ratioOrCount, ResultCollector<Text> output, Configuration conf) throws IOException, InterruptedException {
  List<FileSplit> wholeFileSplits = new Vector<FileSplit>();

  for (Path path : files) {
    FileSystem fileSystem = path.getFileSystem(conf);

    if (!fileSystem.isFile(path)) {
      // Not a plain file: take the plain files found directly underneath it.
      for (FileStatus child : fileSystem.listStatus(path)) {
        if (child.isDirectory()) {
          continue;
        }
        wholeFileSplits.add(new FileSplit(child.getPath(), 0L, child.getLen(), new String[0]));
      }
      continue;
    }

    // A plain file: one split from offset 0 to its full length.
    long fileLength = fileSystem.getFileStatus(path).getLen();
    wholeFileSplits.add(new FileSplit(path, 0L, fileLength, new String[0]));
  }

  FileSplit[] splitArray = wholeFileSplits.toArray(new FileSplit[wholeFileSplits.size()]);
  return sampleLocal(splitArray, ratioOrCount, output, conf);
}
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */   
/**
 * Samples lines from the given splits and returns how many were sampled.
 * <p>
 * {@code ratioOrCount} is treated as a record count when it is strictly
 * greater than 1 and as a sampling ratio otherwise (so exactly {@code 1}
 * means "ratio 1.0", not "one record"). In count mode the requested number
 * of samples is first distributed over the splits by drawing random byte
 * offsets in the concatenation of all splits, so longer splits receive
 * proportionally more samples. Ranges of split indexes are then handed to
 * {@code Parallel.forEach}, and each split is sampled with the seed
 * {@code seed + splitIndex}.
 *
 * @param files        splits to sample from
 * @param ratioOrCount record count (if &gt; 1) or sampling ratio
 * @param output       receives every sampled line; passed on to the per-split samplers
 * @param conf         configuration; the {@code "seed"} key, if set, makes the sample repeatable
 * @return total number of sampled lines over all splits
 * @throws IOException          declared for callers; per-split I/O failures surface as
 *                              {@link RuntimeException} carrying the original exception as cause
 * @throws InterruptedException propagated from {@code Parallel.forEach}
 */
public static long sampleLocal(final FileSplit[] files, final float ratioOrCount, final ResultCollector<Text> output, final Configuration conf) throws IOException, InterruptedException {
  // Prefix sums of split lengths: split i occupies [fileStartOffset[i], fileStartOffset[i + 1]).
  final long[] fileStartOffset = new long[files.length + 1];
  for (int i = 0; i < files.length; i++) {
    fileStartOffset[i + 1] = fileStartOffset[i] + files[i].getLength();
  }
  final long totalLength = fileStartOffset[files.length];

  // Resolve the seed once so that an unseeded run still uses one consistent
  // base value for the distribution step and for every split.
  final long baseSeed = conf.getLong("seed", System.currentTimeMillis());
  final boolean sampleByCount = ratioOrCount > 1.0F;

  final int[] sampleSizePerFile = new int[files.length];
  // With no bytes at all there is nothing to distribute (and the modulo below would divide by zero).
  if (sampleByCount && totalLength > 0L) {
    Random rand = new Random(baseSeed);
    for (int j = 0; j < ratioOrCount; j++) {
      // Math.abs(Long.MIN_VALUE) is still negative; map that single value to 0.
      long drawn = rand.nextLong();
      long nonNegative = (drawn == Long.MIN_VALUE) ? 0L : Math.abs(drawn);
      long sampleOffset = nonNegative % totalLength;

      int iFile = Arrays.binarySearch(fileStartOffset, sampleOffset);
      if (iFile < 0) {
        // Not an exact hit: (-insertionPoint - 1) was returned; the owning split precedes the insertion point.
        iFile = -iFile - 2;
      }
      // An exact hit may land on a zero-length split that shares its start offset
      // with the split actually containing the byte; move forward to that one.
      while (files[iFile].getLength() == 0L) {
        iFile++;
      }
      sampleSizePerFile[iFile]++;
    }
  }

  List<Integer> actualSampleSizes = Parallel.forEach(files.length, new Parallel.RunnableRange<Integer>() {
    @Override
    public Integer run(int i1, int i2) {
      int sampledLines = 0;
      for (int iFile = i1; iFile < i2; iFile++) {
        try {
          long randomSeed = baseSeed + iFile;
          if (sampleByCount) {
            sampledLines += LocalSampler.sampleFileSplitByCount(files[iFile], conf,
                sampleSizePerFile[iFile], randomSeed, output);
          } else {
            sampledLines += LocalSampler.sampleFileSplitByRatio(files[iFile], conf,
                ratioOrCount, randomSeed, output);
          }
        } catch (IOException e) {
          // Keep the original exception as the cause so the real failure is not lost.
          throw new RuntimeException("Error while sampling file " + files[iFile], e);
        }
      }
      return Integer.valueOf(sampledLines);
    }
  });

  // Accumulate in a long: the method returns long and the per-range counts are ints.
  long totalSampledLines = 0L;
  for (Integer actualSampleSize : actualSampleSizes) {
    totalSampledLines += actualSampleSize.intValue();
  }
  return totalSampledLines;
}
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */   
/**
 * Reads a stream line by line and keeps each line independently with
 * probability {@code ratio}.
 * <p>
 * Reading stops at the first call to {@code readUntilEOL} that produces an
 * empty line.
 *
 * @param in     stream to read lines from
 * @param ratio  probability of keeping any single line
 * @param seed   seed for the random number generator
 * @param output receives each kept line; may be {@code null} to only count
 * @return number of lines that were kept
 * @throws IOException if reading from the stream fails
 */
private static int sampleStreamByRatio(InputStream in, double ratio, long seed, ResultCollector<Text> output) throws IOException {
  final Random random = new Random(seed);
  final Text2 line = new Text2();
  int kept = 0;

  // The same buffer is reused for every line; it is cleared after each decision.
  for (int length = readUntilEOL(in, (Text) line); length > 0; length = readUntilEOL(in, (Text) line)) {
    boolean keep = random.nextDouble() < ratio;
    if (keep) {
      kept++;
      if (output != null) {
        output.collect(line);
      }
    }
    line.clear();
  }

  return kept;
}
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */   
/**
 * Draws up to {@code count} lines from a stream using reservoir sampling.
 * <p>
 * The first {@code count} lines fill the reservoir. Each later line (the
 * {@code (k + 1)}-th line overall) replaces a uniformly chosen reservoir
 * entry with probability {@code count / (k + 1)}. Reading stops at end of
 * stream, or once the accumulated line lengths reach {@code streamLength}.
 * <p>
 * NOTE(review): the running position is the sum of the values returned by
 * {@code readUntilEOL}, i.e. line lengths without terminators, while callers
 * pass a byte length for {@code streamLength}. The bound is kept as it was;
 * confirm whether it is still wanted now that end of stream ends the loop.
 *
 * @param in           stream to read lines from
 * @param streamLength upper bound on the accumulated line lengths to read
 * @param count        maximum number of lines to return; values &lt;= 0 yield no samples
 * @param seed         seed for the random number generator
 * @param output       receives the sampled lines after reading finishes; may be {@code null}
 * @return number of lines in the sample, at most {@code count}
 * @throws IOException if reading from the stream fails
 */
private static int sampleStreamByCount(InputStream in, long streamLength, int count, long seed, ResultCollector<Text> output) throws IOException {
  // Nothing requested: avoid Random.nextInt(0), which throws, and avoid reading the stream.
  if (count <= 0) {
    return 0;
  }

  Random rand = new Random(seed);
  Text[] sample = new Text[count];
  Text scratch = new Text2();
  long pos = 0L;
  int k = 0; // number of lines read so far

  while (pos < streamLength) {
    // Read into a scratch buffer first so that hitting end of stream never
    // overwrites a line that is already part of the sample.
    scratch.clear();
    int lineLength = readUntilEOL(in, scratch);
    if (lineLength == 0) {
      break; // end of stream
    }
    pos += lineLength;

    if (k < count) {
      // Reservoir not full yet: keep the line unconditionally.
      sample[k] = scratch;
      scratch = new Text2();
    } else if (rand.nextInt(k + 1) < count) {
      // Accept with probability count / (k + 1) and evict a random entry.
      // The evicted buffer becomes the next scratch buffer.
      int victim = rand.nextInt(count);
      Text evicted = sample[victim];
      sample[victim] = scratch;
      scratch = evicted;
    }
    k++;
  }

  int sampleSize = Math.min(k, count);
  if (output != null) {
    for (int i = 0; i < sampleSize; i++) {
      output.collect(sample[i]);
    }
  }
  return sampleSize;
}
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */   
/**
 * Samples approximately {@code count} lines from one file split.
 * <p>
 * If a compression codec is registered for the file's path, the file is
 * decompressed and sampled sequentially through {@code sampleStreamByCount}.
 * Otherwise {@code count} random byte offsets inside the split are
 * generated and sorted; for each offset the stream is positioned there, the
 * remainder of the line at that position is discarded, and the following
 * line is taken as a sample. Offsets that fall near the end of the data may
 * yield no line, so fewer than {@code count} lines can be returned.
 *
 * @param file   split to sample from
 * @param conf   configuration used to resolve the codec and the file system
 * @param count  number of lines requested; values &lt;= 0 return 0 without opening the file
 * @param seed   seed for the random number generator
 * @param output receives each sampled line; may be {@code null} to only count
 * @return number of lines sampled
 * @throws IOException if opening or reading the file fails
 */
private static int sampleFileSplitByCount(FileSplit file, Configuration conf, int count, long seed, ResultCollector<Text> output) throws IOException {
  // Splits that were allotted no samples need not be opened at all.
  if (count <= 0) {
    return 0;
  }

  InputStream in = null; // outermost stream; closing it also closes what it wraps
  Decompressor decompressor = null;
  try {
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file.getPath());

    FileSystem fs = file.getPath().getFileSystem(conf);
    FSDataInputStream rawIn = fs.open(file.getPath());
    in = rawIn;

    if (codec != null) {
      decompressor = CodecPool.getDecompressor(codec);

      if (codec instanceof SplittableCompressionCodec) {
        SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
            rawIn, decompressor, file.getStart(), file.getStart() + file.getLength(),
            SplittableCompressionCodec.READ_MODE.BYBLOCK);
        in = cIn;

        // The codec may move the split boundaries; use its adjusted range.
        long start = cIn.getAdjustedStart();
        long end = cIn.getAdjustedEnd();
        return sampleStreamByCount(in, end - start, count, seed, output);
      }

      // Not splittable: decompress the raw file stream and let the sampler run without a length bound.
      in = codec.createInputStream(rawIn, decompressor);
      return sampleStreamByCount(in, Long.MAX_VALUE, count, seed, output);
    }

    // Uncompressed file. An empty split has no offsets to draw from
    // (and the modulo below would divide by zero).
    long splitLength = file.getLength();
    if (splitLength <= 0L) {
      return 0;
    }

    // Generate the offsets up front and sort them so the file is read front to back.
    Random rand = new Random(seed);
    long[] sampleOffsets = new long[count];
    for (int i = 0; i < count; i++) {
      // Math.abs(Long.MIN_VALUE) is still negative; map that single value to 0.
      long drawn = rand.nextLong();
      long nonNegative = (drawn == Long.MIN_VALUE) ? 0L : Math.abs(drawn);
      sampleOffsets[i] = nonNegative % splitLength + file.getStart();
    }
    Arrays.sort(sampleOffsets);

    int sampledLines = 0;
    Text line = new Text2();
    for (int i = 0; i < count; i++) {
      // Use the stream's own position: readUntilEOL returns the line length, not the
      // number of bytes consumed, so a hand-maintained counter would drift.
      // Never move backwards when an earlier read already passed this offset.
      if (sampleOffsets[i] > rawIn.getPos()) {
        rawIn.seek(sampleOffsets[i]);
      }

      // The offset most likely falls inside a line; discard the rest of it.
      line.clear();
      readUntilEOL(rawIn, line);

      // The next complete line is the sample. An empty result means end of data.
      line.clear();
      if (readUntilEOL(rawIn, line) > 0) {
        sampledLines++;
        if (output != null) {
          output.collect(line);
        }
      }
    }
    return sampledLines;
  } finally {
    if (in != null) {
      in.close();
    }
    if (decompressor != null) {
      CodecPool.returnDecompressor(decompressor);
    }
  }
}
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */   
/**
 * Samples roughly a fraction {@code ratio} of the lines of one file split.
 * <p>
 * If a compression codec is registered for the file's path, the file is
 * decompressed and every line is kept with probability {@code ratio} through
 * {@code sampleStreamByRatio}. Otherwise an approximate two-step scheme is used:
 * <ol>
 *   <li>up to 10 lines at the beginning of the split are read, each kept
 *       with probability {@code ratio}; their average size gives an estimate
 *       of the number of lines in the split;</li>
 *   <li>the remaining number of samples needed to reach
 *       {@code ratio * estimatedLines} is drawn at random byte offsets in the
 *       part of the split that has not been read yet. For each offset the
 *       rest of the line at that position is discarded and the following
 *       line is taken.</li>
 * </ol>
 *
 * @param file   split to sample from
 * @param conf   configuration used to resolve the codec and the file system
 * @param ratio  fraction of lines to sample
 * @param seed   seed for the random number generator
 * @param output receives each sampled line; may be {@code null} to only count
 * @return number of lines sampled
 * @throws IOException if opening or reading the file fails
 */
private static int sampleFileSplitByRatio(FileSplit file, Configuration conf, float ratio, long seed, ResultCollector<Text> output) throws IOException {
  InputStream in = null; // outermost stream; closing it also closes what it wraps
  Decompressor decompressor = null;
  Text line = new Text2();

  try {
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file.getPath());
    FileSystem fs = file.getPath().getFileSystem(conf);
    FSDataInputStream rawIn = fs.open(file.getPath());
    in = rawIn;

    if (codec != null) {
      decompressor = CodecPool.getDecompressor(codec);

      if (codec instanceof SplittableCompressionCodec) {
        SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
            rawIn, decompressor, file.getStart(), file.getStart() + file.getLength(),
            SplittableCompressionCodec.READ_MODE.BYBLOCK);
        in = cIn;

        // A split that does not start at the beginning of the file most likely
        // starts inside a line; discard that partial line.
        if (file.getStart() > 0L) {
          readUntilEOL(cIn, line);
        }
      } else {
        // Not splittable: decompress the raw file stream from its beginning.
        in = codec.createInputStream(rawIn, decompressor);
      }
      return sampleStreamByRatio(in, ratio, seed, output);
    }

    // Uncompressed file: approximate but cheaper scheme that avoids reading the whole split.
    // Positions come from the stream itself, because readUntilEOL returns the
    // line length rather than the number of bytes it consumed.
    final long end = file.getStart() + file.getLength();
    if (file.getStart() > 0L) {
      rawIn.seek(file.getStart());
      readUntilEOL(rawIn, line); // discard the partial first line
    }
    final long probeStart = rawIn.getPos();

    Random rand = new Random(seed);
    int sampledLines = 0;

    // Step 1: read the first few lines, sampling them by ratio, to estimate the line size.
    int probedLines = 0;
    while (probedLines < 10 && rawIn.getPos() < end) {
      line.clear();
      if (readUntilEOL(rawIn, line) == 0) {
        break; // end of data
      }
      probedLines++;
      if (rand.nextFloat() < ratio) {
        sampledLines++;
        if (output != null) {
          output.collect(line);
        }
      }
    }

    final long remainderStart = rawIn.getPos();
    final long remainderLength = end - remainderStart;
    // Nothing was read, or the probe already covered the whole split: done.
    if (probedLines == 0 || remainderLength <= 0L) {
      return sampledLines;
    }

    // Average over the lines actually read; never below 1 so the division is safe.
    long averageLineSize = Math.max(1L, (remainderStart - probeStart) / probedLines);
    int count = Math.round(ratio * (float) file.getLength() / averageLineSize) - sampledLines;
    // The probe may already have produced at least the expected number of samples.
    if (count <= 0) {
      return sampledLines;
    }

    // Step 2: draw the remaining samples from the part of the split not read yet,
    // in sorted order so the file is read front to back.
    long[] sampleOffsets = new long[count];
    for (int j = 0; j < count; j++) {
      // Math.abs(Long.MIN_VALUE) is still negative; map that single value to 0.
      long drawn = rand.nextLong();
      long nonNegative = (drawn == Long.MIN_VALUE) ? 0L : Math.abs(drawn);
      sampleOffsets[j] = nonNegative % remainderLength + remainderStart;
    }
    Arrays.sort(sampleOffsets);

    for (int j = 0; j < count; j++) {
      // Never move backwards when an earlier read already passed this offset.
      if (sampleOffsets[j] > rawIn.getPos()) {
        rawIn.seek(sampleOffsets[j]);
      }

      // The offset most likely falls inside a line; discard the rest of it.
      line.clear();
      readUntilEOL(rawIn, line);

      // The next complete line is the sample. An empty result means end of data.
      line.clear();
      if (readUntilEOL(rawIn, line) > 0) {
        sampledLines++;
        if (output != null) {
          output.collect(line);
        }
      }
    }
    return sampledLines;
  } finally {
    if (in != null) {
      in.close();
    }
    if (decompressor != null) {
      CodecPool.returnDecompressor(decompressor);
    }
  }
}
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */   
/**
 * Appends the next non-empty line of {@code in} to {@code line}.
 * <p>
 * Any line-feed or carriage-return bytes at the current position are skipped
 * first, so blank lines are never returned. Bytes are then appended up to,
 * but not including, the next line-feed or carriage-return byte or the end
 * of the stream; the terminating byte is consumed.
 * <p>
 * The return value is the length of {@code line} after the call, which
 * includes anything {@code line} already held. It is not the number of bytes
 * consumed from the stream. Callers that pass an empty {@code line} can treat
 * a return value of 0 as end of stream.
 *
 * @param in   stream to read from, one byte at a time
 * @param line buffer the line bytes are appended to; not cleared by this method
 * @return {@code line.getLength()} after appending
 * @throws IOException if reading from the stream fails
 */
public static int readUntilEOL(InputStream in, Text line) throws IOException {
  final byte[] bufferBytes = new byte[1024];
  int bufferLength = 0;

  // Keep the value returned by read() as an int: -1 means end of stream, whereas a
  // data byte 0xFF becomes -1 only after a cast to byte and must not end the line.
  int next = in.read();

  // Skip terminators left over from the previous line (this also skips blank lines).
  while (next == '\n' || next == '\r') {
    next = in.read();
  }

  // Collect bytes until the next terminator or end of stream.
  while (next != -1 && next != '\n' && next != '\r') {
    if (bufferLength == bufferBytes.length) {
      // Buffer full: flush it and keep reading the same line.
      line.append(bufferBytes, 0, bufferLength);
      bufferLength = 0;
    }
    bufferBytes[bufferLength++] = (byte) next;
    next = in.read();
  }

  if (bufferLength > 0) {
    line.append(bufferBytes, 0, bufferLength);
  }
  return line.getLength();
}
/*     */   
/**
 * Prints the command-line help to standard output: one line per supported
 * parameter, followed by the generic Hadoop option usage.
 */
private static void printUsage() {
  final String[] usageLines = {
    "Reads a random sample of an input file. Sample is written to stdout",
    "Parameters (* marks required parameters):",
    "<input file> - (*) Path to input file",
    "shape:<s> - Type of shapes stored in the file",
    "outshape:<s> - Shapes to write to output",
    "ratio:<r> - ratio of random sample to read [0, 1]",
    "count:<s> - approximate number of records in the sample",
    "size:<s> - approximate size of the sample in bytes",
    "seed:<s> - random seed to use while reading the sample"
  };
  for (String usageLine : usageLines) {
    System.out.println(usageLine);
  }
  GenericOptionsParser.printGenericCommandUsage(System.out);
}
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */ 
/*     */   
/**
 * Command-line entry point: samples the input paths and prints every sampled
 * line, followed by a one-line summary, to standard output.
 * <p>
 * The {@code ratio} parameter is used when present; otherwise the
 * {@code count} parameter is used, defaulting to 0. If the input check
 * fails, the usage text is printed and the process exits with status 1.
 *
 * @param args command-line arguments, parsed by {@code GenericOptionsParser}
 * @throws IOException          if sampling fails
 * @throws InterruptedException if sampling is interrupted
 */
public static void main(String[] args) throws IOException, InterruptedException {
  final OperationsParams params = new OperationsParams(new GenericOptionsParser(args), false);
  final Path[] inputFiles = params.getPaths();

  if (!params.checkInput()) {
    printUsage();
    System.exit(1);
  }

  // Collector that echoes each sampled record to standard output.
  final ResultCollector<Text> stdoutCollector = new ResultCollector<Text>() {
    public void collect(Text value) {
      System.out.println(value);
    }
  };

  // "ratio" wins when given; "count" (default 0) is the fallback.
  final int fallbackCount = params.getInt("count", 0);
  final float sampleRatioOrCount = params.getFloat("ratio", fallbackCount);

  final long startedAt = System.currentTimeMillis();
  final long lines = sampleLocal(inputFiles, sampleRatioOrCount, stdoutCollector, (Configuration) params);
  final long elapsedMillis = System.currentTimeMillis() - startedAt;

  System.out.println("Sampled " + lines + " lines in " + elapsedMillis + " millis");
}
/*     */ }


/* Location:              E:\大连公交集团-项目文档\公交项目相关文档\田\wisdomST-0.0.1.jar!\com\dlmu\wisdomST\operation\LocalSampler.class
 * Java compiler version: 6 (50.0)
 * JD-Core Version:       1.1.3
 */