MATSIM
UnicodeInputStream.java
Go to the documentation of this file.
1 /* *********************************************************************** *
2  * project: org.matsim.*
3  * *
4  * *********************************************************************** *
5  * *
6  * copyright : (C) 2012 by the members listed in the COPYING, *
7  * LICENSE and WARRANTY file. *
8  * email : info at matsim dot org *
9  * *
10  * *********************************************************************** *
11  * *
12  * This program is free software; you can redistribute it and/or modify *
13  * it under the terms of the GNU General Public License as published by *
14  * the Free Software Foundation; either version 2 of the License, or *
15  * (at your option) any later version. *
16  * See also COPYING, LICENSE and WARRANTY file *
17  * *
18  * *********************************************************************** */
19 
20 package org.matsim.core.utils.io;
21 
22 // based on code from http://stackoverflow.com/questions/1835430
23 
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.PushbackInputStream;
27 
/**
 * An {@link InputStream} wrapper that inspects the first bytes of the wrapped stream for a
 * Unicode byte order mark (BOM) and, optionally, removes it from the data handed to readers.
 *
 * <p>Up to four bytes are read from the wrapped stream during construction. The bytes that are
 * not part of a skipped BOM are pushed back, so subsequent reads see them again. The detected
 * BOM (or {@link BOM#NONE}) is available through {@link #getBOM()}.
 *
 * <p>All other stream operations are delegated to an internal {@link PushbackInputStream}
 * wrapped around the supplied stream.
 */
public class UnicodeInputStream extends InputStream {

    /** Length of the longest recognised BOM (UTF-32); also the size of the push-back buffer. */
    private static final int MAX_BOM_LENGTH = 4;

    private final PushbackInputStream in;
    private final BOM bom;

    /**
     * Wraps {@code inputStream} and skips a leading BOM if one is present.
     *
     * @param inputStream the stream to wrap, must not be {@code null}
     * @throws NullPointerException if {@code inputStream} is {@code null}
     * @throws IOException if reading the leading bytes fails
     */
    public UnicodeInputStream(final InputStream inputStream) throws NullPointerException, IOException {
        this(inputStream, true);
    }

    /**
     * Wraps {@code inputStream}, detects a leading BOM and skips it if requested.
     *
     * @param inputStream the stream to wrap, must not be {@code null}
     * @param skipBom {@code true} to drop the detected BOM bytes from the stream,
     *        {@code false} to leave them in place for the reader
     * @throws NullPointerException if {@code inputStream} is {@code null}
     * @throws IOException if reading the leading bytes fails
     */
    public UnicodeInputStream(final InputStream inputStream, final boolean skipBom) throws NullPointerException, IOException {
        if (inputStream == null) {
            throw new NullPointerException("invalid input stream: null is not allowed");
        }

        in = new PushbackInputStream(inputStream, MAX_BOM_LENGTH);

        final byte[] head = new byte[MAX_BOM_LENGTH];
        // A single read() may legally deliver fewer bytes than requested even though more data
        // follows; keep reading so that a BOM split over several reads is still recognised.
        final int headLength = readLeadingBytes(in, head);

        bom = detectBom(head, headLength);

        // Push back everything that was peeked at, except the BOM itself when it is to be skipped.
        final int consumed = skipBom ? bom.bytes.length : 0;
        if (headLength > consumed) {
            in.unread(head, consumed, headLength - consumed);
        }
    }

    /**
     * Fills {@code buffer} from {@code source} until it is full or the stream ends.
     *
     * @return the number of bytes actually stored in {@code buffer} (possibly 0)
     */
    private static int readLeadingBytes(final InputStream source, final byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            final int count = source.read(buffer, total, buffer.length - total);
            if (count <= 0) {
                // -1 signals end of stream; 0 is treated the same way to avoid spinning forever.
                break;
            }
            total += count;
        }
        return total;
    }

    /**
     * Matches the first {@code length} bytes of {@code head} against the known BOMs.
     *
     * <p>The UTF-32 marks are tested before the UTF-16 ones because the UTF-32 little-endian
     * BOM starts with the same two bytes as the UTF-16 little-endian BOM.
     */
    private static BOM detectBom(final byte[] head, final int length) {
        final BOM[] candidates = { BOM.UTF_32_LE, BOM.UTF_32_BE, BOM.UTF_8, BOM.UTF_16_LE, BOM.UTF_16_BE };
        for (final BOM candidate : candidates) {
            if (startsWith(head, length, candidate.bytes)) {
                return candidate;
            }
        }
        return BOM.NONE;
    }

    /** Returns whether the first {@code length} bytes of {@code data} begin with {@code prefix}. */
    private static boolean startsWith(final byte[] data, final int length, final byte[] prefix) {
        if (length < prefix.length) {
            return false;
        }
        for (int i = 0; i < prefix.length; i++) {
            if (data[i] != prefix[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the BOM that was found at the start of the wrapped stream.
     *
     * @return the detected BOM, or {@link BOM#NONE} if the stream did not start with one
     */
    public final BOM getBOM() {
        return bom;
    }

    /** Reads the next byte from the underlying push-back stream. */
    @Override
    public int read() throws IOException {
        return in.read();
    }

    /** Reads up to {@code b.length} bytes into {@code b} from the underlying push-back stream. */
    @Override
    public int read(final byte b[]) throws IOException, NullPointerException {
        return in.read(b, 0, b.length);
    }

    /** Reads up to {@code len} bytes into {@code b} at {@code off} from the underlying push-back stream. */
    @Override
    public int read(final byte b[], final int off, final int len)
            throws IOException, NullPointerException {
        return in.read(b, off, len);
    }

    /** Delegates to the underlying push-back stream's {@code skip}. */
    @Override
    public long skip(final long n) throws IOException {
        return in.skip(n);
    }

    /** Delegates to the underlying push-back stream's {@code available}. */
    @Override
    public int available() throws IOException {
        return in.available();
    }

    /** Closes the underlying push-back stream. */
    @Override
    public void close() throws IOException {
        in.close();
    }

    /** Delegates to the underlying push-back stream's {@code mark}. */
    @Override
    public synchronized void mark(final int readlimit) {
        in.mark(readlimit);
    }

    /** Delegates to the underlying push-back stream's {@code reset}. */
    @Override
    public synchronized void reset() throws IOException {
        in.reset();
    }

    /** Reports whatever the underlying push-back stream reports for {@code markSupported}. */
    @Override
    public boolean markSupported() {
        return in.markSupported();
    }

    /**
     * The byte order marks this stream can recognise, plus {@link #NONE} for "no BOM found".
     * Instances are only available through the constants declared here.
     */
    public static final class BOM {
        public static final BOM NONE = new BOM(new byte[] {}, "NONE");
        public static final BOM UTF_8 = new BOM(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF }, "UTF-8");
        public static final BOM UTF_16_LE = new BOM(new byte[] { (byte) 0xFF, (byte) 0xFE }, "UTF-16 little-endian");
        public static final BOM UTF_16_BE = new BOM(new byte[] { (byte) 0xFE, (byte) 0xFF }, "UTF-16 big-endian");
        public static final BOM UTF_32_LE = new BOM(new byte[] { (byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00 }, "UTF-32 little-endian");
        public static final BOM UTF_32_BE = new BOM(new byte[] { (byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF }, "UTF-32 big-endian");

        /** Raw BOM bytes; not handed out directly, see {@link #getBytes()}. */
        final byte bytes[];
        private final String description;

        private BOM(final byte bom[], final String description) {
            bytes = bom;
            this.description = description;
        }

        /** Returns the human-readable description of this BOM. */
        @Override
        public final String toString() {
            return description;
        }

        /**
         * Returns the bytes that make up this BOM.
         *
         * @return a fresh copy of the BOM bytes, so callers cannot modify the constant
         */
        public final byte[] getBytes() {
            return bytes.clone();
        }
    }

}
BOM(final byte bom[], final String description)
UnicodeInputStream(final InputStream inputStream)
UnicodeInputStream(final InputStream inputStream, final boolean skipBom)
int read(final byte b[], final int off, final int len)
synchronized void mark(final int readlimit)