javascript convert to utf8 the result of readAsBinaryString

Question

I have a file in the following format:

utf-8 encoded text block
separator 
binary data block

I use JavaScript's FileReader to read the file as a binary string using

FileReader.readAsBinaryString like so:

var reader = new FileReader();

reader.onload  = function(evt){
    // Here I use the separator position to divide the file content into
    // header and binary
    ...
    console.log(header);

};
FileReader.onerror = function (evt) {
    onFailure(evt.target.error.code);
}

reader.readAsBinaryString(blobFile);

The header is not parsed as UTF-8. I know that FileReader.readAsText takes the encoding of the file into account while FileReader.readAsBinaryString reads the file byte by byte.

How do I convert the header to UTF-8? Reading the file twice (once as a binary string to get the binary data, and again as text to get the first block decoded as UTF-8) doesn't appeal to me.

thedethfox · Accepted Answer · 2014-06-18 10:12:06Z

I found the answer on http://snipplr.com/view/31206/. I have tested it on French characters and it converts them to UTF-8 without any issues.

/**
 * Decodes UTF-8 data held in a "binary string" (one character per byte, each
 * with a char code in 0..255) into a regular JavaScript string.
 *
 * A leading UTF-8 byte order mark (EF BB BF) is skipped. Bytes that cannot
 * start a sequence (0x80-0xC1, 0xF5 and above) are silently skipped, and a
 * multi-byte sequence cut off by the end of the input is dropped instead of
 * throwing. Continuation bytes are not validated.
 *
 * @param {string} bytes - Binary string containing UTF-8 encoded text.
 * @returns {string} The decoded text.
 */
function readUTF8String(bytes) {
  var ix = 0;

  // Skip the UTF-8 byte order mark if the data starts with one.
  if (bytes.slice(0, 3) === "\xEF\xBB\xBF") {
    ix = 3;
  }

  var string = "";
  var byte1, byte2, byte3, byte4, codepoint, extra;
  for (; ix < bytes.length; ix++) {
    byte1 = bytes[ix].charCodeAt(0);

    if (byte1 < 0x80) {
      // Single-byte (ASCII) character.
      string += String.fromCharCode(byte1);
      continue;
    }

    // Number of continuation bytes announced by the lead byte.
    if (byte1 >= 0xC2 && byte1 < 0xE0) {
      extra = 1;
    } else if (byte1 >= 0xE0 && byte1 < 0xF0) {
      extra = 2;
    } else if (byte1 >= 0xF0 && byte1 < 0xF5) {
      extra = 3;
    } else {
      // Not a valid lead byte: skip it.
      continue;
    }

    // The sequence runs past the end of the input. Everything that is left
    // belongs to this incomplete sequence, so stop instead of indexing out
    // of bounds (which would throw a TypeError).
    if (ix + extra >= bytes.length) {
      break;
    }

    byte2 = bytes[++ix].charCodeAt(0);
    if (extra === 1) {
      string += String.fromCharCode(((byte1 & 0x1F) << 6) + (byte2 & 0x3F));
      continue;
    }

    byte3 = bytes[++ix].charCodeAt(0);
    if (extra === 2) {
      // Only the low 4 bits of a 3-byte lead byte carry payload.
      string += String.fromCharCode(
        ((byte1 & 0x0F) << 12) + ((byte2 & 0x3F) << 6) + (byte3 & 0x3F)
      );
      continue;
    }

    byte4 = bytes[++ix].charCodeAt(0);
    codepoint = ((byte1 & 0x07) << 18) + ((byte2 & 0x3F) << 12) +
      ((byte3 & 0x3F) << 6) + (byte4 & 0x3F);
    // Code points above U+FFFF are emitted as a UTF-16 surrogate pair.
    codepoint -= 0x10000;
    string += String.fromCharCode(
      (codepoint >> 10) + 0xD800, (codepoint & 0x3FF) + 0xDC00
    );
  }

  return string;
}

Dayan Moreno Leon · Accepted Answer · 2014-06-18 10:06:17Z

0

The result is a string, so you can iterate over it and convert every byte to its ASCII representation using String.fromCharCode, something like:

  // Accumulate the header one element at a time, stopping at the separator
  // or at the end of the input. The separator value is a placeholder
  // (/*separator code*/) that must be filled in before this can run.
  var cursor=0
  var header=""; 
   while(cursor!=blob.length && blob[cursor]!=/*separator code*/){
    // NOTE(review): String.fromCharCode expects a numeric code unit. If blob
    // is a string, blob[cursor] is already a one-character string — confirm
    // what blob actually holds.
    header+=String.fromCharCode(blob[cursor]);
    cursor+=1;
   }

   or

   // Alternative: find the separator position (placeholder argument — must be
   // filled in) and convert everything before it in a single call.
   var pos=blob.indexOf(/*separator*/);
   // NOTE(review): split(' ') presumably assumes the header is a
   // space-separated list of character codes — verify against the real data.
   var header=String.fromCharCode.apply(this,blob.substr(0,pos).split(' '))

answered Jun 18, 2014 at 10:06

Dayan Moreno Leon

5,4552 gold badges25 silver badges24 bronze badges

Collectives™ on Stack Overflow

javascript convert to utf8 the result of readAsBinaryString

2 Answers 2

Comments

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Related