2

I have a file in the following format:

utf-8 encoded text block
separator 
binary data block

I use JavaScript's FileReader to read the file as a binary string using

FileReader.readAsBinaryString like so:

var reader = new FileReader();

// Called once the read has completed successfully.
reader.onload = function (evt) {
    // Here I use the separator position to divide the file content into
    // header and binary
    // ... (elided in the original snippet)
    console.log(header);
};

// The handler must be attached to the reader instance. Assigning it to
// FileReader.onerror only sets a property on the constructor, which is
// never invoked when this particular read fails.
reader.onerror = function (evt) {
    onFailure(evt.target.error.code);
};

reader.readAsBinaryString(blobFile);

The header is not parsed as UTF-8. I know that FileReader.readAsText takes the encoding of the file into account while FileReader.readAsBinaryString reads the file byte by byte.

How do I convert the header to UTF-8? Reading the file twice (once as a binary string to get the binary data, and again as text to get the first block decoded as UTF-8) doesn't appeal to me.

2 Answers 2

2

I found the answer on http://snipplr.com/view/31206/. I have tested it on French characters, and it converts them to UTF-8 without any issues.

/**
 * Decodes UTF-8 data held in a "binary string" (one character per byte, as
 * produced by FileReader.readAsBinaryString) into a regular JavaScript string.
 *
 * A leading byte order mark (EF BB BF) is skipped. Bytes that cannot start a
 * UTF-8 sequence (0x80-0xC1 and 0xF5 and above) are silently dropped.
 * Continuation bytes are not validated. If the input ends in the middle of a
 * multi-byte sequence, a single U+FFFD replacement character is appended
 * instead of throwing.
 *
 * @param {string} bytes - Binary string whose characters are byte values.
 * @returns {string} The decoded text.
 */
function readUTF8String(bytes) {
  var ix = 0;

  // Skip the UTF-8 byte order mark if present.
  if (bytes.slice(0, 3) === "\xEF\xBB\xBF") {
    ix = 3;
  }

  var string = "";
  while (ix < bytes.length) {
    var byte1 = bytes[ix].charCodeAt(0);
    var extra; // number of continuation bytes that follow the lead byte
    var codepoint; // payload bits accumulated so far

    if (byte1 < 0x80) {
      extra = 0;
      codepoint = byte1;
    } else if (byte1 >= 0xC2 && byte1 < 0xE0) {
      extra = 1;
      codepoint = byte1 & 0x1F;
    } else if (byte1 >= 0xE0 && byte1 < 0xF0) {
      extra = 2;
      // A 3-byte lead carries 4 payload bits, so the mask is 0x0F.
      codepoint = byte1 & 0x0F;
    } else if (byte1 >= 0xF0 && byte1 < 0xF5) {
      extra = 3;
      codepoint = byte1 & 0x07;
    } else {
      // Not a valid lead byte: drop it and move on.
      ix += 1;
      continue;
    }

    // The sequence runs past the end of the input: emit a replacement
    // character rather than reading undefined bytes.
    if (ix + extra >= bytes.length) {
      string += "\uFFFD";
      break;
    }

    // Each continuation byte contributes its low 6 bits.
    for (var k = 1; k <= extra; k++) {
      codepoint = (codepoint << 6) | (bytes[ix + k].charCodeAt(0) & 0x3F);
    }
    ix += extra + 1;

    if (extra === 3) {
      // Code points above U+FFFF are stored as a UTF-16 surrogate pair.
      codepoint -= 0x10000;
      string += String.fromCharCode(
        (codepoint >> 10) + 0xD800, (codepoint & 0x3FF) + 0xDC00
      );
    } else {
      string += String.fromCharCode(codepoint);
    }
  }

  return string;
}
Sign up to request clarification or add additional context in comments.

Comments

0

The result is a string, so you can iterate over it and convert every byte to its ASCII representation using String.fromCharCode, something like this:

  // Walks blob from the start, appending to header until either the end of
  // blob or the separator is reached.
  // NOTE(review): the separator value is left as a placeholder comment, so
  // this snippet is not runnable as written.
  var cursor=0
  var header=""; 
   while(cursor!=blob.length && blob[cursor]!=/*separator code*/){
    // NOTE(review): String.fromCharCode expects numeric code units. If blob is
    // the binary string from the reader, blob[cursor] is already a
    // one-character string; confirm what type blob is meant to be.
    header+=String.fromCharCode(blob[cursor]);
    cursor+=1;
   }

   or

   // Alternative: find the separator once, then convert everything before it.
   // NOTE(review): the separator argument is left as a placeholder comment.
   var pos=blob.indexOf(/*separator*/);
   // NOTE(review): split(' ') only yields character codes if the text before
   // the separator is a space-separated list of numbers; confirm the format.
   var header=String.fromCharCode.apply(this,blob.substr(0,pos).split(' '))

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.