How to ensure the correct encoding is used when FileReader reads a file with readAsArrayBuffer

Question

How to ensure the correct encoding is used when FileReader reads a file with readAsArrayBuffer

Currently, I am developing a script in JavaScript to read uploaded .csv/.xlsx files and convert the data into an array containing each row. Using FileReader along with SheetJs, I have successfully managed to achieve this by implementing the following code:

//Code for the new excel reader
/**
 * Reads the first uploaded spreadsheet with FileReader + SheetJS and stores
 * its rows on $scope.fileContent as an array of row arrays.
 *
 * The rows of the sheet named "Sheet1" are preferred; otherwise the rows of
 * the sheet named "contacts" are used. If the workbook has neither sheet,
 * $scope.fileContent is set to an empty array instead of throwing inside the
 * load handler.
 *
 * @param {FileList|Array} files - uploaded files; only files[0] is read.
 */
$scope.do_file =  function(files)
{
    $scope.fileContent  = [];
    const file = files[0];
    const reader = new FileReader();
    reader.onload = function(e)
    {
        const workbook = XLSX.read(e.target.result, {type: 'array'});

        // Collect the rows of every non-empty sheet, keyed by sheet name.
        // header:1 makes sheet_to_json return each row as a plain array.
        const result = {};
        workbook.SheetNames.forEach(function(sheetName) {
            const roa = XLSX.utils.sheet_to_json(workbook.Sheets[sheetName], {header:1});
            if(roa.length) result[sheetName] = roa;
        });

        if(result["Sheet1"])
        {
            $scope.fileContent = result["Sheet1"];
            return;
        }

        // Fall back to "contacts"; `|| []` covers workbooks that have
        // neither sheet (previously a TypeError on undefined.filter).
        $scope.fileContent = (result["contacts"] || []).filter(function(el) {
            return typeof el !== "object" || Array.isArray(el) || Object.keys(el).length > 0;
        });
    };
    reader.readAsArrayBuffer(file);
};

The above code works well for most files, but encounters difficulty when processing a file with Hebrew text encoded in Windows-1255, resulting in corrupted data.

https://i.sstatic.net/5zTu9.png

To explore alternative solutions, I attempted to read the file as text using reader.readAsText and adjust the encoding accordingly. Here is the revised code snippet:

/**
 * Reports whether the given text contains at least one character in the
 * range U+0590 to U+05FF.
 */
function is_Hebrew(data)
{
    var hebrewRange = /[\u0590-\u05FF]/;
    return data.search(hebrewRange) !== -1;
}

/**
 * Reads the first uploaded file as text, retrying once with the
 * 'ISO-8859-8' label when the first decode contains no Hebrew characters,
 * then splits the text into lines and comma-separated cells.
 *
 * The retry happens at most once. Previously every load without Hebrew
 * triggered another read, so a file containing no Hebrew at all was re-read
 * endlessly and the loadend handler never ran.
 *
 * Cells are split on every comma, so quoted cells that contain commas are
 * broken apart; this naive splitting is kept as-is.
 *
 * @param {FileList|Array} files - uploaded files; only files[0] is read.
 */
$scope.do_file =  function(files)
{
    const fullResult = [];
    const file = files[0];
    const reader = new FileReader();
    // Text of the first (default-encoding) read, kept once a retry has been
    // issued; null means no retry has happened yet.
    let firstPassText = null;

    reader.onload = function(e){
        const data = String(e.target.result);
        if(firstPassText === null && !is_Hebrew(data))
        {
          firstPassText = data;
          reader.readAsText(file,'ISO-8859-8');
        }
    };
    reader.onloadend = function(){
        // After a failed or aborted read there is no text to split.
        if(typeof reader.result !== 'string')
        {
            return;
        }
        // If the retry found no Hebrew either, prefer the first decode.
        const retryWasUseless = firstPassText !== null && !is_Hebrew(reader.result);
        const text = retryWasUseless ? firstPassText : reader.result;

        const lines = text.split('\r\n');
        console.log(lines);
        lines.forEach(function(element) {
            fullResult.push(element.split(','));
        });

         console.log(reader);
    };
    reader.readAsText(file);
};

However, the modified code fails to parse the file correctly, because it splits every line on commas without honoring quoted cells. When a cell contains a string with comma-separated values (e.g. "25,28,29"), each value is treated as a separate cell and the resulting array is wrong.

Therefore, I have opted to continue using the initial method, but encounter difficulties in changing the encoding. Is there a way to modify the encoding in the original code that utilizes readAsArrayBuffer to extract the file data?

javascript angularjs csv encoding sheetjs

Answer 1

Answer №1

After exploring several potential solutions, I found that the most effective approach was to combine the two methods above: the first method reads .xlsx files, while the second reads .csv files. In addition, the second method uses the JavaScript library PapaParse to parse the CSV text correctly at the cell level (for example, quoted cells that contain commas).

// Reports whether the given text contains at least one character in the
// range U+0590 to U+05FF.
$scope.is_Hebrew = function($data){
    var hebrewRange = /[\u0590-\u05FF]/;
    return $data.search(hebrewRange) !== -1;
};

// Implementation for the new excel reader
/**
 * Reads the first uploaded file and stores its rows on $scope.fileContent.
 *
 * - ".xlsx" files are read as an ArrayBuffer and parsed with SheetJS; the
 *   rows of the sheet named "Sheet1" are used, otherwise those of "contacts",
 *   otherwise an empty array.
 * - ".csv" files are read as text and parsed with PapaParse. If the first
 *   decode contains no Hebrew characters the file is re-read once with the
 *   'ISO-8859-8' label; if that decode contains no Hebrew either, the first
 *   decode is used.
 * - Any other extension triggers an alert.
 *
 * The extension check is case-insensitive. The handlers run when the read
 * completes, so $scope.fileContent is filled in after this function returns.
 * NOTE(review): nothing visible here triggers an AngularJS digest after the
 * handlers assign $scope.fileContent - confirm the view gets refreshed.
 *
 * @param {FileList|Array} files - uploaded files; only files[0] is read.
 */
$scope.do_file =  function(files)
{
    // Options handed to Papa.parse for the CSV branch.
    // NOTE(review): these look like PapaParse's documented defaults - confirm
    // against the library version in use.
    const config = {
    delimiter: "",
    newline: "",
    quoteChar: '"',
    escapeChar: '"',
    header: false,
    trimHeader: false,
    dynamicTyping: false,
    preview: 0,
    encoding: "",
    worker: false,
    comments: false,
    step: undefined,
    complete: undefined,
    error: undefined,
    download: false,
    skipEmptyLines: false,
    chunk: undefined,
    fastMode: undefined,
    beforeFirstChunk: undefined,
    withCredentials: undefined
    };

    // Keeps non-objects, arrays, and objects that have at least one own key.
    const isUsableRow = function(el) {
        return typeof el !== "object" || Array.isArray(el) || Object.keys(el).length > 0;
    };

    $scope.fileContent  = [];
    const f = files[0];
    if(!f)
    {
        // No file selected: nothing to read (previously a TypeError on f.name).
        return;
    }

    // Lower-cased so that e.g. "DATA.CSV" is accepted as well.
    const fileExtension = f.name.replace(/^.*\./, '').toLowerCase();
    if(fileExtension === 'xlsx')
    {
        const reader = new FileReader();
        reader.onload = function(e)
        {
            const workbook = XLSX.read(e.target.result, {type: 'array'});

            // Rows of every non-empty sheet, keyed by sheet name; header:1
            // makes sheet_to_json return each row as a plain array.
            const result = {};
            workbook.SheetNames.forEach(function(sheetName) {
               const roa = XLSX.utils.sheet_to_json(workbook.Sheets[sheetName], {header:1});
               if(roa.length) result[sheetName] = roa;
            });

            if(result["Sheet1"])
            {
               $scope.fileContent = result["Sheet1"];
               return;
            }
            // `|| []` covers workbooks with neither sheet (previously a
            // TypeError on undefined.filter).
            $scope.fileContent = (result["contacts"] || []).filter(isUsableRow);
        };
        reader.readAsArrayBuffer(f);
    }
    else if(fileExtension === 'csv')
    {
        const reader = new FileReader();
        // Text of the first (default-encoding) read, kept once the retry has
        // been issued; null means no retry has happened yet.
        let firstPassText = null;

        reader.onload = function(e)
        {
            const data = String(e.target.result);
            // Retry at most once: re-reading on every Hebrew-less load made
            // files without any Hebrew loop forever and never reach loadend.
            if(firstPassText === null && !$scope.is_Hebrew(data))
            {
               firstPassText = data;
               reader.readAsText(f,'ISO-8859-8');
            }
        };
        reader.onloadend = function(){
            // After a failed or aborted read there is no text to parse.
            if(typeof reader.result !== 'string')
            {
                return;
            }
            // If the retry found no Hebrew either, prefer the first decode.
            const retryWasUseless = firstPassText !== null && !$scope.is_Hebrew(reader.result);
            const text = retryWasUseless ? firstPassText : reader.result;

            // Pass the options object itself; wrapping it in an array made
            // PapaParse ignore every option.
            const parsed = Papa.parse(text, config);
            $scope.fileContent = parsed["data"].filter(isUsableRow);
        };
        reader.readAsText(f);
    }
    else
    {
       alert("File Not supported!");
    }

    // NOTE(review): kept from the original. This runs before the read
    // handlers above, so it appends to the array assigned at the top of this
    // function, which the handlers later replace - confirm whether a trailing
    // empty row is wanted on the parsed data.
    $scope.fileContent.push([]);
};

Answer 2

After exploring several potential solutions, I found that the most effective approach was to combine the two methods above: the first method reads .xlsx files, while the second reads .csv files. In addition, the second method uses the JavaScript library PapaParse to parse the CSV text correctly at the cell level (for example, quoted cells that contain commas).

// Reports whether the given text contains at least one character in the
// range U+0590 to U+05FF.
$scope.is_Hebrew = function($data){
    var hebrewRange = /[\u0590-\u05FF]/;
    return $data.search(hebrewRange) !== -1;
};

// Implementation for the new excel reader
/**
 * Reads the first uploaded file and stores its rows on $scope.fileContent.
 *
 * - ".xlsx" files are read as an ArrayBuffer and parsed with SheetJS; the
 *   rows of the sheet named "Sheet1" are used, otherwise those of "contacts",
 *   otherwise an empty array.
 * - ".csv" files are read as text and parsed with PapaParse. If the first
 *   decode contains no Hebrew characters the file is re-read once with the
 *   'ISO-8859-8' label; if that decode contains no Hebrew either, the first
 *   decode is used.
 * - Any other extension triggers an alert.
 *
 * The extension check is case-insensitive. The handlers run when the read
 * completes, so $scope.fileContent is filled in after this function returns.
 * NOTE(review): nothing visible here triggers an AngularJS digest after the
 * handlers assign $scope.fileContent - confirm the view gets refreshed.
 *
 * @param {FileList|Array} files - uploaded files; only files[0] is read.
 */
$scope.do_file =  function(files)
{
    // Options handed to Papa.parse for the CSV branch.
    // NOTE(review): these look like PapaParse's documented defaults - confirm
    // against the library version in use.
    const config = {
    delimiter: "",
    newline: "",
    quoteChar: '"',
    escapeChar: '"',
    header: false,
    trimHeader: false,
    dynamicTyping: false,
    preview: 0,
    encoding: "",
    worker: false,
    comments: false,
    step: undefined,
    complete: undefined,
    error: undefined,
    download: false,
    skipEmptyLines: false,
    chunk: undefined,
    fastMode: undefined,
    beforeFirstChunk: undefined,
    withCredentials: undefined
    };

    // Keeps non-objects, arrays, and objects that have at least one own key.
    const isUsableRow = function(el) {
        return typeof el !== "object" || Array.isArray(el) || Object.keys(el).length > 0;
    };

    $scope.fileContent  = [];
    const f = files[0];
    if(!f)
    {
        // No file selected: nothing to read (previously a TypeError on f.name).
        return;
    }

    // Lower-cased so that e.g. "DATA.CSV" is accepted as well.
    const fileExtension = f.name.replace(/^.*\./, '').toLowerCase();
    if(fileExtension === 'xlsx')
    {
        const reader = new FileReader();
        reader.onload = function(e)
        {
            const workbook = XLSX.read(e.target.result, {type: 'array'});

            // Rows of every non-empty sheet, keyed by sheet name; header:1
            // makes sheet_to_json return each row as a plain array.
            const result = {};
            workbook.SheetNames.forEach(function(sheetName) {
               const roa = XLSX.utils.sheet_to_json(workbook.Sheets[sheetName], {header:1});
               if(roa.length) result[sheetName] = roa;
            });

            if(result["Sheet1"])
            {
               $scope.fileContent = result["Sheet1"];
               return;
            }
            // `|| []` covers workbooks with neither sheet (previously a
            // TypeError on undefined.filter).
            $scope.fileContent = (result["contacts"] || []).filter(isUsableRow);
        };
        reader.readAsArrayBuffer(f);
    }
    else if(fileExtension === 'csv')
    {
        const reader = new FileReader();
        // Text of the first (default-encoding) read, kept once the retry has
        // been issued; null means no retry has happened yet.
        let firstPassText = null;

        reader.onload = function(e)
        {
            const data = String(e.target.result);
            // Retry at most once: re-reading on every Hebrew-less load made
            // files without any Hebrew loop forever and never reach loadend.
            if(firstPassText === null && !$scope.is_Hebrew(data))
            {
               firstPassText = data;
               reader.readAsText(f,'ISO-8859-8');
            }
        };
        reader.onloadend = function(){
            // After a failed or aborted read there is no text to parse.
            if(typeof reader.result !== 'string')
            {
                return;
            }
            // If the retry found no Hebrew either, prefer the first decode.
            const retryWasUseless = firstPassText !== null && !$scope.is_Hebrew(reader.result);
            const text = retryWasUseless ? firstPassText : reader.result;

            // Pass the options object itself; wrapping it in an array made
            // PapaParse ignore every option.
            const parsed = Papa.parse(text, config);
            $scope.fileContent = parsed["data"].filter(isUsableRow);
        };
        reader.readAsText(f);
    }
    else
    {
       alert("File Not supported!");
    }

    // NOTE(review): kept from the original. This runs before the read
    // handlers above, so it appends to the array assigned at the top of this
    // function, which the handlers later replace - confirm whether a trailing
    // empty row is wanted on the parsed data.
    $scope.fileContent.push([]);
};

How to ensure the correct encoding is used when FileReader reads a file with readAsArrayBuffer

Answer №1

Similar questions

Error: JSON parsing failed due to an unexpected token "u" at the beginning of the JSON string. This occurred in an anonymous function when

extract various information from a csv file

The problem of "undefined function appendTo" is causing issues

Using .htaccess file to optimize SEO crawling for single page applications that do not use hashbangs

Issue with Braintree drop-in form: Nonce string generation problem

What is the most efficient method for uploading and parsing extensive CSV files in node.js and express?

Interactive sidebar component with navigation and animated section markers

Utilize Vue.js to easily upload images alongside form input fields

Disabling a tooltip using the tooltip-is-open attribute is ineffective

View a specific selected newsAPI article on its own dedicated page

What is the best way to make img-fluid function properly within Bootstrap Carousel?

Troubleshooting: Images not displaying on webpage due to Ajax, JQuery, and JavaScript integration

When setting the Content-Type of an S3 object to 'image/jpeg' in NodeJS, it may appear as 'application/octet' in the S3 console

Retrieving the response value in AngularJS with $resource

Having trouble getting HTML to render properly in React JS on localhost using Apache server

Toggle the visibility of images with input radio buttons

show tab focus outline only

Failure of app script to retrieve data from an external spreadsheet

Tips for transferring HTML code to a controller

How to avoid the need to wrap all setState calls with #act in React 18?