Previously, I attempted to remove hashtags and comments from Twitter. The string I worked with looked like this:
@lien_ayy92 📡💯% Real 📡Avail▶#Jakarta #Bekasi 📡Excl/Incl 📡Expo▶6-7 Juli #Cirebon 📡Wajib DP💸 📡Cek BIO🔜 https://local.com/
My goal was to eliminate the icons and irrelevant text.
This is the code snippet from my previous attempt.
let clean = function (data) {
data = data.replace(/(?:https?|ftp):\/\/[\n\S]+|\B[@#]\w+\b|\b\w+[@#]\B|\B[^\w\s]{2,}\B|\b[a-zA-Z]{1,3}\b|[0-9]+|[$&+,:;=?@#|'<>.^*()%!-/]|\ud83d[\ude00-\ude4f]/g, '');
return data;
}
let stopwords = function (docs) {
docs = clean(docs);
docs = docs.trim();
docs = docs.toLowerCase();
docs = docs.split(' ');
let wordsstop = ['about'];
let docs1 = new Array;
var x = 0;
for(let i = 0; i < docs.length; i++){
if(wordsstop.indexOf(docs[i]) !== -1 || docs[i] == ""){
}else{
docs1[x] = docs[i]
x++;
}
}
return docs1;
}
console.log(stopwords('📡@lien_ayy92 📡💯% Real 📡Avail▶#Jakarta #Bekasi 📡Excl/Incl 📡Expo▶6-7 Juli #Cirebon 📡Wajib DP💸 📡Cek BIO🔜 https://local.com about data'));
I aim to achieve a result similar to this:
["real","juli","data"];