Starting from @Daniel Morelli's function, I have written another function that offers a few more options.
I am sharing it here in case it is useful to someone:
// CreateCleanWords splits s into lower-cased words and returns them in order
// of appearance. It returns nil when no word qualifies.
//
// Rules:
//   - everything between '<' and '>' (an HTML tag) is discarded; a tag is not a
//     word separator, so "foo<br>bar" yields the single word "foobar"
//   - words are separated by Unicode white space or a single quote (')
//   - only letters, digits and the symbols '€', '$', '£' and '-' are kept; any
//     other character is dropped without splitting the word
//   - words shorter than 4 characters (runes, not bytes) are discarded
//   - blacklisted words (Italian articulated prepositions) are discarded
func CreateCleanWords(s string) []string {
	const (
		tagStart          = '<'
		tagEnd            = '>'
		singleQuote       = '\''
		minAcceptedLength = 4
	)

	var (
		builder    strings.Builder
		finalWords []string
		insideTag  bool // true while we are between '<' and '>'
	)

	// flush closes the word accumulated so far and appends it to finalWords
	// when it passes the length and blacklist filters. It is used both at
	// every separator and once after the loop, so the last word is treated
	// exactly like all the others.
	flush := func() {
		word := strings.ToLower(builder.String())
		builder.Reset()
		// Count runes rather than bytes: accented letters take two bytes in
		// UTF-8 and would otherwise make short words look long enough.
		if utf8.RuneCountInString(word) < minAcceptedLength {
			return
		}
		if isBlacklistedWord(word) {
			return
		}
		finalWords = append(finalWords, word)
	}

	for _, c := range s {
		switch {
		case c == tagStart:
			insideTag = true
		case c == tagEnd:
			insideTag = false
		case insideTag:
			// Tag content is ignored until the closing '>'.
		case unicode.IsSpace(c) || c == singleQuote:
			flush()
		case unicode.IsLetter(c) || unicode.IsDigit(c) || isExtraWordRune(c):
			builder.WriteRune(c)
		}
	}

	// The input may end without a trailing separator.
	flush()

	return finalWords
}

// isBlacklistedWord reports whether word, which must already be lower-cased,
// is one of the Italian articulated prepositions excluded from the result.
func isBlacklistedWord(word string) bool {
	switch word {
	case "sull", "sullo", "sulla", "sugli", "sulle",
		"alla", "all", "allo", "agli", "alle",
		"dell", "della", "dello", "degli", "delle",
		"dall", "dalla", "dallo", "dalle", "dagli":
		return true
	}
	return false
}

// isExtraWordRune reports whether r is a non-alphanumeric symbol that is
// nevertheless allowed inside a word. Accented letters such as 'à' or 'è' do
// not need to be listed here because unicode.IsLetter already accepts them.
func isExtraWordRune(r rune) bool {
	switch r {
	case '€', '$', '£', '-':
		return true
	}
	return false
}