I’m working with the spaCy NLP library,
and I created a function that returns a list of tokens from a text.
import spacy
# Collect the unique tokens of `text` and return them as a list.
# Kept: currency tokens (lowercased), the single-character digit "0", and every
# other multi-character token that is not punctuation, whitespace, a quote, or a bracket.
def preprocess_text_spacy(text):
# Words that are meant to be excluded from the result.
stop_words = ["a", "the", "is", "are"]
# The language model is loaded on every call.
nlp = spacy.load('en_core_web_sm')
# A set is used so that each token appears only once.
tokens = set()
doc = nlp(text)
for word in doc:
if word.is_currency:
tokens.add(word.lower_)
# Single-character tokens are dropped unless they are a digit equal to 0.
elif len(word.lower_) == 1:
if word.is_digit and float(word.text) == 0:
tokens.add(word.text)
# NOTE: `and not in stop_words` has no left-hand operand, so this line is a
# SyntaxError; `not in` needs a value to test, e.g. `word.lower_ not in stop_words`.
elif not word.is_punct and not word.is_space and not word.is_quote and not word.is_bracket and not in stop_words:
tokens.add(word.lower_)
# Converting the set to a list yields the tokens in arbitrary order.
return list(tokens)
This function is not correct, because removing stop words does not work.
Everything is OK only if I delete the last condition, `and not in stop_words`.
How can I update this function so that it removes stop words according to a defined list, in addition to applying all the other conditions?
>Solution :
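Your code looks fine to me; it only needs a small change.
At the end of the last `elif`, write `and str(word) not in stop_words`: the `not in` operator needs a value on its left-hand side to test against the list. (If capitalized stop words such as "The" should be removed as well, compare the lowercased form instead: `word.lower_ not in stop_words`.)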
import spacy
def preprocess_text_spacy(text):
    """Return the unique, normalized tokens found in ``text``.

    The text is tokenized with spaCy's ``en_core_web_sm`` pipeline and each
    token is filtered as follows:

    * currency tokens are kept (lowercased);
    * single-character tokens are dropped, except a decimal digit equal
      to zero, which is kept verbatim;
    * any other token is kept (lowercased) unless it is punctuation,
      whitespace, a quote, a bracket, or one of the stop words.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of unique tokens. The order is arbitrary because the tokens
        are collected in a set.
    """
    # A set gives constant-time membership tests inside the loop.
    stop_words = {"a", "the", "is", "are"}
    # NOTE: the model is loaded on every call, which is slow; load it once
    # at module level if this function is called repeatedly.
    nlp = spacy.load('en_core_web_sm')
    tokens = set()
    for word in nlp(text):
        if word.is_currency:
            tokens.add(word.lower_)
        elif len(word.lower_) == 1:
            # str.isdecimal() is stricter than an isdigit-style check: it
            # rejects characters such as "²" that count as digits but cannot
            # be parsed as a number, which would otherwise raise ValueError.
            if word.text.isdecimal() and int(word.text) == 0:
                tokens.add(word.text)
        elif not (
            word.is_punct
            or word.is_space
            or word.is_quote
            or word.is_bracket
            # Compare the lowercased form so that "The" is filtered like
            # "the"; the stored tokens are lowercased as well.
            or word.lower_ in stop_words
        ):
            tokens.add(word.lower_)
    return list(tokens)