I am trying to count the number of rows where any french character is present. Here is my code:
def has_french_characters(text):
# Check if text contains any French characters
french_characters = ['à', 'â', 'ç', 'é', 'è', 'ê', 'ë', 'î', 'ï', 'ô', 'û', 'ù', 'ü', 'ÿ', 'ñ', 'æ', 'œ',
'À', 'Â', 'Ç', 'É', 'È', 'Ê', 'Ë', 'Î', 'Ï', 'Ô', 'Û', 'Ù', 'Ü', 'Ÿ', 'Ñ', 'Æ', 'Œ']
return any(char in french_characters for char in text)
def count_rows_with_french_characters(filename):
total_rows = 0
rows_with_french_characters = 0
with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
for line in file:
total_rows += 1
if has_french_characters(line):
rows_with_french_characters += 1
return total_rows, rows_with_french_characters
# Replace 'your_dataset.csv' with the path to your dataset file
filename = 'your_dataset.csv'
total_rows, rows_with_french_characters = count_rows_with_french_characters(filename)
print(f'Total rows in the dataset: {total_rows}')
print(f'Rows with French characters: {rows_with_french_characters}')
This is not working as it is returning 0 rows with french characters which is incorrect as I can see multiple french characters in the csv.
The detected encoding of the CSV file is: ascii
>Solution :
utf-8
modifying the string to utf format. So you are not able to see french characters.
Try with latin1
Eg:
with open(filename, 'r', encoding='latin1', errors='ignore') as file: