The following code works as intended, looping over files in my folder and performing some read write actions.
The files are updated correctly too.
The issue is with the printout: it ends up printing the same set of results twice in the console. Why?
import os
import csv
root_directory = '.'
# Visit root_directory and everything below it; os.walk yields one
# (subdir, dirs, files) tuple per directory it visits.
for subdir, dirs, files in os.walk(root_directory):
    for file in files:
        if file.endswith('.csv'):
            initial = {}  # first row[2] value seen for each row[1] key
            repeated = set()  # row[1] keys seen again with a different row[2] value
            my_file = file
            sample = my_file[:2]  # first two characters of the file name label the sample
            output_file = '{}_result.txt'.format(sample.lower())
            with open(my_file, mode='r') as in_file, \
                    open(output_file, mode='w') as out_file:
                next(in_file)  # skip the first line (presumably a header row)
                reader = csv.reader(in_file, delimiter="\t")
                for i, line in enumerate(reader):
                    # The reader splits on tabs; the first field is then split on commas.
                    row = line[0].split(',')
                    if row[1] in initial:
                        shouldAdd = initial[row[1]] != row[2]
                        if shouldAdd:
                            repeated.add(row[1])
                            out_file.write('{}\n'.format(row[1])) # writing to file here no issues, no duplicates
                    else:
                        initial[row[1]] = row[2]
            #Issue is here. Why printing twice
            print('Total repeats for {} sample: {}'.format(sample, len(repeated)))
Prints out as follows:
Total repeats for AA sample: 123
Total repeats for BB sample: 45
Total repeats for AA sample: 123
Total repeats for BB sample: 45
I get the 4 lines above, but the output should have stopped after line 2.
>Solution :
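You’re ignoring the subdir variable. os.walk() also descends into subdirectories, and it only reports bare filenames in files. The same filenames exist in a subdirectory, but because you open the bare filename (which is resolved against the current directory), you end up processing the files in the root directory twice instead of the ones in the subdirectory. You need to use os.path.join(subdir, file) to get the full pathname of the file.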
# Process every .csv file under root_directory, including subdirectories.
# os.walk reports bare file names in `files`, so each name is joined with
# `subdir` before opening; otherwise a same-named file in the current
# directory would be opened instead.
for subdir, dirs, files in os.walk(root_directory):
    for file in files:
        if file.endswith('.csv'):
            fullname = os.path.join(subdir, file)
            initial = {}  # first row[2] value seen for each row[1] key
            repeated = set()  # row[1] keys seen again with a different row[2] value
            sample = file[:2]  # first two characters of the file name label the sample
            # Write the result next to the input so files from different
            # directories do not overwrite each other's output.
            output_file = os.path.join(subdir, '{}_result.txt'.format(sample.lower()))
            with open(fullname, mode='r') as in_file, \
                    open(output_file, mode='w') as out_file:
                # Skip the first line; the default keeps an empty file from
                # raising StopIteration.
                next(in_file, None)
                reader = csv.reader(in_file, delimiter="\t")
                for line in reader:
                    if not line:
                        # csv.reader yields [] for a blank line; nothing to parse.
                        continue
                    # The reader splits on tabs; the first field is then split on commas.
                    row = line[0].split(',')
                    if row[1] in initial:
                        if initial[row[1]] != row[2]:
                            repeated.add(row[1])
                            out_file.write('{}\n'.format(row[1]))
                    else:
                        initial[row[1]] = row[2]
            # Include subdir so results from same-named files can be told apart.
            print('Total repeats for {}/{} sample: {}'.format(subdir, sample, len(repeated)))