Mock a function call to an external module

May 31, 2022

I have a class that performs some validations on a pandas DataFrame that I read from a file.
The class looks something like this (simplified, so some parts might not make sense):

import pandas as pd

class PandasValidator:
    """Validate a tab-separated, header-less file loaded with pandas.

    The file is read lazily: nothing is loaded until a validation method is
    called, and every call reads the file again through ``pd.read_csv``.
    """

    # Keyword arguments forwarded to pd.read_csv on every read.
    read_kwargs = {'sep': '\t', 'header': None}

    def __init__(self, path_to_data: str, max_rows: int) -> None:
        """Store the file location and the row limit used by the checks.

        Args:
            path_to_data: Path of the file to read.
            max_rows: Upper bound compared against the scaled row count.
        """
        self.path = path_to_data
        self.max_rows = max_rows

    def validate_num_rows(self, threshold: float = 0.1) -> bool:
        """Return whether ``rows * threshold`` does not exceed ``max_rows``.

        Args:
            threshold: Factor applied to the number of rows before comparing.

        Returns:
            True when the scaled row count is at most ``self.max_rows``.
        """
        # Looked up on the module at call time, so replacing pandas.read_csv
        # in a test takes effect here. read_kwargs is a dict and must be
        # unpacked with ** to be passed as keyword arguments.
        df_shape = pd.read_csv(self.path, **self.read_kwargs).shape
        return df_shape[0] * threshold <= self.max_rows

I want to test the method validate_num_rows, so I would like to patch the first line of the function: I don’t want to read an actual DataFrame when testing it. My test would look something like this (this is not working code, just my best attempt).

# Draft test from the question; it is not expected to run as written.
# NOTE(review): `patch` is presumably unittest.mock.patch (no import is shown).
# Its target must be an importable dotted path; no module named 'df' appears in
# this snippet. The call being replaced is pd.read_csv, i.e. 'pandas.read_csv'.
@patch('df.read_csv') #not sure what goes in here
def test_validate_num_rows(mock) -> None:
    # NOTE(review): `mock` stands in for read_csv itself. validate_num_rows reads
    # .shape from the object that read_csv *returns*, so the attribute would
    # have to be set on mock.return_value for the method to see it.
    mock.shape=(30,30)
    result = PandasValidator('dummy-path',30).validate_num_rows(0.1)
    assert result == True

To be honest, I have no idea what to patch and mock, or how to do it. I want to mock the first line of the validate_num_rows method. I know refactoring the code would make testing easier, but that’s not an option I have.

>Solution :

Your class would be easier to test if it accepted a dataframe instead of reading it itself.

import pandas as pd

class PandasValidator:
    """Run validations against an already-loaded DataFrame.

    The DataFrame is injected rather than read here, so tests can pass one
    built in memory.
    """

    def __init__(self, df: pd.DataFrame, max_rows: int) -> None:
        """Keep the DataFrame to validate and the row limit to check against.

        Args:
            df: The DataFrame under validation.
            max_rows: Upper bound compared against the scaled row count.
        """
        self._df = df
        self._max_rows = max_rows

    def validate_num_rows(self, threshold: float = 0.1) -> bool:
        """Return whether ``rows * threshold`` does not exceed ``max_rows``.

        Args:
            threshold: Factor applied to the number of rows before comparing.

        Returns:
            True when the scaled row count is at most the configured limit.
        """
        # shape is a (rows, columns) tuple; only the row count takes part in
        # the comparison. Multiplying the tuple itself by a float raises
        # TypeError.
        num_rows = self._df.shape[0]
        return num_rows * threshold <= self._max_rows

Now in your test, you just need to construct a dataframe in memory and pass it to PandasValidator.

Then you can make another function that reads a dataframe from a file:

import pandas as pd
from pathlib import Path

def read_csv(path: Path) -> pd.DataFrame:
    """Load the file at *path* as a tab-separated table without a header row.

    Args:
        path: Location of the file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    read_options = {'sep': '\t', 'header': None}
    return pd.read_csv(path, **read_options)

If you want to test this function, you can use the monkeypatch fixture from pytest:

from pathlib import Path

import pandas

import your_module


def test_read_csv(monkeypatch):
    """Check that your_module.read_csv delegates to pandas.read_csv.

    pandas.read_csv is swapped for a fake that verifies the arguments it
    receives and hands back a known DataFrame, so no file is read.
    """
    # Any small in-memory frame works; only its round trip is checked.
    expected_dataframe = pandas.DataFrame([[1, 2], [3, 4]])

    def fake_read_csv(path, **kwargs):
        # The wrapper must forward the path untouched and add exactly the
        # tab-separated / no-header options.
        assert path == Path('/foo/bar')
        assert kwargs == {'sep': '\t', 'header': None}
        return expected_dataframe

    monkeypatch.setattr(pandas, "read_csv", fake_read_csv)

    actual_dataframe = your_module.read_csv(Path('/foo/bar'))
    # `==` on DataFrames is element-wise and its result has no single truth
    # value, so a bare `assert a == b` raises ValueError. Use the dedicated
    # comparison helper instead.
    pandas.testing.assert_frame_equal(actual_dataframe, expected_dataframe)