Extract required values from a text corresponding to given string matches

I have the following string.

*******************************************************************************
*                                                                             *
*                         int. normalized  values  of  :                      *
*                         ---------------------------                         *
*                      % of irradiance at ground level                        *
*     % of direct  irr.    % of diffuse irr.    % of enviro. irr              *
*               0.488               0.418               0.093                 *
*                       reflectance at satellite level                        *
*     atm. intrin. ref.   background  ref.  pixel  reflectance                *
*               0.127               0.146               0.170                 *
*                                                                             *
*                         int. absolute values of                             *
*                         -----------------------                             *
*                      irr. at ground level (w/m2/mic)                        *
*     direct solar irr.    atm. diffuse irr.    environment  irr              *
*             592.299             507.010             113.283                 *
*                      rad at satel. level (w/m2/sr/mic)                      *
*     atm. intrin. rad.    background  rad.    pixel  radiance                *
*              58.837              67.355              78.685                 *
*                                                                             *
*                                                                             *
*                      sol. spect (in w/m2/mic)                               *
*                                2054.457                                     *
*                                                                             *
*******************************************************************************

I tried to extract the values corresponding to "direct solar irr.", "atm. diffuse irr." and "environment irr".

import re

def extract_values(text):
    """Search *text* for the three ground-level irradiance readings.

    Returns a dict mapping each label to the captured number (as a string),
    or None when the pattern does not match.
    """
    # The header is matched with single literal spaces between its words, and
    # only whitespace is tolerated between the header and the three numbers.
    found = re.search(
        r"direct solar irr\.\s*atm. diffuse irr\.\s*environment irr\s*([\d\.]+)\s*([\d\.]+)\s*([\d\.]+)",
        text,
    )
    if not found:
        return None

    labels = ("direct solar irr.", "atm. diffuse irr.", "environment irr.")
    return dict(zip(labels, found.groups()))

But it produces None.

Could someone help me?

Edit: I also tried using BeautifulSoup:

def extract_values(text):
    """Return the three numbers on the first digit-bearing row after the
    "direct solar irr." header, converted to float.

    The text is passed through BeautifulSoup, split into non-empty stripped
    lines, and scanned for the header; the first later line containing a
    digit is split on whitespace and its first three tokens are converted.
    Returns None implicitly when no header or no digit-bearing row is found.
    """
    soup = BeautifulSoup(text, 'html.parser')

    # Get all text elements
    lines = [line.strip() for line in soup.get_text().splitlines() if line.strip() != ""]

    # Identify the line after the "direct solar irr." label
    for i, line in enumerate(lines):
        if "direct solar irr." in line:
            # Look for the next line with a number
            for subsequent_line in lines[i+1:]:
                if re.search(r'\d', subsequent_line):  # Check if the line has a digit
                    values = subsequent_line.split()
                    # NOTE: for a bordered report whose rows start with "*",
                    # values[0] is that border character rather than a number,
                    # so float(values[0]) raises ValueError.
                    return {
                        "direct solar irr.": float(values[0]),
                        "atm. diffuse irr.": float(values[1]),
                        "environment irr.": float(values[2])
                    }

direct_solar_irr = extract_values(text)
    "direct solar irr.": float(values[0]),
ValueError: could not convert string to float: '*'

>Solution :

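This is the regular-expression pattern I came up with. It works for this example, but I am not sure whether it will apply to your specific scenario. Your original pattern failed for two reasons: the header in the text reads "environment  irr" with two spaces, and the numbers sit on the next line behind "*" border characters, which `\s*` alone cannot skip. This is really a regex-matching issue, not a Python code issue.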

# Sample report passed to extract_values() below; the three values of interest
# are on the row directly under the "direct solar irr." header line.
text = """
*******************************************************************************
*                                                                             *
*                         int. normalized  values  of  :                      *
*                         ---------------------------                         *
*                      % of irradiance at ground level                        *
*     % of direct  irr.    % of diffuse irr.    % of enviro. irr              *
*               0.488               0.418               0.093                 *
*                       reflectance at satellite level                        *
*     atm. intrin. ref.   background  ref.  pixel  reflectance                *
*               0.127               0.146               0.170                 *
*                                                                             *
*                         int. absolute values of                             *
*                         -----------------------                             *
*                      irr. at ground level (w/m2/mic)                        *
*     direct solar irr.    atm. diffuse irr.    environment  irr              *
*             592.299             507.010             113.283                 *
*                      rad at satel. level (w/m2/sr/mic)                      *
*     atm. intrin. rad.    background  rad.    pixel  radiance                *
*              58.837              67.355              78.685                 *
*                                                                             *
*                                                                             *
*                      sol. spect (in w/m2/mic)                               *
*                                2054.457                                     *
*                                                                             *
*******************************************************************************
"""

import re

def extract_values(text: str) -> "dict[str, str] | None":
    """Extract the three absolute ground-level irradiance values from *text*.

    The function looks for the header line containing "direct solar irr.",
    "atm. diffuse irr." and "environment irr" (any amount of whitespace
    between the words) and captures the three whitespace-separated numbers
    that follow it.

    Args:
        text: The full report text.

    Returns:
        A dict mapping "direct solar irr.", "atm. diffuse irr." and
        "environment irr." to the captured numbers as strings, or None when
        the header or a complete row of three numbers cannot be found.
    """
    # A plain decimal number: "592.299", "113", "5." or ".5".
    number = r"(\d+(?:\.\d*)?|\.\d+)"
    pattern = (
        # Header labels; \s+ inside each label tolerates column realignment.
        r"direct\s+solar\s+irr\.\s*atm\.\s+diffuse\s+irr\.\s*environment\s+irr"
        # Rest of the header line, then any leading border/padding on the
        # value row.
        r".*\n.*?\s*"
        # The numbers must be separated by at least one whitespace character;
        # an optional separator would let the regex engine split a single
        # number into three bogus captures.
        + number + r"\s+" + number + r"\s+" + number
    )
    match = re.search(pattern, text)
    if match is None:
        return None

    direct, diffuse, environment = match.groups()
    return {
        "direct solar irr.": direct,
        "atm. diffuse irr.": diffuse,
        "environment irr.": environment,
    }


# Script entry point: run the extractor on the sample report and print the
# resulting mapping (or None when nothing matched).
if __name__ == '__main__':
    data = extract_values(text)
    print(data)

Leave a Reply