# Copyright (C) 2023 Intel Corporation
# SPDX-License-Identifier: MIT
import logging
import os
from abc import abstractmethod, ABC

import pandas as pd
import re
import sys
from dataclasses import dataclass
from datetime import datetime
from enum import Enum, auto
from functools import reduce
from pathlib import Path
from typing import List, Dict, Generator, Any, Optional, TextIO, Pattern, Union, Type

import numpy as np
import pytz

from mpp.core.event_info import EventInfoGenerator
from mpp.core.types import RawDataFrameColumns as rdc, RawDataFrame, EventInfoDataFrame

@dataclass
class EventPatterns:
    """
    Compiled regular expressions used to derive an event's device or scope from the event name.

    Each pattern exposes a single capture group.
    """
    # Captures the text between 'UNC_' and the next underscore
    UNCORE_UNIT_RE: Pattern = re.compile(r'UNC_(.*?)_')
    # Captures the text between 'FREERUN_' and the next underscore
    FREERUN_RE: Pattern = re.compile(r'FREERUN_(.*?)_')
    # Captures everything that follows 'scope=' in a name containing 'FREERUN:'
    FREERUN_SCOPED_RE: Pattern = re.compile(r'FREERUN:.*scope=(.*)')
    # Captures everything that follows 'scope=' in a name containing 'STATIC:'
    STATIC_SCOPED_RE: Pattern = re.compile(r'STATIC:.*scope=(.*)')

# We assume all event groups, and thus all events, are represented within the first
# n (MAX_REQUIRED_EVENT_INFO_BLOCKS) blocks of the input data file.
# DataParser uses this value as the chunk size when it reads the first chunk to build its event info table.
MAX_REQUIRED_EVENT_INFO_BLOCKS = 2


@dataclass(frozen=True)
class Partition:
    """
    Immutable description of one contiguous section of a performance data file
    """

    # Sample number at which the partition starts (sample numbering begins at 1)
    first_sample: int = 0

    # Sample number at which the partition ends
    last_sample: int = 0

    # File position of the partition's first sample (value returned by <file_handle>.tell())
    first_sample_ptr: int = 0

    # File position of the partition's last sample (value returned by <file_handle>.tell())
    last_sample_ptr: int = 0

    # How many sample blocks/loops the partition contains
    blocks_count: int = 0

    @property
    def total_samples(self):
        """
        Number of samples covered by the partition, counting both the first and the last sample
        """
        span = self.last_sample - self.first_sample
        return span + 1


@dataclass
class ParserAttributes:
    """
    Format details of an input data file, supplied by each concrete parser
    """
    # Delimiter used to split an event line into its individual values
    value_separator: str
    # NOTE(review): not referenced in this part of the file - presumably marks where sample data starts; confirm
    first_sample_indicator: str
    # strptime/strftime format used to parse and print sample timestamps
    date_format: str
    # Pattern matched against the start of a line to recognize a timestamp line
    date_pattern: Pattern[str]
    # Line that marks the end of a sample
    sample_separator: str = '----------'
    # Line that marks the end of a block
    block_separator: str = '=========='


class EventClassifier:
    """
    Determine the device an event is associated with.

    The decision is based on the event name, the number of values on the event's line and the number of
    unique os processors in the system.
    """
    def __init__(self, name: str, values_per_sample: int, num_unique_os_processors: int,
                 event_patterns: 'EventPatterns'):
        """
        :param name: the event name
        :param values_per_sample: number of values on the event's line for one sample
        :param num_unique_os_processors: number of unique os processors in the system
        :param event_patterns: compiled regular expressions used to recognize uncore, free running and
                               scoped events
        """
        self.name = name
        self.num_values = values_per_sample
        self.num_unique_os_processors = num_unique_os_processors
        self.event_patterns = event_patterns

    def classify(self) -> str:
        """
        Return the device name associated with the event

        :return: the device name associated with the event
        """
        # TODO: store a device map somewhere instead of classifying events as a device for each line (i.e. move
        #  this function elsewhere in the parser)
        # TODO: add enumerator for event_type (SYSTEM, CORE, UNCORE), core_type can be in 'device'
        # TODO: get core_type with event
        DEVICE_CORE = 'core'
        DEVICE_UNCORE = 'UNCORE'
        DEVICE_PACKAGE = 'PACKAGE'
        SCOPE_THREAD = 'THREAD'

        patterns = self.event_patterns

        # Uncore unit events: the device is 'UNC_<unit>'
        match = patterns.UNCORE_UNIT_RE.search(self.name)
        if match:
            return 'UNC_' + match.group(1)

        # Free running counters: the device is the text that follows 'FREERUN_'
        match = patterns.FREERUN_RE.search(self.name)
        if match:
            return match.group(1)

        # Package scoped free running counters are reported per socket
        match = patterns.FREERUN_SCOPED_RE.search(self.name)
        if match and match.group(1) == DEVICE_PACKAGE:
            return 'SOCKET'

        # Scoped static events: the scope itself is the device.
        # The pattern has a single capture group, so an empty or unrecognized scope simply falls through
        # to the generic checks below (asking for a second group would raise an IndexError).
        match = patterns.STATIC_SCOPED_RE.search(self.name)
        if match:
            scope = match.group(1)
            if scope in (DEVICE_PACKAGE, SCOPE_THREAD):
                return scope

        # Any remaining event is an uncore event if it is named like one, or if its line does not hold
        # exactly one more value than there are unique os processors
        if self.name.upper().startswith('UNC_') or (self.num_values - 1) != self.num_unique_os_processors:
            return DEVICE_UNCORE

        return DEVICE_CORE


class _Line:
    """
    Holds one event line of the data file together with the attributes extracted from it.

    A line carries the data collected for a single event during one sample. The line may break the event
    counts down across sockets/cores/threads/channels/etc.

    The extracted attributes (name, device, tsc count, values) start out unset and are filled in through
    the corresponding setters.
    """

    def __init__(self, line_values, unique_os_processors, timestamp):
        """
        Store the raw values of an event line; the extracted attributes are initialized to None.

        @param line_values: the values of a line from the input data file
        @param unique_os_processors: the unique os processors of the system
        @param timestamp: timestamp of the sample the line belongs to
        """
        self.line_values = line_values
        self._unique_os_processors = unique_os_processors
        self._timestamp = timestamp

        # Filled in later; until the device is set the line values are ordered as [TSC, value1, value2, ...]
        self._name = None
        self._event_patterns = None
        self._device = None
        self._tsc_count = None
        self._values = None

    # --- timestamp of the sample -------------------------------------------------------------------
    @property
    def timestamp(self):
        return self._timestamp

    @timestamp.setter
    def timestamp(self, value):
        self._timestamp = value

    # --- metric/event name (first element of the line) ---------------------------------------------
    @property
    def name(self):
        return self._name

    @name.setter
    def name(self, value):
        self._name = value

    # --- device of the event (must be set after the name) ------------------------------------------
    @property
    def device(self):
        return self._device

    @device.setter
    def device(self, value):
        self._device = value

    # --- tsc clock count (second element of the line) ----------------------------------------------
    @property
    def tsc_count(self):
        return self._tsc_count

    @tsc_count.setter
    def tsc_count(self, value):
        self._tsc_count = value

    # --- event counts (remaining elements of the line) ---------------------------------------------
    @property
    def values(self):
        return self._values

    @values.setter
    def values(self, value):
        self._values = value

    # --- read-only ---------------------------------------------------------------------------------
    @property
    def unique_os_processors(self):
        return self._unique_os_processors

    def _set_event_device(self) -> str:
        """
        Classify this line's event and return the name of the device it is associated with

        :return: the device name associated with the event
        """
        classifier = EventClassifier(self.name,
                                     len(self.line_values),
                                     len(self.unique_os_processors),
                                     self._event_patterns)
        return classifier.classify()


class DataParser(ABC):
    """
    An abstract parser class for input data files. Implement the abstract methods to create a derived parser.
    Handles event reading, event device classification, partitioning, and dataframe generation from data files.
    """

    # Class used to represent a single event line. It is instantiated for every event line of a sample,
    # so derived parsers must set it.
    line_class: Type[_Line] = None

    # NOTE(review): presumably the name of the data collector, set by derived parsers - confirm
    collector: str = None

    def __init__(self, input_file: Path, timezone: pytz.tzinfo = None):
        """
        Initialize a generic data parser

        The event info table is built here by reading the first chunk of the input file, so the derived
        class must make its system information available before calling this initializer.

        :param input_file: the data file to parse
        :param timezone: an optional timezone object for converting timestamp strings
        """
        self.input_file = input_file
        self.convert_to_datetime = None
        self._chunk_iterator = None
        self.__init_time_conversion_function(timezone)
        self.__event_info = self.__initialize_event_info()

    @property
    @abstractmethod
    def attributes(self) -> ParserAttributes:
        """
        :return: the format attributes (separators, date format, ...) of the input data file
        """
        pass

    @property
    @abstractmethod
    def system_info(self):
        """
        :return: the system information object. This class reads its `ref_tsc`, `socket_map` and
                 `unique_os_processors` attributes.
        """
        pass

    @abstractmethod
    def convert_to_datetime_from_line(self, line: str):
        """
        Convert a timestamp line of the input data file into a timestamp object

        :param line: a line that matches the parser's date pattern
        """
        pass

    @abstractmethod
    def _get_timestamps_not_found_error_class(self):
        pass

    @property
    def timestamps_not_found_error(self):
        return self._get_timestamps_not_found_error_class()

    @property
    def event_info(self) -> EventInfoDataFrame:
        """
        :return: a Pandas DataFrame with the following structure:
                     Rows: 1 row for each event
                     Columns:
                     - 'name': event name
                     - 'device': the event's device, e.g. CORE, CHA, ...

        """
        # NOTE: the internal instance is returned as is (no copy is made), so callers must not modify it
        return self.__event_info

    @property
    def first_sample_processed(self):
        """
        The first sample number in the most recent partition processed by the parser.

        :raises AttributeError: if the `event_reader` function was not called yet
        """
        if not self._chunk_iterator:
            raise AttributeError('the "first_sample_processed" attribute is available only after calling the '
                                 '"event_reader" function')
        return self._chunk_iterator.sample_tracker.first_sample_number_processed

    @property
    def last_sample_processed(self):
        """
        The last sample number processed by the parser.

        :raises AttributeError: if the `event_reader` function was not called yet
        """
        if not self._chunk_iterator:
            raise AttributeError('the "last_sample_processed" attribute is available only after calling the '
                                 '"event_reader" function')
        return self._chunk_iterator.sample_tracker.last_sample_number_processed

    def event_reader(self,
                     from_timestamp: datetime = None,
                     to_timestamp: datetime = None,
                     from_sample: int = None,
                     to_sample: int = None,
                     partition: Partition = None,
                     chunk_size=1,
                     exclude_invalid_events=True) -> Generator[RawDataFrame, None, None]:
        """
        Parse data and return a generator for values.

        :param from_timestamp: include only samples with timestamp equal to or greater than the specified value.
        :param to_timestamp: include only samples with timestamps equal to or less than the specified value.
        :param from_sample: include only samples equal to or greater than the specified sample number
                            (first sample is 1).
        :param to_sample: include only samples equal to or less than the specified sample number.
        :param partition: include only samples from the specified partition. Cannot be combined with any of the
                          `from` and `to` arguments.
                          Use the `DataParser.partition` function to generate partition objects.
        :param chunk_size: the maximum number of data blocks to include in each chunk returned.
                           A data block represents all event values collected in a single iteration of all
                           multiplexed event groups.
                           Setting this parameter to a high value may cause an out of memory error.
                           Setting this parameter to 0 will read the entire file into memory and may cause an
                           out of memory error.
        :param exclude_invalid_events: when True, events reported as invalid by the excluded event tracker are
                                       left out of the returned data.
        :return: a generator for event values, represented as a pandas dataframe with the following structure:
                 rows: a single event value
                 columns:
                   timestamp: sample timestamp
                   socket: socket/package number (0 based)
                   device: event type/source, e.g. CORE, UNCORE, ...
                   core: core number (0 based)
                   thread: hyper-thread number within the core (0 based)
                   unit: device instance number, e.g. logical core number (0 based)
                   group: the group number in which the event was collected (0 based)
                   name: event name
                   value: event value
        :raises ValueError: if the system frequency or the processor mapping is not available, or if the
                            sample timestamps are out of order

        """

        class EventContentHandler(_ContentHandler):
            """
            A content handler for splitting event samples into chunks.
            Chunks are returned as `RawDataFrame` objects.

            See the `_ContentHandler` class for additional information.
            """

            def __init__(self, data_parser: DataParser):
                super().__init__()
                self._samples = []
                self._event_buffer = []
                self._data_parser = data_parser
                self._excluded_event_tracker = _ExcludedEventTracker()

            def end_event(self, data: str):
                self._event_buffer.append(data)

            def end_sample(self, sample_number, block_number, sample_ptr=None):
                if self._event_buffer:
                    # The sample number is taken from the parser's sample tracker, not from the argument
                    sample_number = self._data_parser._chunk_iterator.sample_tracker.current_sample_number
                    self._samples.append(_Sample(self._data_parser, self._event_buffer,
                                                 sample_number, block_number))
                    # The sample parsed the buffered lines in its constructor, the buffer can be reused
                    self._event_buffer.clear()

            def get_chunk_data(self) -> Optional[RawDataFrame]:
                if not self._samples:
                    return None
                perfmon_df = self._create_raw_perfmon_df(self._samples)
                self._samples.clear()
                return perfmon_df

            def _create_raw_perfmon_df(self, samples: List[_Sample]) -> RawDataFrame:
                """
                Returns a pandas DataFrame containing a row for every event, sample, and value within the raw data file.

                @param samples: List of Sample objects, containing each individual line from the raw data capture.

                @return DataFrame: Contains a row for every event, sample, and value within the raw data file.
                """
                dataframe_builder = _EventDataFrameBuilder(self._data_parser.system_info)
                previous_sample = datetime.min
                for sample in samples:
                    if len(sample.events) == 0 and not sample.timestamp:
                        continue
                    previous_sample = self.__get_valid_timestamp(previous_sample, sample)
                    for event in sample.events.values():
                        if exclude_invalid_events and self._excluded_event_tracker.is_invalid_event(event):
                            self._excluded_event_tracker.add_exclusion(sample.timestamp, event.name)
                            continue
                        sample.detect_and_update_topdown_event_values(event.name)
                        dataframe_builder.append_event_values(sample, event)
                    dataframe_builder.append_pseudo_event_values(sample)
                events_df = dataframe_builder.to_dataframe()
                self._excluded_event_tracker.report_excluded_events()
                return RawDataFrame(events_df)

            def __get_valid_timestamp(self, previous_sample, sample):
                """
                Return the timestamp to compare the next sample against.

                Samples without a timestamp keep the previous timestamp. A timestamp that is not greater than
                the previous one is rejected with a ValueError.
                """
                if not sample.timestamp:
                    return previous_sample
                if sample.timestamp <= previous_sample:
                    input_file = os.path.basename(self._data_parser.input_file)
                    timestamp_str = sample.timestamp.strftime(self._data_parser.attributes.date_format)[:-3]
                    raise ValueError(f'Error parsing {input_file}: Timestamp {timestamp_str} out of order. '
                                     f'Remove out of order samples to resolve this issue. This is usually an '
                                     f'out-of-order duplicate timestamp at the end of the input file.')
                return sample.timestamp

        def verify_preconditions():
            if self.system_info.ref_tsc == 0:
                raise ValueError('Unable to determine system frequency. '
                                 'Please provide a system information file or '
                                 'specify a value for the system frequency')

            if len(self.system_info.socket_map) == 0:
                raise ValueError('Unable to determine processor mapping. '
                                 'Please make sure the information is included in the data file, '
                                 'or provide a processor mapping file')

        verify_preconditions()
        content_handler = EventContentHandler(self)
        with open(self.input_file, 'r') as f:
            self._chunk_iterator = _ChunkIterator(self, f, content_handler, from_timestamp, to_timestamp,
                                                  from_sample, to_sample, partition, chunk_size=chunk_size)
            self._seek_first_sample_partition(f, partition)
            for chunk in self._chunk_iterator:
                yield chunk

    def partition(self,
                  from_timestamp: datetime = None,
                  to_timestamp: datetime = None,
                  from_sample: int = None,
                  to_sample: int = None,
                  chunk_size=1) -> List[Partition]:
        """
        Partition the data file into consecutive sections. This is useful for processing the file in parallel.

        See `DataParser.event_reader` for the description of all function arguments.

        :return: a list of partition objects.
        """

        class PartitionContentHandler(_ContentHandler):
            """
            A content handler for splitting event samples into chunks representing partitions.
            Chunks are returned as `Partition` objects.

            See the `_ContentHandler` class for additional information.
            """

            def __init__(self):
                super().__init__()
                self.__first_block_in_partition = None
                self.__first_sample_in_partition = None
                self.__first_sample_ptr_in_partition = None
                self.__current_sample = self.__first_sample_in_partition
                self.__current_block = self.__first_block_in_partition
                self.__current_sample_ptr = self.__first_sample_ptr_in_partition
                self.__sample_count = 0

            def end_sample(self, sample_number, block_number, sample_ptr):
                # Remember where the partition starts the first time a sample is reported for it
                if self.__first_sample_in_partition is None:
                    self.__first_sample_in_partition = sample_number
                if self.__first_block_in_partition is None:
                    self.__first_block_in_partition = block_number
                if self.__first_sample_ptr_in_partition is None:
                    self.__first_sample_ptr_in_partition = sample_ptr

                self.__sample_count += 1
                self.__current_sample = sample_number

            def end_block(self, block_number: int):
                self.__current_block = block_number

            def get_chunk_data(self) -> Optional[Partition]:
                # No sample was reported yet, so there is nothing to partition.
                # Without this guard the comparison below would raise a TypeError (None > None).
                if self.__first_sample_in_partition is None or self.__current_sample is None:
                    return None
                # No new sample was reported since the previous partition was returned
                if self.__first_sample_in_partition > self.__current_sample:
                    return None

                partition = Partition(first_sample=self.__first_sample_in_partition,
                                      last_sample=self.__current_sample,
                                      blocks_count=self.__current_block - self.__first_block_in_partition + 1,
                                      first_sample_ptr=self.__first_sample_ptr_in_partition)
                # The next partition starts right after the one that was just returned
                self.__first_sample_in_partition = self.__current_sample + 1
                self.__first_block_in_partition = self.__current_block + 1
                self.__first_sample_ptr_in_partition = None
                return partition

        content_handler = PartitionContentHandler()
        with open(self.input_file, 'r') as f:
            chunk_iterator = _ChunkIterator(self, f, content_handler, from_timestamp, to_timestamp,
                                            from_sample, to_sample, chunk_size=chunk_size)
            partitions = [p for p in chunk_iterator]
            return partitions

    def __init_time_conversion_function(self, timezone: pytz.tzinfo):
        """
        Set `convert_to_datetime` to a function that converts a timestamp string into a datetime object.

        Without a timezone the string is parsed as is. With a timezone the parsed value is localized to
        that timezone and then converted through its POSIX timestamp.

        :param timezone: an optional timezone object for converting timestamp strings
        """
        def parse_naive(s):
            # The date format is looked up on every call, as the attributes are provided by the derived class
            return datetime.strptime(s, self.attributes.date_format)

        def parse_localized(s):
            return datetime.fromtimestamp(timezone.localize(parse_naive(s)).timestamp())

        self.convert_to_datetime = parse_naive if timezone is None else parse_localized

    def __initialize_event_info(self) -> Union[EventInfoDataFrame, pd.DataFrame]:
        """
        Return a data frame with the following structure:
          Rows: 1 row for each event
          Columns: 'name' (event name), 'device' (the event's device, e.g. CORE, CHA, ...)
        or
          Empty EventInfoDataFrame if there are no events in the data file

        :raises AttributeError: if an attribute required for reading the events is missing
        """
        reader = self.event_reader(chunk_size=MAX_REQUIRED_EVENT_INFO_BLOCKS, exclude_invalid_events=False)
        try:
            df = next(reader)
            return EventInfoGenerator(df).event_info
        except StopIteration:
            return EventInfoDataFrame(pd.DataFrame())
        except AttributeError as e:
            raise AttributeError('system information (parser._system_info), must be initialized before '
                                 'requesting event info') from e
        finally:
            # Only the first chunk is needed: close the generator so that the input file is released
            # right away instead of whenever the generator gets garbage collected
            reader.close()

    def _seek_first_sample_partition(self, f, partition):
        """
        Move the file position to the first sample of the partition. Does nothing if no partition is specified.

        :param f: handle of the input data file
        :param partition: the partition to read, or None
        :raises TypeError: if the partition does not hold a valid pointer to its first sample
        """
        if partition is not None:
            try:
                f.seek(partition.first_sample_ptr)
            except TypeError as e:
                raise TypeError(f'Error parsing {self.input_file}: '
                                f'Confirm that the file includes valid sample data') from e


class _TopdownEventInfo:
    """
    Collects, for one sample, the values needed to adjust the PERF_METRICS.* (topdown) events:
    a divisor (the sum of the normalizing topdown events) and a multiplier (the values of the
    TOPDOWN.SLOTS:perf_metrics event).
    """
    MULTIPLIER_EVENT = 'TOPDOWN.SLOTS:perf_metrics'
    PERF_METRICS = 'perf_metrics'
    TOPDOWN_SLOTS = 'TOPDOWN.SLOTS'
    NORMALIZED_TOPDOWN_EVENTS = ['PERF_METRICS.RETIRING', 'PERF_METRICS.BAD_SPECULATION',
                                 'PERF_METRICS.FRONTEND_BOUND', 'PERF_METRICS.BACKEND_BOUND']

    def __init__(self):
        self.__perf_metrics: List[str] = []
        self.__divisor_values: Union[np.ndarray, None] = None
        self.__multiplier_values: Union[np.ndarray, None] = None
        self.__skip_sample = False

    def set_info(self, events: Dict[str, _Line], timestamp: datetime):
        """
        Extract the topdown related information from the events of a sample.
        Does nothing if the sample has no topdown events.

        :param events: the events of the sample, by event name
        :param timestamp: the sample timestamp, used when reporting missing topdown events
        """
        topdown_events = {}
        for key, event in events.items():
            if self.__is_topdown_event(event.name):
                topdown_events[key] = event
        if not topdown_events:
            return

        values_by_event = {key: event.values for key, event in topdown_events.items()}
        self.__perf_metrics = [key for key in topdown_events if key.lower().startswith(self.PERF_METRICS)]
        # The divisor is computed before the multiplier is looked up
        self.__set_normalized_topdown_event_divisor_values(values_by_event, timestamp)
        self.__set_topdown_event_multiplier_values(events)

    def is_valid(self, event_name: str):
        """
        :return: True if the values of the specified event can be adjusted: the sample is not skipped,
                 both the divisor and the multiplier are known, and the event is a perf metrics event
        """
        if self.__skip_sample:
            return False
        if self.__divisor_values is None or self.__multiplier_values is None:
            return False
        return event_name in self.__perf_metrics

    def get_updated_values(self, event_values: np.ndarray) -> np.ndarray:
        """
        :return: the event values divided by the divisor values and multiplied by the multiplier values
        """
        normalized = event_values / self.__divisor_values
        return normalized * self.__multiplier_values

    def __set_topdown_event_multiplier_values(self, events: Dict[str, _Line]):
        if self.MULTIPLIER_EVENT not in events:
            return
        self.__multiplier_values = events[self.MULTIPLIER_EVENT].values

    def __set_normalized_topdown_event_divisor_values(self, values, timestamp: str):
        normalized_events = [name for name in self.__perf_metrics if self.__is_normalizing_topdown_event(name)]
        if self.__all_topdown_events_exist_in_sample(normalized_events):
            self.__divisor_values = reduce(np.add, [values[name] for name in normalized_events])
            return

        # Some (or all) of the normalizing events are missing: the sample cannot be normalized.
        # A warning is issued only when the sample holds a part of them.
        if normalized_events:
            missing_topdown_events = set(_TopdownEventInfo.NORMALIZED_TOPDOWN_EVENTS) - set(normalized_events)
            logging.warning(f'Missing the following topdown event/s at timestamp {timestamp}:'
                            f' {missing_topdown_events} '
                            f'Skipping topdown event normalization. To avoid this warning, remove the following '
                            f'event/s at that timestamp from the input data file: {normalized_events}')
        self.__skip_sample = True

    def __all_topdown_events_exist_in_sample(self, normalized_events):
        expected_count = len(_TopdownEventInfo.NORMALIZED_TOPDOWN_EVENTS)
        return len(normalized_events) == expected_count

    def __is_topdown_event(self, event_name: str) -> bool:
        if event_name.lower().startswith(self.PERF_METRICS):
            return True
        return event_name.startswith(self.TOPDOWN_SLOTS)

    @staticmethod
    def __is_normalizing_topdown_event(event_name: str) -> bool:
        return event_name in _TopdownEventInfo.NORMALIZED_TOPDOWN_EVENTS


class _Sample:
    """
    Store a sample from the input data file, extracting the relevant information as class attributes.

    Extract attributes from a sample in the raw file.  Each line with the same timestamp represents a sample.
    Within a sample there are various events collected.  These events are listed on their own lines and are stored
    in a Line/Event object.

    Each sample starts with a date line.
    """

    def __init__(self, perfmon_parser: DataParser, data, sample_number, block_number):
        """
        Parse the lines of a sample

        :param perfmon_parser: the parser reading the data file. Provides the file attributes, the system
                               information and the class used to represent event lines
        :param data: the lines of the sample
        :param sample_number: the sample number
        :param block_number: the number of the block the sample belongs to
        :raises ValueError: if the topdown events of the sample cannot be processed
        """
        self.timestamp = None
        self.timestamps = set()
        self.__perfmon_parser = perfmon_parser
        self.events: Dict[str, _Line] = {}
        self.sample_number = sample_number
        self.block_number = block_number
        ref_tsc = self.__perfmon_parser.system_info.ref_tsc
        self._line_class = perfmon_parser.line_class
        self._topdown_info = _TopdownEventInfo()

        for line in data:
            line_values = line.split(self.__perfmon_parser.attributes.value_separator)
            if self.__perfmon_parser.attributes.date_pattern.match(line):
                self.timestamp = self.__perfmon_parser.convert_to_datetime_from_line(line)
            if self.__line_contains_event_values(line_values):
                try:
                    event = self._line_class(line_values, self.__perfmon_parser.system_info.unique_os_processors,
                                             self.timestamp)
                    self.events[event.name] = event
                    self.timestamps.add(event.timestamp)
                except Exception as e:
                    # Invalid line - report it, ignore it and continue with the rest of the sample
                    logging.warning('Ignoring invalid event line in sample %s: %s', sample_number, e)

        if self.events:
            # The tsc_count is the same for all lines of the sample, so it is taken from the first event.
            # The duration is calculated from the tsc_count.
            self.tsc_count = next(iter(self.events.values())).tsc_count
            self.duration = self.tsc_count / ref_tsc
        else:
            # No events in the sample: set the tsc_count and duration to 0
            self.tsc_count = 0
            self.duration = 0

        try:
            if self.timestamp:
                self._topdown_info.set_info(self.events, self.timestamp.strftime(
                    self.__perfmon_parser.attributes.date_format)[:-3])
        except ValueError as e:
            raise ValueError(f' {e}: Corrupt input data in sample {sample_number} at timestamp '
                             f'{self.timestamp.strftime(self.__perfmon_parser.attributes.date_format)[:-3]}. There '
                             f'is an unrecoverable error in the data file. Users can try deleting the '
                             f'corresponding sample.') from e

    def detect_and_update_topdown_event_values(self, event_name: str):
        """
        Adjust the values of the PERF_METRICS.* (TMA) events

        :param event_name: name of the event to adjust. Events that are not adjustable are left unchanged.
        """
        if self._topdown_info.is_valid(event_name):
            self.events[event_name].values = self._topdown_info.get_updated_values(self.events[event_name].values)

    @staticmethod
    def __line_contains_event_values(line_values):
        # A line holds event values when splitting it yields more than one element
        return len(line_values) > 1


class _SampleTracker:
    """
    Helper that keeps track of the current sample and tells whether it is within the requested range.

    The range is expressed either in sample numbers or in timestamps, depending on the arguments provided.
    """

    class _FilterMode(Enum):
        TIMESTAMP = auto()
        SAMPLE = auto()

    def __init__(self, from_sample, to_sample, from_timestamp, to_timestamp, partition):
        self.__current_sample = None
        self.__current_sample_timestamp = datetime(1970, 1, 1)
        self.__first_sample_number_processed = 0
        self.__is_first_processed_sample_updated = False

        if not partition:
            self.__current_sample_number = 0
            self.__current_sample_ptr = None
            self.__init_sample_range(from_sample, from_timestamp, to_sample, to_timestamp)
            return

        # A partition defines the range by sample numbers, and tracking starts right before its first sample
        self.__init_sample_range(partition.first_sample, None, partition.last_sample, None)
        self.__current_sample_number = partition.first_sample - 1
        self.__current_sample_ptr = partition.first_sample_ptr

    def __is_sample_mode(self):
        return self.__mode == self._FilterMode.SAMPLE

    @property
    def from_sample(self):
        # Lower bound of the range: a sample number or a timestamp, depending on the filter mode
        if self.__is_sample_mode():
            return self.__from_sample
        return self.__from_timestamp

    @property
    def to_sample(self):
        # Upper bound of the range: a sample number or a timestamp, depending on the filter mode
        if self.__is_sample_mode():
            return self.__to_sample
        return self.__to_timestamp

    @property
    def current_sample(self):
        # Position of the current sample, in the same unit as the range bounds
        if self.__is_sample_mode():
            return self.__current_sample_number
        return self.__current_sample_timestamp

    @property
    def current_sample_number(self):
        return self.__current_sample_number

    @property
    def current_sample_ptr(self):
        return self.__current_sample_ptr

    @current_sample_ptr.setter
    def current_sample_ptr(self, ptr: int):
        self.__current_sample_ptr = ptr

    @property
    def first_sample_number_processed(self):
        return self.__first_sample_number_processed

    @property
    def last_sample_number_processed(self):
        # When the current sample is already past the range, the last processed sample is the one before it
        past_range = self.is_current_sample_greater_than_range_max()
        return self.__current_sample_number - 1 if past_range else self.__current_sample_number

    def process(self, sample_timestamp: datetime) -> None:
        """
        Advance to the next sample and record the first sample that falls within the range
        """
        self.__current_sample_number += 1
        self.__current_sample_timestamp = sample_timestamp
        if self.__is_first_processed_sample_updated:
            return
        if self.is_current_sample_in_range():
            self.__first_sample_number_processed = self.__current_sample_number
            self.__is_first_processed_sample_updated = True

    def is_current_sample_in_range(self) -> bool:
        return self.from_sample <= self.current_sample <= self.to_sample

    def is_current_sample_greater_than_range_max(self) -> bool:
        return self.current_sample > self.to_sample

    def __init_sample_range(self, from_sample, from_timestamp, to_sample, to_timestamp):
        # Timestamps take precedence: sample numbers are used only when no timestamp is specified
        if not (from_timestamp or to_timestamp):
            self.__mode = self._FilterMode.SAMPLE
            self.__from_sample = from_sample or 1
            self.__to_sample = to_sample or sys.maxsize
            self.__from_timestamp = None
            self.__to_timestamp = None
            return

        self.__mode = self._FilterMode.TIMESTAMP
        self.__from_timestamp = from_timestamp or datetime(1970, 1, 1)
        self.__to_timestamp = to_timestamp or datetime.max
        self.__from_sample = None
        self.__to_sample = None


class _ContentHandler:
    """
    Callback interface through which the data parser reports what it reads.

    Callbacks are declared in the same order in which the corresponding information appears in the
    input file. Every callback of this base class does nothing and returns None.
    """

    def __init__(self):
        """
        Marks the beginning of the performance data (before the first event data in the file is read).

        The parser invokes this once, ahead of every other method of this interface.
        """

    def end_file(self):
        """
        Marks the end of the performance data (after the last line of the data file has been read).

        The parser invokes this once; it is the last method invoked during the parse.
        """

    def end_sample(self, sample_number: int, block_number: int, sample_ptr: int):
        """
        Marks the end of a sample.

        Invoked every time the parser encounters its sample separator (e.g. '----------').

        :param sample_number: number of the sample; the first sample is 1.
        :param block_number: number of the block; the first block is 1.
        :param sample_ptr: the pointer at the end of the sample
        """

    def end_block(self, block_number: int):
        """
        Marks the end of a block.

        Invoked every time the parser encounters its block separator (e.g. '==========').

        :param block_number: number of the block; the first block is 1.
        """

    def end_event(self, data: str):
        """
        Marks the end of the data of a single performance event.

        Invoked every time the parser encounters an event data line.

        :param data: the event data.
        """

    def get_chunk_data(self) -> Optional[Any]:
        """
        Marks the end of a chunk, which contains one or more blocks.

        Invoked after the parser has gone through a number of blocks that is less than or equal to the
        specified chunk size. The content handler is expected to hand back the data of the chunk.

        :return: the chunk data, or None when there is nothing to return.
                 The return type is implementation specific: each content handler decides what to return.
        """


class _ChunkIterator:
    """
    Iterator that produces chunks of performance event data. Chunks are aligned to block boundaries.

    The iterator reads the data file line by line, reports samples, blocks and events to the content
    handler, and returns whatever the handler's `get_chunk_data` produces each time a chunk is complete.
    """

    def __init__(self,
                 parser: DataParser,
                 file: TextIO,
                 handler: _ContentHandler,
                 from_timestamp: Optional[datetime] = None,
                 to_timestamp: Optional[datetime] = None,
                 from_sample: Optional[int] = None,
                 to_sample: Optional[int] = None,
                 partition: Optional[Partition] = None,
                 chunk_size: int = 1):
        """
        Initialize the chunk iterator

        The file is advanced past the line that starts with the parser's first sample indicator, so that
        iteration begins at the first sample.

        :param parser: the parser object that owns this chunk iterator.
        :param file: an open file handle of the data file to parse.
        :param handler: a content handler object that implements the performance data parser interface.
        :param chunk_size: number of blocks per chunk. With 0, no chunk is returned at block boundaries and
                           all the data is returned in a single chunk when the iteration ends.

        See `DataParser.event_reader` for the description of all other arguments.

        :raises ValueError: if the range arguments are inconsistent with each other, if `chunk_size` is
                            negative, or if the first sample indicator line is missing from the file.
        """

        def verify_preconditions():
            if from_sample and from_timestamp:
                raise ValueError('The "from_sample" and "from_timestamp" arguments are mutually exclusive')
            if to_sample and to_timestamp:
                raise ValueError('The "to_sample" and "to_timestamp" arguments are mutually exclusive')
            if (from_sample and to_timestamp) or (from_timestamp and to_sample):
                raise ValueError('Cannot use both sample numbers and timestamps to specify a sample range')
            if from_timestamp and to_timestamp and from_timestamp > to_timestamp:
                raise ValueError('The specified "from_timestamp" value must be less than or equal to the '
                                 'specified "to_timestamp" value')
            if from_sample is not None and from_sample <= 0:
                raise ValueError('The specified "from_sample" value must be greater than 0')
            if to_sample is not None and to_sample <= 0:
                raise ValueError('The specified "to_sample" value must be greater than 0')
            if from_sample is not None and to_sample is not None and from_sample > to_sample:
                raise ValueError('The specified "from_sample" value must be less than or equal to the '
                                 'specified "to_sample" value')
            if chunk_size < 0:
                raise ValueError('The specified "chunk_size" value must be greater than or equal to 0')

            range_arguments = (from_sample, from_timestamp, to_sample, to_timestamp)
            if partition is not None and any(argument is not None for argument in range_arguments):
                raise ValueError('The "partition" parameter cannot be combined with any of the "from" or "to" '
                                 'parameters')

        verify_preconditions()
        self._next_block_number = 1
        self._file = file
        self._handler = handler
        self._chunk_size = chunk_size
        self._parser = parser
        self.sample_tracker = _SampleTracker(from_sample, to_sample, from_timestamp, to_timestamp, partition)
        self._end_of_file = False
        self._timestamp_detected = False
        self._skip_lines_until_first_sample()

    def __iter__(self):
        return self

    def __next__(self):
        """
        Return the next data chunk.
        The type and content of the chunk is determined by the content handler of the iterator.

        :raises StopIteration: when the iteration is over and there is no more data to return.
        :raises Exception: the error created by the parser's `timestamps_not_found_error()` when the final
                           chunk is empty, non-empty lines were read and no timestamp line was ever seen.
        """
        # TODO: refactor this function (THORS-197)
        sample_detected = False
        start_sample = True
        if self._end_of_file:
            raise StopIteration()

        # Find the end-of-file position, then restore the current position
        line_ptr = self._file.tell()
        offset = 0
        end_of_file_ptr = self._file.seek(offset, os.SEEK_END)
        self._file.seek(line_ptr)

        while line_ptr != end_of_file_ptr:
            # Position of the line about to be read; becomes the sample pointer if the line starts a sample
            line_ptr = self._file.tell()
            line = self._file.readline().strip()

            # Skip empty lines
            if not line:
                continue

            sample_detected = True

            # Match the timestamp pattern only once per line
            if self._is_timestamp(line):
                self._timestamp_detected = True
                if start_sample:
                    sample_timestamp = self._parser.convert_to_datetime_from_line(line)
                    self.sample_tracker.process(sample_timestamp)
                    self.sample_tracker.current_sample_ptr = line_ptr
                    start_sample = False
                    # NOTE: If there is ever a need to notify content handlers on sample start,
                    #       insert the notification here:
                    #       `self._handler.start_sample(sample_timestamp)`

            # Find the first sample to process
            if not self.sample_tracker.is_current_sample_in_range():
                if self.sample_tracker.is_current_sample_greater_than_range_max():
                    # We passed the last sample to process, so terminate the iteration
                    # (same as reaching the end of the file)
                    break
                if self._is_separator(line):
                    start_sample = True
                continue

            if self._is_separator(line):
                self._handler.end_sample(self.sample_tracker.last_sample_number_processed, self._next_block_number,
                                         self.sample_tracker.current_sample_ptr)
                start_sample = True
                if self._is_block_separator(line):
                    current_block_number = self._next_block_number
                    self._next_block_number += 1
                    self._handler.end_block(current_block_number)
                    # A chunk is complete every `chunk_size` blocks; a chunk size of 0 never ends a chunk here
                    if self._chunk_size > 0 and (current_block_number % self._chunk_size == 0):
                        return self._handler.get_chunk_data()
            else:
                self._handler.end_event(line)

        # We reach here when we're done processing the file
        self._handler.end_sample(self.sample_tracker.last_sample_number_processed, self._next_block_number,
                                 self.sample_tracker.current_sample_ptr)
        self._handler.end_block(self._next_block_number)
        last_chunk = self._handler.get_chunk_data()
        self._handler.end_file()
        self._end_of_file = True
        if last_chunk is not None:
            return last_chunk
        elif sample_detected and not self._timestamp_detected:
            raise self._parser.timestamps_not_found_error()
        raise StopIteration()

    def _skip_lines_until_first_sample(self):
        """
        Advance the file past the line that starts with the parser's first sample indicator.

        :raises ValueError: if the end of the file is reached without finding such a line.
        """
        line = self._file.readline()
        while line:
            line = line.strip()
            if line.startswith(self._parser.attributes.first_sample_indicator):
                return
            line = self._file.readline()
        raise ValueError(f"'{self._parser.attributes.first_sample_indicator}' line is missing from file")

    def _is_separator(self, line) -> bool:
        """
        Tell whether the (stripped) line is a sample separator or a block separator.
        """
        return line == self._parser.attributes.sample_separator or self._is_block_separator(line)

    def _is_block_separator(self, line) -> bool:
        """
        Tell whether the (stripped) line is a block separator.
        """
        return line == self._parser.attributes.block_separator

    def _is_timestamp(self, line: str) -> bool:
        """
        Tell whether the line matches the parser's date pattern at its beginning.
        """
        return self._parser.attributes.date_pattern.match(line) is not None


class _EventDataFrameBuilder:
    """
    Utility class for building a data frame from event values

    Rows are accumulated as tuples whose element order follows RawDataFrameColumns, and are turned into
    a data frame by `to_dataframe`.
    """

    def __init__(self, system_info: 'SystemInformationParserInterface'):
        """
        :param system_info: system information used to map processors and units to their topology
        """
        self.system_info = system_info
        self.__data = []

    def append_event_values(self, sample: '_Sample', event: '_Line') -> None:
        """
        Append one row per value of the event, using the row layout that matches the event's device.

        :param sample: the sample the event belongs to
        :param event: the event whose values are appended
        """
        unit_count = event.values.size
        device = event.device.lower()
        core_devices = ['core'] + self.system_info.unique_core_types
        # TODO: change to factory and introduce polymorphism?
        if device in core_devices:
            self._append_core_event(sample, event, unit_count)
        elif device == 'thread':
            self._append_thread_event(sample, event, unit_count)
        elif self.__event_is_asymmetric(event):
            self._append_asymmetric_noncore_event(sample, event, unit_count, self.__get_pcie_id(event))
        else:
            self._append_symmetric_noncore_event(sample, event, unit_count)

    def append_pseudo_event_values(self, sample: '_Sample') -> None:
        """
        Append the pseudo events of the sample: 'TSC', '$samplingTime' and '$processed_samples'.

        :param sample: the sample to append pseudo events for
        """
        self._append_tsc(sample)
        self._append_sampling_time(sample)
        self._append_processed_samples(sample)

    def to_dataframe(self) -> pd.DataFrame:
        """
        Build a data frame from the accumulated rows, dropping the rows that have no value.
        """
        df = pd.DataFrame(data=self.__data, columns=rdc.COLUMNS)
        return df.dropna(subset=[rdc.VALUE])

    def _processor_row(self, sample: '_Sample', processor, core_type, event_name: str, value) -> tuple:
        """
        Build the row of a per-processor event value.
        """
        info = self.system_info
        # tuple parameter order follows RawDataFrameColumns. All entries must be accounted for
        return (sample.timestamp,
                info.socket_map[processor],
                core_type,
                info.core_map[processor],
                info.thread_map[processor],
                processor,
                info.module_map[processor],
                info.die_map.get(processor, np.nan),
                sample.tsc_count,
                sample.block_number,
                event_name,
                value)

    @staticmethod
    def _noncore_row(sample: '_Sample', event: '_Line', socket, unit, index: int) -> tuple:
        """
        Build the row of a non-core event value; the core, thread, module and die entries are None.
        """
        # tuple parameter order follows RawDataFrameColumns. All entries must be accounted for
        return (sample.timestamp, socket, event.device, None, None, unit, None, None,
                sample.tsc_count, sample.block_number, event.name, event.values[index])

    def _append_per_processor_value(self, sample: '_Sample', event_name: str, value) -> None:
        """
        Append one row per OS processor, all carrying the same event name and value.
        """
        core_type_map = self.system_info.core_type_map
        self.__data.extend([self._processor_row(sample, processor, core_type_map[processor], event_name, value)
                            for processor in self.system_info.unique_os_processors])

    def _append_core_event(self, sample: '_Sample', event: '_Line', unit_count: int) -> None:
        """
        Append one row per OS processor for a core event; processors beyond `unit_count` are skipped.
        """
        core_type_map = self.system_info.core_type_map
        self.__data.extend([self._processor_row(sample, processor, core_type_map[processor],
                                                event.name, event.values[index])
                            for index, processor in enumerate(self.system_info.unique_os_processors)
                            if index < unit_count])

    def _append_thread_event(self, sample: '_Sample', event: '_Line', unit_count: int) -> None:
        """
        Append one row per OS processor for a thread event; processors beyond `unit_count` are skipped.

        The core type entry is the event device and the processor's core type joined by '_'.
        """
        core_type_map = self.system_info.core_type_map
        self.__data.extend([self._processor_row(sample, processor, event.device + '_' + core_type_map[processor],
                                                event.name, event.values[index])
                            for index, processor in enumerate(self.system_info.unique_os_processors)
                            if index < unit_count])

    def _append_asymmetric_noncore_event(self, sample: '_Sample', event: '_Line', unit_count: int,
                                         pcie_id: str) -> None:
        """
        Append one row per unit, taking the socket of each unit from the PCIe unit table of `pcie_id`.
        """
        socket_map = self.system_info.pcie_units[pcie_id]
        self.__data.extend([self._noncore_row(sample, event, socket_map[index], index, index)
                            for index in range(unit_count)])

    def _append_symmetric_noncore_event(self, sample: '_Sample', event: '_Line', unit_count: int) -> None:
        """
        Append one row per unit, assuming the units are evenly distributed across the sockets.
        """
        socket_count = np.unique(list(self.system_info.socket_map.values())).size
        # An event that reports fewer values than there are sockets would yield 0 units per socket and
        # a division by zero below; treat it as one unit per socket instead
        units_per_socket = max(unit_count // socket_count, 1)
        self.__data.extend([self._noncore_row(sample, event, index // units_per_socket,
                                              index % units_per_socket, index)
                            for index in range(unit_count)])

    def _append_tsc(self, sample: '_Sample') -> None:
        # Inject the TSC value as a set of TSC core events. Some metric formulas need TSC as a core event
        self._append_per_processor_value(sample, 'TSC', sample.tsc_count)

    def _append_sampling_time(self, sample: '_Sample') -> None:
        # Inject the sample duration as a per-processor '$samplingTime' event
        self._append_per_processor_value(sample, '$samplingTime', sample.duration)

    def _append_processed_samples(self, sample: '_Sample') -> None:
        # Inject a per-processor '$processed_samples' event with the value 1
        self._append_per_processor_value(sample, '$processed_samples', 1)

    def __event_is_asymmetric(self, event):
        """
        Tell whether the device name contains '_' and its last '_'-separated part is a known PCIe unit id.
        """
        return '_' in event.device and self.__get_pcie_id(event) in self.system_info.pcie_units.keys()

    @staticmethod
    def __get_pcie_id(event):
        """
        Return the upper-cased text that follows the last '_' of the device name.
        """
        return event.device.split('_')[-1].upper()


class _TimeStampsNotFoundError(Exception):
    """
    Error signalling that no samples labeled with timestamps were found.
    """

    # Message used when the caller does not supply one
    _DEFAULT_MESSAGE = ("No samples labeled with timestamps found. "
                        "Ensure your collector has a timestamp next to each sample or event.")

    def __init__(self, message=_DEFAULT_MESSAGE):
        """
        :param message: the error message; defaults to a hint about missing timestamps
        """
        super().__init__(message)


class _ExcludedEventTracker:
    """
    Keeps a per-event tally of exclusions and prints a summary of them on request.
    """

    @dataclass
    class Info:
        # Number of exclusions recorded for the event so far
        exclusion_count: int
        # Timestamp passed with the first exclusion of the event
        first_timestamp: datetime

    def __init__(self):
        self.__exclusion_map: Dict[str, _ExcludedEventTracker.Info] = {}

    def add_exclusion(self, timestamp: datetime, event_name: str):
        """
        Record one exclusion of an event.

        The first exclusion of an event stores the timestamp; later exclusions only increment the count.

        :param timestamp: timestamp of the sample
        :param event_name: name of the excluded event
        """
        info = self.__exclusion_map.get(event_name)
        if info is None:
            self.__exclusion_map[event_name] = _ExcludedEventTracker.Info(1, timestamp)
        else:
            info.exclusion_count += 1

    def report_excluded_events(self):
        """
        Print one line per excluded event to stdout, preceded by an empty line.

        Nothing is printed when no exclusion has been recorded.
        """
        if not self.__exclusion_map:
            return

        print()
        for name, info in self.__exclusion_map.items():
            # Drop the last 3 digits of the microseconds, leaving milliseconds
            first_timestamp = info.first_timestamp.strftime("%m/%d/%Y %H:%M:%S.%f")[:-3]
            print(f'{name} excluded {info.exclusion_count} times due to excessively large counts '
                  f'(first exclusion at {first_timestamp})')
        # Flush once for the whole report rather than after every line
        sys.stdout.flush()

    def is_invalid_event(self, event: '_Line') -> bool:
        """
        Tell whether the event has an excessively large value.

        An event is invalid when any of its values exceeds 4096 times its TSC count, unless its name
        identifies it as a static register.

        :param event: the event to evaluate
        :return: True if the event is invalid, False otherwise
        """
        # Cheap name check first: static registers are never invalid, so their values need no scan
        if self.__is_static_register(event.name):
            return False
        max_sample_multiplier = 4096
        max_value = event.tsc_count * max_sample_multiplier
        return bool(np.any(event.values > max_value))

    @staticmethod
    def __is_static_register(event_name: str):
        """
        Tell whether the event name contains one of the static register markers.
        """
        static_register_names = ['REG_STATIC_', 'type=STATIC:']
        return any(name in event_name for name in static_register_names)
