From 612c895ec20023d08381426bed8239ddeea95e41 Mon Sep 17 00:00:00 2001 From: Hetang Modi Date: Wed, 16 Feb 2022 20:38:24 +0530 Subject: [PATCH 1/2] ci(semgrep): fix semgrep and lint failures --- .semgrepignore | 13 +++++++++++ cloudconnectlib/common/util.py | 2 +- cloudconnectlib/configuration/loader.py | 7 ++++-- cloudconnectlib/core/engine.py | 10 ++++++-- cloudconnectlib/core/http.py | 5 +++- cloudconnectlib/core/plugin.py | 6 ++++- cloudconnectlib/core/task.py | 18 +++++++++++---- cloudconnectlib/core/template.py | 5 +++- .../splunktacollectorlib/common/__init__.py | 2 +- .../splunktacollectorlib/config.py | 23 +++++++++++++------ .../data_collection/ta_data_collector.py | 13 +++++++---- .../data_collection/ta_data_loader.py | 2 +- .../data_collection/ta_helper.py | 10 +++++--- .../data_collection/ta_mod_input.py | 14 ++++++----- .../splunk_ta_import_declare.py | 2 +- 15 files changed, 97 insertions(+), 35 deletions(-) create mode 100644 .semgrepignore diff --git a/.semgrepignore b/.semgrepignore new file mode 100644 index 0000000..85c66b9 --- /dev/null +++ b/.semgrepignore @@ -0,0 +1,13 @@ +# Ignore git items +.gitignore +.git/ +:include .gitignore + +# Test cases, examples and workflows +tests/ +.github/ +examples/ +tool/ + +# For removed files +deprecated_files/ \ No newline at end of file diff --git a/cloudconnectlib/common/util.py b/cloudconnectlib/common/util.py index 1e188ae..e1f9534 100755 --- a/cloudconnectlib/common/util.py +++ b/cloudconnectlib/common/util.py @@ -48,7 +48,7 @@ def load_json_file(file_path): :param file_path: JSON file path. :return: A `dict` object. 
""" - with open(file_path) as file_pointer: + with open(file_path, encoding="UTF-8") as file_pointer: return json.load(file_pointer) diff --git a/cloudconnectlib/configuration/loader.py b/cloudconnectlib/configuration/loader.py index c9ac7a6..6ea3230 100755 --- a/cloudconnectlib/configuration/loader.py +++ b/cloudconnectlib/configuration/loader.py @@ -64,7 +64,7 @@ def _get_schema_from_file(schema_file): """ try: return load_json_file(schema_file) - except: + except Exception: raise ConfigException( "Cannot load schema from file {}: {}".format( schema_file, traceback.format_exc() @@ -236,7 +236,10 @@ def _load_processor(self, processor): pipeline = self._parse_tasks(processor.get("pipeline", [])) return Processor(skip_conditions=skip_conditions, pipeline=pipeline) - def _load_request(self, request): + # nosemgrep reason - false positive, `request` is a dict here + def _load_request( # nosemgrep: python.django.security.audit.django-ratelimit.missing-ratelimit.missing-ratelimit + self, request + ): options = self._load_options(request["request"]) pre_process = self._load_processor(request.get("pre_process", {})) diff --git a/cloudconnectlib/core/engine.py b/cloudconnectlib/core/engine.py index 4197451..21eaf38 100755 --- a/cloudconnectlib/core/engine.py +++ b/cloudconnectlib/core/engine.py @@ -92,7 +92,10 @@ class Job: reached it's stop condition. """ - def __init__(self, request, context, checkpoint_mgr, proxy=None): + # nosemgrep reason - false positive, `request` is a Munch object which has settings for making a request + def __init__( # nosemgrep: python.django.security.audit.django-ratelimit.missing-ratelimit.missing-ratelimit + self, request, context, checkpoint_mgr, proxy=None + ): """ Constructs a `Job` with properties request, context and a optional proxy setting. 
@@ -301,7 +304,10 @@ def _run(self): _logger.info("Stop condition reached, exit job now") break - def _send_request(self, request): + # nosemgrep reason - false positive, `request` is a dict here + def _send_request( # nosemgrep: python.django.security.audit.django-ratelimit.missing-ratelimit.missing-ratelimit + self, request + ): """Do send request with a simple error handling strategy. Refer to https://confluence.splunk.com/display/PROD/CC+1.0+-+Detail+Design""" try: diff --git a/cloudconnectlib/core/http.py b/cloudconnectlib/core/http.py index 0bcbd07..848353f 100755 --- a/cloudconnectlib/core/http.py +++ b/cloudconnectlib/core/http.py @@ -288,7 +288,10 @@ def _initialize_connection(self): _logger.info("Proxy is not enabled for http connection.") self._connection = self._build_http_connection(self._proxy_info) - def send(self, request): + # nosemgrep reason - false positive, `request` is a dict here having configs required for making request + def send( # nosemgrep: python.django.security.audit.django-ratelimit.missing-ratelimit.missing-ratelimit + self, request + ): if not request: raise ValueError("The request is none") if request.body and not isinstance(request.body, str): diff --git a/cloudconnectlib/core/plugin.py b/cloudconnectlib/core/plugin.py index c40451f..6b91d8b 100644 --- a/cloudconnectlib/core/plugin.py +++ b/cloudconnectlib/core/plugin.py @@ -86,7 +86,11 @@ def import_plugin_file(file_name): return try: - importlib.import_module(module_name) + # nosemgrep reason - this function only imports a custom cce_plugin_.py, + # which is provided by an addon and programmatically provided. 
+ importlib.import_module( # nosemgrep: python.lang.security.audit.non-literal-import.non-literal-import + module_name + ) except Exception: logger.warning(f"Failed to load module {module_name}, {traceback.format_exc()}") return diff --git a/cloudconnectlib/core/task.py b/cloudconnectlib/core/task.py index e8246eb..8e81349 100644 --- a/cloudconnectlib/core/task.py +++ b/cloudconnectlib/core/task.py @@ -88,7 +88,10 @@ def render(self, context): class RequestTemplate: - def __init__(self, request): + # nosemgrep reason - false positive, `request` is a dict here having configs required for making request + def __init__( # nosemgrep: python.django.security.audit.django-ratelimit.missing-ratelimit.missing-ratelimit + self, request + ) if not request: raise ValueError("The request is none") url = request.get("url") @@ -295,7 +298,8 @@ def perform(self, context): try: invoke_results = self._process_handler.execute(context) except Exception: - logger.exception("Task=%s encountered exception", self) + # Fixing `python.lang.best-practice.logging-error-without-handling.logging-error-without-handling` + logger.warning("Task=%s encountered exception", self) raise CCESplitError if not invoke_results or not invoke_results.get(CCESplitTask.OUTPUT_KEY): raise CCESplitError @@ -316,7 +320,10 @@ class CCEHTTPRequestTask(BaseTask): from context when executing. """ - def __init__(self, request, name, meta_config=None, task_config=None, **kwargs): + # nosemgrep reason - false positive, `request` is a dict here having configs required for making request + def __init__( + self, request, name, meta_config=None, task_config=None, **kwargs + ): # nosemgrep: python.django.security.audit.django-ratelimit.missing-ratelimit.missing-ratelimit """ :param verify: Absolute path to server certificate, otherwise uses requests' default certificate to verify server's TLS certificate. 
@@ -464,7 +471,10 @@ def _should_exit(self, done_count, context): return True return False - def _send_request(self, request): + # nosemgrep reason - false positive, `request` is a dict here having configs required for making request + def _send_request( + self, request + ): # nosemgrep: python.django.security.audit.django-ratelimit.missing-ratelimit.missing-ratelimit try: response = self._http_client.send(request) except HTTPError as error: diff --git a/cloudconnectlib/core/template.py b/cloudconnectlib/core/template.py index fe8c1d4..90200aa 100755 --- a/cloudconnectlib/core/template.py +++ b/cloudconnectlib/core/template.py @@ -31,6 +31,9 @@ def translate_internal(context): if match: context_var = context.get(match.groups()[0]) return context_var if context_var else "" - return _template.render(context) + # nosemgrep reason - the `context` passed is provided by addon, and it should be sanitized by the addon. + return _template.render( # nosemgrep: python.flask.security.xss.audit.direct-use-of-jinja2.direct-use-of-jinja2 # noqa: E501 - semgrep name is too long + context + ) return translate_internal diff --git a/cloudconnectlib/splunktacollectorlib/common/__init__.py b/cloudconnectlib/splunktacollectorlib/common/__init__.py index 4c58c45..d5634b5 100755 --- a/cloudconnectlib/splunktacollectorlib/common/__init__.py +++ b/cloudconnectlib/splunktacollectorlib/common/__init__.py @@ -22,7 +22,7 @@ def load_schema_file(schema_file): Load schema file. 
""" - with open(schema_file) as f: + with open(schema_file, encoding="UTF-8") as f: ret = json.load(f) common = ret.get("_common_", dict()) diff --git a/cloudconnectlib/splunktacollectorlib/config.py b/cloudconnectlib/splunktacollectorlib/config.py index a88c75c..bce6ba0 100755 --- a/cloudconnectlib/splunktacollectorlib/config.py +++ b/cloudconnectlib/splunktacollectorlib/config.py @@ -149,8 +149,10 @@ def load(self): else: break else: - log(exc, level=logging.ERROR, need_tb=True) - raise exc + # F821 reason - when `ConfigException` would be caught, `exc` would be defined. + # The uncaught exceptions would break the code flow. + log(exc, level=logging.ERROR, need_tb=True) # noqa: F821 + raise exc # noqa: F821 log('"load" method out', level=logging.DEBUG) return ret @@ -185,7 +187,7 @@ def update_items( level=logging.DEBUG, ) - assert ( + assert ( # nosemgrep: gitlab.bandit.B101 - additional check for endpoint_id. Raises AssertionError otherwise endpoint_id in self._endpoints ), "Unexpected endpoint id in given schema - {ep_id}" "".format( ep_id=endpoint_id @@ -314,8 +316,11 @@ def _parse_schema(self, ucc_config_schema): } ) for field in Config.META_FIELDS: - assert field in ucc_config_schema and isinstance( - ucc_config_schema[field], str + assert ( # nosemgrep: gitlab.bandit.B101 - additional check for ucc_config_schema. + field in ucc_config_schema + and isinstance( + ucc_config_schema[field], str + ) ), ('Missing or invalid field "%s" in given schema' % field) setattr(self, field, ucc_config_schema[field]) @@ -324,10 +329,14 @@ def _parse_schema(self, ucc_config_schema): if key.startswith("_"): continue - assert isinstance(val, dict), ( + assert isinstance( # nosemgrep: gitlab.bandit.B101 - additional check `val` type. 
+ val, dict + ), ( 'The schema of endpoint "%s" should be dict' % key ) - assert "endpoint" in val, 'The endpoint "%s" has no endpoint entry' % key + assert "endpoint" in val, ( # nosemgrep: gitlab.bandit.B101 - additional check for endpoint in `val`. + 'The endpoint "%s" has no endpoint entry' % key + ) self._endpoints[key] = val diff --git a/cloudconnectlib/splunktacollectorlib/data_collection/ta_data_collector.py b/cloudconnectlib/splunktacollectorlib/data_collection/ta_data_collector.py index 7d323d6..5e843d3 100755 --- a/cloudconnectlib/splunktacollectorlib/data_collection/ta_data_collector.py +++ b/cloudconnectlib/splunktacollectorlib/data_collection/ta_data_collector.py @@ -15,7 +15,6 @@ # limitations under the License. # import threading -import time from collections import namedtuple from splunktalib.common import util as scu @@ -110,8 +109,12 @@ def _build_event(self, events): events = [events] evts = [] for event in events: - assert event.raw_data, "the raw data of events is empty" - if event.is_unbroken: + assert ( # nosemgrep: gitlab.bandit.B101 - additional check for raw data in a single event. + event.raw_data + ), "the raw data of events is empty" + if ( + event.is_unbroken # nosemgrep: python.lang.maintainability.is-function-without-parentheses.is-function-without-parentheses - false positive, it is a property and not a function. # noqa: E501 - semgrep name is too long + ): evt = unbroken_evt_fmt.format( event.host or "", event.source or "", @@ -119,7 +122,9 @@ def _build_event(self, events): event.time or "", event.index or "", scu.escape_cdata(event.raw_data), - "" if event.is_done else "", + "" + if event.is_done # nosemgrep: python.lang.maintainability.is-function-without-parentheses.is-function-without-parentheses - false positive, it is a property and not a function. 
# noqa: E501 - semgrep name is too long + else "", ) else: evt = evt_fmt.format( diff --git a/cloudconnectlib/splunktacollectorlib/data_collection/ta_data_loader.py b/cloudconnectlib/splunktacollectorlib/data_collection/ta_data_loader.py index 3ad8d5d..dee5f7d 100755 --- a/cloudconnectlib/splunktacollectorlib/data_collection/ta_data_loader.py +++ b/cloudconnectlib/splunktacollectorlib/data_collection/ta_data_loader.py @@ -86,7 +86,7 @@ def _enqueue_io_job(job): def _wait_for_tear_down(self): wakeup_q = self._wakeup_queue while 1: - try: + try: # nosemgrep: gitlab.bandit.B110 -- nothing to do when queue is empty. go_exit = wakeup_q.get(timeout=1) except queue.Empty: pass diff --git a/cloudconnectlib/splunktacollectorlib/data_collection/ta_helper.py b/cloudconnectlib/splunktacollectorlib/data_collection/ta_helper.py index bb9ddfa..0068a96 100755 --- a/cloudconnectlib/splunktacollectorlib/data_collection/ta_helper.py +++ b/cloudconnectlib/splunktacollectorlib/data_collection/ta_helper.py @@ -54,7 +54,9 @@ def get_md5(data): :param data: :return: """ - assert data is not None, "The input cannot be None" + assert ( # nosemgrep: gitlab.bandit.B101 - checking for data to not be None + data is not None + ), "The input cannot be None" if isinstance(data, str): return hashlib.sha256(data.encode("utf-8")).hexdigest() elif isinstance(data, (list, tuple, dict)): @@ -124,10 +126,12 @@ def _load_conf_contents(self): def _process_division(self, division_endpoint, division_contents): division_metrics = [] - assert isinstance(division_contents, dict) + assert isinstance( # nosemgrep: gitlab.bandit.B101 - checking for type of `division_contents` + division_contents, dict + ) for division_key, division_value in division_contents.items(): try: - assert ( + assert ( # nosemgrep: gitlab.bandit.B101 self.TYPE in division_value and division_value[self.TYPE] in [self.TYPE_SINGLE, self.TYPE_MULTI] and self.SEPARATOR in division_value diff --git 
a/cloudconnectlib/splunktacollectorlib/data_collection/ta_mod_input.py b/cloudconnectlib/splunktacollectorlib/data_collection/ta_mod_input.py index 654d7b3..c14d20c 100755 --- a/cloudconnectlib/splunktacollectorlib/data_collection/ta_mod_input.py +++ b/cloudconnectlib/splunktacollectorlib/data_collection/ta_mod_input.py @@ -35,7 +35,7 @@ from . import ta_checkpoint_manager as cpmgr from . import ta_config as tc from . import ta_data_client as tdc -from . import ta_data_loader as dl +from . import ta_data_loader as tdl utils.remove_http_proxy_env_vars() @@ -171,7 +171,7 @@ def run( # http://bugs.python.org/issue7980 time.strptime("2016-01-01", "%Y-%m-%d") - loader = dl.create_data_loader() + loader = tdl.create_data_loader() # handle signal _setup_signal_handler(loader, ta_short_name) @@ -272,8 +272,12 @@ def main( """ Main entry point """ - assert collector_cls, "ucc modinput collector is None." - assert schema_file_path, "ucc modinput schema file is None" + assert ( # nosemgrep: gitlab.bandit.B101 - check for required params to be passed + collector_cls + ), "ucc modinput collector is None." 
+ assert ( # nosemgrep: gitlab.bandit.B101 - check for required params to be passed + schema_file_path + ), "ucc modinput schema file is None" settings = ld(schema_file_path) @@ -289,8 +293,6 @@ def main( ) elif args[1] == "--validate-arguments": sys.exit(validate_config()) - elif args[1] in ("-h", "--h", "--help"): - usage() else: usage() else: diff --git a/cloudconnectlib/splunktacollectorlib/splunk_ta_import_declare.py b/cloudconnectlib/splunktacollectorlib/splunk_ta_import_declare.py index 9aa813c..e6c2703 100755 --- a/cloudconnectlib/splunktacollectorlib/splunk_ta_import_declare.py +++ b/cloudconnectlib/splunktacollectorlib/splunk_ta_import_declare.py @@ -23,7 +23,7 @@ ta_name = os.path.basename(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) ta_lib_name = re.sub(r"[^\w]+", "_", ta_name.lower()) -assert ta_name or ta_name == "package", "TA name is None or package" +assert ta_name or ta_name == "package", "TA name is None or package" # nosemgrep: gitlab.bandit.B101 - check for `ta_name` to be a valid value pattern = re.compile(r"[\\/]etc[\\/]apps[\\/][^\\/]+[\\/]bin[\\/]?$") new_paths = [path for path in sys.path if not pattern.search(path) or ta_name in path] new_paths.insert(0, os.path.sep.join([os.path.dirname(__file__), ta_lib_name])) From fdd3068ac029ada84e9feb00716c69a022a2ae81 Mon Sep 17 00:00:00 2001 From: Hetang Modi Date: Wed, 16 Feb 2022 20:47:56 +0530 Subject: [PATCH 2/2] chore(lint): fixing pre-commit failures --- cloudconnectlib/splunktacollectorlib/config.py | 6 +++++- .../splunktacollectorlib/splunk_ta_import_declare.py | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/cloudconnectlib/splunktacollectorlib/config.py b/cloudconnectlib/splunktacollectorlib/config.py index bce6ba0..96c6184 100755 --- a/cloudconnectlib/splunktacollectorlib/config.py +++ b/cloudconnectlib/splunktacollectorlib/config.py @@ -316,12 +316,14 @@ def _parse_schema(self, ucc_config_schema): } ) for field in Config.META_FIELDS: + # fmt: off 
assert ( # nosemgrep: gitlab.bandit.B101 - additional check for ucc_config_schema. field in ucc_config_schema and isinstance( ucc_config_schema[field], str ) ), ('Missing or invalid field "%s" in given schema' % field) + # fmt: on setattr(self, field, ucc_config_schema[field]) self._endpoints = {} @@ -334,7 +336,9 @@ def _parse_schema(self, ucc_config_schema): ), ( 'The schema of endpoint "%s" should be dict' % key ) - assert "endpoint" in val, ( # nosemgrep: gitlab.bandit.B101 - additional check for endpoint in `val`. + assert ( # nosemgrep: gitlab.bandit.B101 - additional check for endpoint in `val`. + "endpoint" in val + ), ( 'The endpoint "%s" has no endpoint entry' % key ) diff --git a/cloudconnectlib/splunktacollectorlib/splunk_ta_import_declare.py b/cloudconnectlib/splunktacollectorlib/splunk_ta_import_declare.py index e6c2703..7735073 100755 --- a/cloudconnectlib/splunktacollectorlib/splunk_ta_import_declare.py +++ b/cloudconnectlib/splunktacollectorlib/splunk_ta_import_declare.py @@ -23,7 +23,9 @@ ta_name = os.path.basename(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) ta_lib_name = re.sub(r"[^\w]+", "_", ta_name.lower()) -assert ta_name or ta_name == "package", "TA name is None or package" # nosemgrep: gitlab.bandit.B101 - check for `ta_name` to be a valid value +assert ( # nosemgrep: gitlab.bandit.B101 - check for `ta_name` to be a valid value + ta_name or ta_name == "package" +), "TA name is None or package" pattern = re.compile(r"[\\/]etc[\\/]apps[\\/][^\\/]+[\\/]bin[\\/]?$") new_paths = [path for path in sys.path if not pattern.search(path) or ta_name in path] new_paths.insert(0, os.path.sep.join([os.path.dirname(__file__), ta_lib_name]))