metadata: early terminate certain fields to avoid over extraction

This CL adds a "early terminate the field based on field value" parser
mechanism to end the field as soon as the field value provides an
unambiguous answer to the question we care about.

This is to prevent over-extraction over certain fields (specifically,
local modifications) which can either be a definitive answer (e.g. No
modification) or multi-line free-form texts (which may contain unknown
fields that we don't care about at this stage).

This mitigates over extraction of README.chromium files like:

```
Local Modifications:
None

How to Uprev:
Steps...
```

Where the old parser would extract "None\n\nHow to Uprev:\nSteps..."

This CL also refactors single line fields to use the same early
termination mechanism since single line field simply ends as soon as
the line is parsed.

Union[Something, None] is changed to Optional[Something] based on
styleguide.

Bug: b/324149233
Change-Id: I3fca80eaceb071263f8ae8730afda230fff0bbb0
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/tools/depot_tools/+/5394917
Reviewed-by: Anne Redulla <aredulla@google.com>
Commit-Queue: Jiewei Qian <qjw@chromium.org>
This commit is contained in:
Jiewei Qian
2024-03-26 22:42:03 +00:00
committed by LUCI CQ
parent 37e7482365
commit 79cfa048c0
15 changed files with 230 additions and 76 deletions

View File

@@ -6,7 +6,7 @@
import os
import re
import sys
from typing import Union
from typing import Optional
_THIS_DIR = os.path.abspath(os.path.dirname(__file__))
# The repo's root directory.
@@ -63,12 +63,12 @@ def is_formatted_string_cpe(value: str) -> bool:
return util.matches(_PATTERN_CPE_FORMATTED_STRING, value)
class CPEPrefixField(field_types.MetadataField):
class CPEPrefixField(field_types.SingleLineTextField):
"""Custom field for the package's CPE."""
def __init__(self):
super().__init__(name="CPEPrefix", one_liner=True)
super().__init__(name="CPEPrefix")
def validate(self, value: str) -> Union[vr.ValidationResult, None]:
def validate(self, value: str) -> Optional[vr.ValidationResult]:
"""Checks the given value is either 'unknown', or conforms to
either the CPE 2.3 or 2.2 format.
"""

View File

@@ -6,7 +6,7 @@
import datetime
import os
import sys
from typing import Union
from typing import Optional
_THIS_DIR = os.path.abspath(os.path.dirname(__file__))
# The repo's root directory.
@@ -68,12 +68,12 @@ def format_matches(value: str, date_format: str):
return True
class DateField(field_types.MetadataField):
class DateField(field_types.SingleLineTextField):
"""Custom field for the date when the package was updated."""
def __init__(self):
super().__init__(name="Date", one_liner=True)
super().__init__(name="Date")
def validate(self, value: str) -> Union[vr.ValidationResult, None]:
def validate(self, value: str) -> Optional[vr.ValidationResult]:
"""Checks the given value is a YYYY-MM-DD date."""
value = value.strip()
if not value:

View File

@@ -6,7 +6,7 @@
import os
import re
import sys
from typing import List, Union, Tuple
from typing import List, Tuple, Optional
_THIS_DIR = os.path.abspath(os.path.dirname(__file__))
# The repo's root directory.
@@ -95,15 +95,15 @@ def is_license_allowlisted(value: str) -> bool:
return util.matches(_PATTERN_LICENSE_ALLOWED, value)
class LicenseField(field_types.MetadataField):
class LicenseField(field_types.SingleLineTextField):
"""Custom field for the package's license type(s).
e.g. Apache 2.0, MIT, BSD, Public Domain.
"""
def __init__(self):
super().__init__(name="License", one_liner=False)
super().__init__(name="License")
def validate(self, value: str) -> Union[vr.ValidationResult, None]:
def validate(self, value: str) -> Optional[vr.ValidationResult]:
"""Checks the given value consists of recognized license types.
Note: this field supports multiple values.

View File

@@ -6,7 +6,7 @@
import os
import re
import sys
from typing import Union
from typing import Optional
_THIS_DIR = os.path.abspath(os.path.dirname(__file__))
# The repo's root directory.
@@ -26,12 +26,12 @@ _PATTERN_PATH_BACKWARD = re.compile(r"\.\.\/")
_NOT_SHIPPED = "NOT_SHIPPED"
class LicenseFileField(field_types.MetadataField):
class LicenseFileField(field_types.SingleLineTextField):
"""Custom field for the paths to the package's license file(s)."""
def __init__(self):
super().__init__(name="License File", one_liner=True)
super().__init__(name="License File")
def validate(self, value: str) -> Union[vr.ValidationResult, None]:
def validate(self, value: str) -> Optional[vr.ValidationResult]:
"""Checks the given value consists of non-empty paths with no
backward directory navigation (i.e. no "../").
@@ -72,7 +72,7 @@ class LicenseFileField(field_types.MetadataField):
value: str,
source_file_dir: str,
repo_root_dir: str,
) -> Union[vr.ValidationResult, None]:
) -> Optional[vr.ValidationResult]:
"""Checks the given value consists of file paths which exist on
disk.

View File

@@ -0,0 +1,51 @@
#!/usr/bin/env python3
# Copyright 2023 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import os
import re
import sys
_THIS_DIR = os.path.abspath(os.path.dirname(__file__))
# The repo's root directory.
_ROOT_DIR = os.path.abspath(os.path.join(_THIS_DIR, "..", "..", ".."))
# Add the repo's root directory for clearer imports.
sys.path.insert(0, _ROOT_DIR)
import metadata.fields.field_types as field_types
import metadata.fields.util as util
import metadata.validation_result as vr
_PATTERNS_NOT_MODIFIED = [
# "None" and its variants, like "(none)", "none."
re.compile(r"^\(?none\.?\)?\.?$", re.IGNORECASE),
# "No modification" or "No modifications".
re.compile(r"^no modifications?\.?$", re.IGNORECASE),
# "N/A" and its variants, like "NA", "N/A", "N/A.".
re.compile(r"^(N ?\/ ?A)\.?|na\.?$", re.IGNORECASE),
# "Not applicable".
re.compile(r"^not applicable\.?$", re.IGNORECASE),
]
class LocalModificationsField(field_types.FreeformTextField):
"""Custom field for local modification."""
def __init__(self):
super().__init__(name="Local Modifications", structured=False)
def should_terminate_field(self, field_value) -> bool:
field_value = field_value.strip()
# If we can reasonably infer the field value means "No modification",
# terminate this field to avoid over extraction.
for pattern in _PATTERNS_NOT_MODIFIED:
if pattern.match(field_value):
return True
return False

View File

@@ -6,7 +6,7 @@
import os
import re
import sys
from typing import Union
from typing import Optional
_THIS_DIR = os.path.abspath(os.path.dirname(__file__))
# The repo's root directory.
@@ -27,9 +27,9 @@ _PATTERN_URL_CANONICAL_REPO = re.compile(
class URLField(field_types.MetadataField):
"""Custom field for the package URL(s)."""
def __init__(self):
super().__init__(name="URL", one_liner=False)
super().__init__(name="URL")
def validate(self, value: str) -> Union[vr.ValidationResult, None]:
def validate(self, value: str) -> Optional[vr.ValidationResult]:
"""Checks the given value has acceptable URL values only.
Note: this field supports multiple values.

View File

@@ -6,7 +6,7 @@
import os
import re
import sys
from typing import Union
from typing import Optional
_THIS_DIR = os.path.abspath(os.path.dirname(__file__))
# The repo's root directory.
@@ -28,12 +28,12 @@ def is_unknown(value: str) -> bool:
or util.is_unknown(value))
class VersionField(field_types.MetadataField):
class VersionField(field_types.SingleLineTextField):
"""Custom field for the package version."""
def __init__(self):
super().__init__(name="Version", one_liner=True)
super().__init__(name="Version")
def validate(self, value: str) -> Union[vr.ValidationResult, None]:
def validate(self, value: str) -> Optional[vr.ValidationResult]:
"""Checks the given value is acceptable - there must be at least
one non-whitespace character, and "N/A" is preferred over "0" if
the version is unknown.

View File

@@ -6,7 +6,7 @@
import os
import re
import sys
from typing import Union
from typing import Optional
_THIS_DIR = os.path.abspath(os.path.dirname(__file__))
# The repo's root directory.
@@ -33,12 +33,8 @@ class MetadataField:
# The delimiter used to separate multiple values.
VALUE_DELIMITER = ","
def __init__(self,
name: str,
one_liner: bool = True,
structured: bool = True):
def __init__(self, name: str, structured: bool = True):
self._name = name
self._one_liner = one_liner
self._structured = structured
def __eq__(self, other):
@@ -53,8 +49,10 @@ class MetadataField:
def get_name(self):
return self._name
def is_one_liner(self):
return self._one_liner
def should_terminate_field(self, field_value) -> bool:
"""Whether this field should be terminated based on the given `field_value`.
"""
return False
def is_structured(self):
"""Whether the field represents structured data, such as a list of
@@ -67,7 +65,7 @@ class MetadataField:
"""
return self._structured
def validate(self, value: str) -> Union[vr.ValidationResult, None]:
def validate(self, value: str) -> Optional[vr.ValidationResult]:
"""Checks the given value is acceptable for the field.
Raises: NotImplementedError if called. This method must be
@@ -78,7 +76,8 @@ class MetadataField:
class FreeformTextField(MetadataField):
"""Field where the value is freeform text."""
def validate(self, value: str) -> Union[vr.ValidationResult, None]:
def validate(self, value: str) -> Optional[vr.ValidationResult]:
"""Checks the given value has at least one non-whitespace
character.
"""
@@ -88,12 +87,27 @@ class FreeformTextField(MetadataField):
return None
class YesNoField(MetadataField):
class SingleLineTextField(FreeformTextField):
"""Field where the field as a whole is a single line of text."""
def __init__(self, name):
super().__init__(name=name)
def should_terminate_field(self, field_value) -> bool:
# Look for line breaks.
#
# We don't use `os.linesep`` here because Chromium uses line feed
# (`\n`) for git checkouts on all platforms (see README.* entries
# in chromium.src/.gitattributes).
return field_value.endswith("\n")
class YesNoField(SingleLineTextField):
"""Field where the value must be yes or no."""
def __init__(self, name: str):
super().__init__(name=name, one_liner=True)
super().__init__(name=name)
def validate(self, value: str) -> Union[vr.ValidationResult, None]:
def validate(self, value: str) -> Optional[vr.ValidationResult]:
"""Checks the given value is either yes or no."""
if util.matches(_PATTERN_YES_OR_NO, value):
return None

View File

@@ -5,7 +5,7 @@
import os
import sys
from typing import Union
from typing import Optional
_THIS_DIR = os.path.abspath(os.path.dirname(__file__))
# The repo's root directory.
@@ -18,20 +18,16 @@ import metadata.fields.custom.cpe_prefix
import metadata.fields.custom.date
import metadata.fields.custom.license
import metadata.fields.custom.license_file
import metadata.fields.custom.local_modifications
import metadata.fields.custom.url
import metadata.fields.custom.version
import metadata.fields.field_types as field_types
# Freeform text fields.
NAME = field_types.FreeformTextField("Name")
SHORT_NAME = field_types.FreeformTextField("Short Name")
REVISION = field_types.FreeformTextField("Revision")
DESCRIPTION = field_types.FreeformTextField("Description",
one_liner=False,
structured=False)
LOCAL_MODIFICATIONS = field_types.FreeformTextField("Local Modifications",
one_liner=False,
structured=False)
NAME = field_types.SingleLineTextField("Name")
SHORT_NAME = field_types.SingleLineTextField("Short Name")
REVISION = field_types.SingleLineTextField("Revision")
DESCRIPTION = field_types.FreeformTextField("Description", structured=False)
# Yes/no fields.
SECURITY_CRITICAL = field_types.YesNoField("Security Critical")
@@ -47,6 +43,8 @@ LICENSE = metadata.fields.custom.license.LicenseField()
LICENSE_FILE = metadata.fields.custom.license_file.LicenseFileField()
URL = metadata.fields.custom.url.URLField()
VERSION = metadata.fields.custom.version.VersionField()
LOCAL_MODIFICATIONS = metadata.fields.custom.local_modifications.LocalModificationsField(
)
ALL_FIELDS = (
NAME,
@@ -69,5 +67,5 @@ ALL_FIELD_NAMES = {field.get_name() for field in ALL_FIELDS}
_FIELD_MAPPING = {field.get_name().lower(): field for field in ALL_FIELDS}
def get_field(label: str) -> Union[field_types.MetadataField, None]:
def get_field(label: str) -> Optional[field_types.MetadataField]:
return _FIELD_MAPPING.get(label.lower())

View File

@@ -49,10 +49,17 @@ def parse_content(content: str) -> List[dm.DependencyMetadata]:
"""
dependencies = []
current_metadata = dm.DependencyMetadata()
current_field_spec = None
current_field_name = None
current_field_value = ""
current_field_is_structured = _DEFAULT_TO_STRUCTURED_TEXT
for line in content.splitlines(keepends=True):
# Whether the current line should be part of a structured value.
if current_field_spec:
expect_structured_field_value = current_field_spec.is_structured()
else:
expect_structured_field_value = _DEFAULT_TO_STRUCTURED_TEXT
# Check if a new dependency is being described.
if DEPENDENCY_DIVIDER.match(line):
if current_field_name:
@@ -62,16 +69,17 @@ def parse_content(content: str) -> List[dm.DependencyMetadata]:
if current_metadata.has_entries():
# Add the previous dependency to the results.
dependencies.append(current_metadata)
# Reset for the new dependency's metadata,
# and reset the field state.
current_metadata = dm.DependencyMetadata()
current_field_spec = None
current_field_name = None
current_field_value = ""
current_field_is_structured = False
elif (_PATTERN_KNOWN_FIELD_DECLARATION.match(line)
or (current_field_is_structured
and _PATTERN_FIELD_NAME_HEURISTIC.match(line))):
elif _PATTERN_KNOWN_FIELD_DECLARATION.match(line) or (
expect_structured_field_value
and _PATTERN_FIELD_NAME_HEURISTIC.match(line)):
# Save the field value to the current dependency's metadata.
if current_field_name:
current_metadata.add_entry(current_field_name,
@@ -79,25 +87,22 @@ def parse_content(content: str) -> List[dm.DependencyMetadata]:
current_field_name, current_field_value = line.split(
FIELD_DELIMITER, 1)
field = known_fields.get_field(current_field_name)
# Treats unknown fields as `_DEFAULT_TO_STRUCTURED_TEXT`.
current_field_is_structured = field.is_structured(
) if field else _DEFAULT_TO_STRUCTURED_TEXT
if field and field.is_one_liner():
# The field should be on one line, so add it now.
current_metadata.add_entry(current_field_name,
current_field_value)
# Reset the field state.
current_field_name = None
current_field_value = ""
current_field_spec = known_fields.get_field(current_field_name)
elif current_field_name:
# The field is on multiple lines, so add this line to the
# field value.
current_field_value += line
# Check if current field value indicates end of the field.
if current_field_spec and current_field_spec.should_terminate_field(
current_field_value):
assert current_field_name
current_metadata.add_entry(current_field_name, current_field_value)
current_field_spec = None
current_field_name = None
current_field_value = ""
# At this point, the end of the file has been reached.
# Save any remaining field data and metadata.
if current_field_name:

View File

@@ -0,0 +1,30 @@
Name: Test package 1
Local Modifications:
1. Modified X file
2. Deleted Y file
-------------------- DEPENDENCY DIVIDER --------------------
Name: Test package 2
Local Modifications: None
-------------------- DEPENDENCY DIVIDER --------------------
Name: Test package 3
Local Modifications:
None.
Uprev Instructions:
This section should not be extracted into Local Modifications.
1. Do X
2. Do Y
3. Commit
-------------------- DEPENDENCY DIVIDER --------------------
Name: Test package 4
Local Modifications: None,
Except modified file X.

View File

@@ -18,6 +18,6 @@ A test metadata file, with a
multi-line description.
Local Modifications:
None,
EXCEPT:
* nothing.
None.
Additional paragraphs describing the package.

View File

@@ -42,8 +42,7 @@ class FieldValidationTest(unittest.TestCase):
# Check validation of a freeform text field that should be on
# one line.
self._run_field_validation(
field=field_types.FreeformTextField("Freeform single",
one_liner=True),
field=field_types.SingleLineTextField("Text single line"),
valid_values=["Text on single line", "a", "1"],
error_values=["", "\n", " "],
)
@@ -51,8 +50,7 @@ class FieldValidationTest(unittest.TestCase):
# Check validation of a freeform text field that can span
# multiple lines.
self._run_field_validation(
field=field_types.FreeformTextField("Freeform multi",
one_liner=False),
field=field_types.FreeformTextField("Freeform multi"),
valid_values=[
"This is text spanning multiple lines:\n"
" * with this point\n"
@@ -203,6 +201,23 @@ class FieldValidationTest(unittest.TestCase):
warning_values=["0", "unknown"],
)
def test_local_modifications(self):
# Checks local modifications field early terminates when we can reasonably infer there's no modification.
_NO_MODIFICATION_VALUES = [
"None", "None.", "N/A.", "(none).", "No modification", "\nNone."
]
for value in _NO_MODIFICATION_VALUES:
self.assertTrue(
known_fields.LOCAL_MODIFICATIONS.should_terminate_field(value))
# Checks ambiguous values won't early terminate the field.
_MAY_CONTAIN_MODIFICATION_VALUES = [
"None. Except doing something.",
"Modify file X to include ....",
]
for value in _MAY_CONTAIN_MODIFICATION_VALUES:
self.assertFalse(
known_fields.LOCAL_MODIFICATIONS.should_terminate_field(value))
if __name__ == "__main__":
unittest.main()

View File

@@ -48,7 +48,7 @@ class ParseTest(unittest.TestCase):
("CPEPrefix", "unknown"),
("Description", "A test metadata file, with a\n"
" multi-line description."),
("Local Modifications", "None,\nEXCEPT:\n* nothing."),
("Local Modifications", "None."),
],
)
@@ -120,6 +120,47 @@ class ParseTest(unittest.TestCase):
],
)
def test_parse_multiple_local_modifications(self):
"""Check parsing works for multiple dependencies, each with different local modifications."""
filepath = os.path.join(
_THIS_DIR, "data", "README.chromium.test.multi-local-modifications")
content = gclient_utils.FileRead(filepath)
all_metadata = metadata.parse.parse_content(content)
self.assertEqual(len(all_metadata), 4)
self.assertListEqual(
all_metadata[0].get_entries(),
[
("Name", "Test package 1"),
("Local Modifications",
"1. Modified X file\n2. Deleted Y file"),
],
)
self.assertListEqual(
all_metadata[1].get_entries(),
[
("Name", "Test package 2"),
("Local Modifications", "None"),
],
)
self.assertListEqual(
all_metadata[2].get_entries(),
[
("Name", "Test package 3"),
("Local Modifications", "None."),
],
)
self.assertListEqual(
all_metadata[3].get_entries(),
[
("Name", "Test package 4"),
("Local Modifications", "None,\nExcept modified file X."),
],
)
if __name__ == "__main__":
unittest.main()

View File

@@ -4,7 +4,7 @@
# found in the LICENSE file.
import textwrap
from typing import Dict, List, Union
from typing import Dict, List, Optional
_CHROMIUM_METADATA_PRESCRIPT = "Third party metadata issue:"
_CHROMIUM_METADATA_POSTSCRIPT = ("Check //third_party/README.chromium.template "
@@ -77,7 +77,7 @@ class ValidationResult:
def set_tag(self, tag: str, value: str) -> bool:
self._tags[tag] = value
def get_tag(self, tag: str) -> Union[str, None]:
def get_tag(self, tag: str) -> Optional[str]:
return self._tags.get(tag)
def get_all_tags(self) -> Dict[str, str]: