mirror of
https://chromium.googlesource.com/chromium/tools/depot_tools.git
synced 2026-01-11 18:51:29 +00:00
This CL adds an "early terminate the field based on field value" parser mechanism that ends the field as soon as the field value provides an unambiguous answer to the question we care about. This prevents over-extraction of certain fields (specifically, local modifications), which can contain either a definitive answer (e.g. No modification) or multi-line free-form text (which may contain unknown fields that we don't care about at this stage). This mitigates over-extraction from README.chromium files like: ``` Local Modifications: None How to Uprev: Steps... ``` where the old parser would extract "None\n\nHow to Uprev:\nSteps...". This CL also refactors single-line fields to use the same early termination mechanism, since a single-line field simply ends as soon as the line is parsed. Union[Something, None] is changed to Optional[Something] based on the style guide. Bug: b/324149233 Change-Id: I3fca80eaceb071263f8ae8730afda230fff0bbb0 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/tools/depot_tools/+/5394917 Reviewed-by: Anne Redulla <aredulla@google.com> Commit-Queue: Jiewei Qian <qjw@chromium.org>
114 lines
4.2 KiB
Python
114 lines
4.2 KiB
Python
#!/usr/bin/env python3
|
|
# Copyright 2023 The Chromium Authors. All rights reserved.
|
|
# Use of this source code is governed by a BSD-style license that can be
|
|
# found in the LICENSE file.
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
from typing import List
|
|
|
|
_THIS_DIR = os.path.abspath(os.path.dirname(__file__))
|
|
# The repo's root directory.
|
|
_ROOT_DIR = os.path.abspath(os.path.join(_THIS_DIR, ".."))
|
|
|
|
# Add the repo's root directory for clearer imports.
|
|
sys.path.insert(0, _ROOT_DIR)
|
|
|
|
import metadata.fields.known as known_fields
|
|
import metadata.dependency_metadata as dm
|
|
|
|
# Line used to separate dependencies within the same metadata file.
# Lines are matched with their line endings kept (see parse_content), so
# trailing whitespace - including the "\r" of a CRLF line ending - is
# tolerated; a bare "$" would only allow a single trailing "\n".
DEPENDENCY_DIVIDER = re.compile(r"^-{20} DEPENDENCY DIVIDER -{20}\s*$")

# Delimiter used to separate a field's name from its value.
FIELD_DELIMITER = ":"

# Heuristic for detecting unknown field names.
# A candidate name is one or more words separated by single spaces, where
# each word is an ASCII uppercase letter followed by at least one word
# character; the name must start the line and be followed by the delimiter.
_PATTERN_FIELD_NAME_WORD_HEURISTIC = r"[A-Z]\w+"
# NOTE(review): inside a character class "\b" means backspace (0x08), not
# a word boundary, so the delimiter must be followed by whitespace (or a
# literal backspace). Kept as-is to preserve behavior - confirm the intent.
_PATTERN_FIELD_NAME_HEURISTIC = re.compile(r"^({}(?: {})*){}[\b\s]".format(
    _PATTERN_FIELD_NAME_WORD_HEURISTIC, _PATTERN_FIELD_NAME_WORD_HEURISTIC,
    FIELD_DELIMITER))
# Whether a line is treated as part of a structured value when no known
# field spec is active for it.
_DEFAULT_TO_STRUCTURED_TEXT = False
|
|
|
|
# Pattern used to check if a line from a metadata file declares a new
# field: one of the known field names at the very start of the line,
# immediately followed by the field delimiter, compared case-insensitively.
# Names and delimiter are escaped so they are always matched literally,
# even if one of them ever contains a regex metacharacter.
_PATTERN_KNOWN_FIELD_DECLARATION = re.compile(
    "^({}){}".format(
        "|".join(re.escape(name) for name in known_fields.ALL_FIELD_NAMES),
        re.escape(FIELD_DELIMITER)),
    re.IGNORECASE)
|
|
|
|
|
|
def parse_content(content: str) -> List[dm.DependencyMetadata]:
    """Parses dependency metadata out of a string.

    The text is scanned line by line. A divider line closes the current
    dependency, a field declaration opens a new field, and any other line
    is appended to the value of the field that is currently open.

    Args:
        content: the string to parse metadata from.

    Returns: all the metadata, which may be for zero or more
             dependencies, from the given string. Dependencies without
             any entries are omitted.
    """
    parsed = []
    metadata = dm.DependencyMetadata()
    # State of the field currently being accumulated, if any.
    field_spec, field_name, field_value = None, None, ""

    for line in content.splitlines(keepends=True):
        # Whether this line should be read as part of a structured value;
        # this decides if the unknown-field heuristic is consulted below.
        in_structured_value = (field_spec.is_structured()
                               if field_spec else _DEFAULT_TO_STRUCTURED_TEXT)

        if DEPENDENCY_DIVIDER.match(line):
            # A new dependency starts here: flush the open field and the
            # dependency collected so far, then start from a clean slate.
            if field_name:
                metadata.add_entry(field_name, field_value)
            if metadata.has_entries():
                parsed.append(metadata)

            metadata = dm.DependencyMetadata()
            field_spec, field_name, field_value = None, None, ""
            # No field is open after a divider, so there is nothing that
            # could be terminated for this line.
            continue

        declares_field = _PATTERN_KNOWN_FIELD_DECLARATION.match(line) or (
            in_structured_value and _PATTERN_FIELD_NAME_HEURISTIC.match(line))
        if declares_field:
            # Store the previous field before switching to the new one.
            if field_name:
                metadata.add_entry(field_name, field_value)

            field_name, field_value = line.split(FIELD_DELIMITER, 1)
            field_spec = known_fields.get_field(field_name)
        elif field_name:
            # Continuation line of a multi-line field value.
            field_value += line

        # Only fields with a spec can end early based on their value.
        if not field_spec:
            continue
        if field_spec.should_terminate_field(field_value):
            assert field_name
            metadata.add_entry(field_name, field_value)
            field_spec, field_name, field_value = None, None, ""

    # End of input: store whatever field and dependency are still open.
    if field_name:
        metadata.add_entry(field_name, field_value)
    if metadata.has_entries():
        parsed.append(metadata)

    return parsed
|