[git cl split] Add new cluster-based splitting algorithm

This CL adds a new way of generating splittings, which clusters files
based on their directory structure (while respecting `set noparent` in
OWNERS files). The user provides a minimum and maximum acceptable
number of files per CL, and the algorithm attempts to create CLs in
that range with a single unique reviewer for each. I've tested it on
some example CLs, and it seems to work well -- certainly better than
the existing algorithm.

Usage of the new algorithm is triggered by passing the
`--target-range` option to `git cl split`. A second new flag,
`--expect-owners-override`, may also be passed to make the clustering
algorithm ignore OWNERS files entirely.

Bug: 389069356
Change-Id: I321e6424b2bdb7624b370af23b73759fda261161
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/tools/depot_tools/+/6324373
Reviewed-by: Josip Sokcevic <sokcevic@chromium.org>
Commit-Queue: Devon Loehr <dloehr@google.com>
This commit is contained in:
Devon Loehr
2025-03-10 07:15:06 -07:00
committed by LUCI CQ
parent b123624ac6
commit 9dc0551d52
4 changed files with 356 additions and 27 deletions

View File

@@ -5815,7 +5815,8 @@ def CMDsplit(parser, args):
action='store_true', action='store_true',
default=False, default=False,
help='List the files and reviewers for each CL that would ' help='List the files and reviewers for each CL that would '
'be created, but don\'t create branches or CLs.') 'be created, but don\'t create branches or CLs.\n'
'You can pass -s in addition to get a more concise summary.')
parser.add_option('--cq-dry-run', parser.add_option('--cq-dry-run',
action='store_true', action='store_true',
default=False, default=False,
@@ -5834,7 +5835,7 @@ def CMDsplit(parser, args):
default=True, default=True,
help='Sends your change to the CQ after an approval. Only ' help='Sends your change to the CQ after an approval. Only '
'works on repos that have the Auto-Submit label ' 'works on repos that have the Auto-Submit label '
'enabled') 'enabled. This is the default option.')
parser.add_option( parser.add_option(
'--disable-auto-submit', '--disable-auto-submit',
action='store_false', action='store_false',
@@ -5859,11 +5860,20 @@ def CMDsplit(parser, args):
parser.add_option('--topic', parser.add_option('--topic',
default=None, default=None,
help='Topic to specify when uploading') help='Topic to specify when uploading')
parser.add_option(
'--target-range',
type='int',
default=None,
nargs=2,
help='Usage: --target-range <min> <max>\n '
'Use the alternate splitting algorithm which tries '
'to ensure each CL has between <min> and <max> files, inclusive. '
'Rarely, some CLs may have fewer files than specified.')
parser.add_option('--from-file', parser.add_option('--from-file',
type='str', type='str',
default=None, default=None,
help='If present, load the split CLs from the given file ' help='If present, load the split CLs from the given file '
'instead of computing a splitting. These file are ' 'instead of computing a splitting. These files are '
'generated each time the script is run.') 'generated each time the script is run.')
parser.add_option( parser.add_option(
'-s', '-s',
@@ -5881,18 +5891,30 @@ def CMDsplit(parser, args):
default=None, default=None,
help='If present, all generated CLs will be sent to the specified ' help='If present, all generated CLs will be sent to the specified '
'reviewer(s) specified, rather than automatically assigned reviewers.\n' 'reviewer(s) specified, rather than automatically assigned reviewers.\n'
'Multiple reviewers can be specified as ' 'Multiple reviewers can be specified as: '
'--reviewers a@b.com --reviewers c@d.com\n') '--reviewers a@b.com --reviewers c@d.com\n')
parser.add_option( parser.add_option(
'--no-reviewers', '--no-reviewers',
action='store_true', action='store_true',
help='If present, generated CLs will not be assigned reviewers. ' help='If present, generated CLs will not be assigned reviewers. '
'Overrides --reviewers.') 'Overrides --reviewers.')
parser.add_option(
'--expect-owners-override',
action='store_true',
help='If present, the clustering algorithm will group files by '
'directory only, without considering ownership.\n'
'No effect if --target-range is not passed.\n'
'Recommended to be used alongside --reviewers or --no-reviewers.')
options, _ = parser.parse_args(args) options, _ = parser.parse_args(args)
if not options.description_file and not options.dry_run: if not options.description_file and not options.dry_run:
parser.error('No --description flag specified.') parser.error('No --description flag specified.')
if (options.target_range
and options.target_range[0] > options.target_range[1]):
parser.error('First argument to --target-range cannot '
'be greater than the second argument.')
if options.no_reviewers: if options.no_reviewers:
options.reviewers = [] options.reviewers = []
@@ -5903,7 +5925,9 @@ def CMDsplit(parser, args):
Changelist, WrappedCMDupload, options.dry_run, Changelist, WrappedCMDupload, options.dry_run,
options.summarize, options.reviewers, options.summarize, options.reviewers,
options.cq_dry_run, options.enable_auto_submit, options.cq_dry_run, options.enable_auto_submit,
options.max_depth, options.topic, options.from_file, options.max_depth, options.topic,
options.target_range,
options.expect_owners_override, options.from_file,
settings.GetRoot()) settings.GetRoot())

View File

@@ -7,6 +7,7 @@
import collections import collections
import dataclasses import dataclasses
import hashlib import hashlib
import math
import os import os
import re import re
import tempfile import tempfile
@@ -197,17 +198,24 @@ def FormatDescriptionOrComment(txt, desc):
return replaced_txt return replaced_txt
def AddUploadedByGitClSplitToDescription(description, is_experimental=False):
    """Adds a 'This CL was uploaded by git cl split.' line to |description|.

    The line is added before footers, or at the end of |description| if it has
    no footers. When |is_experimental| is true, the inserted text instead
    points at the experimental-algorithm tracking bug.
    """
    if is_experimental:
        notice = [
            'This CL was uploaded by an experimental version of git cl split',
            '(https://crbug.com/389069356).'
        ]
    else:
        notice = ['This CL was uploaded by git cl split.']
    body, footers = git_footers.split_footers(description)
    # Keep a blank line between the existing text and the notice.
    if body[-1] and not body[-1].isspace():
        body = body + ['']
    body = body + notice
    if footers:
        body += [''] + footers
    return '\n'.join(body)
@@ -387,14 +395,16 @@ def PrintSummary(cl_infos, refactor_branch):
'The infra team reserves the right to cancel ' 'The infra team reserves the right to cancel '
'your jobs if they are overloading the CQ.\n\n' 'your jobs if they are overloading the CQ.\n\n'
'(Alternatively, you can reduce the number of CLs created by ' '(Alternatively, you can reduce the number of CLs created by '
'using the --max-depth option. Pass --dry-run to examine the ' 'using the --max-depth option, or altering the arguments to '
'--target-range, as appropriate. Pass --dry-run to examine the '
'CLs which will be created until you are happy with the ' 'CLs which will be created until you are happy with the '
'results.)') 'results.)')
def SplitCl(description_file, comment_file, changelist, cmd_upload, dry_run, def SplitCl(description_file, comment_file, changelist, cmd_upload, dry_run,
summarize, reviewers_override, cq_dry_run, enable_auto_submit, summarize, reviewers_override, cq_dry_run, enable_auto_submit,
max_depth, topic, from_file, repository_root): max_depth, topic, target_range, expect_owners_override, from_file,
repository_root):
""""Splits a branch into smaller branches and uploads CLs. """"Splits a branch into smaller branches and uploads CLs.
Args: Args:
@@ -414,10 +424,10 @@ def SplitCl(description_file, comment_file, changelist, cmd_upload, dry_run,
Returns: Returns:
0 in case of success. 1 in case of error. 0 in case of success. 1 in case of error.
""" """
description = LoadDescription(description_file, dry_run) description = LoadDescription(description_file, dry_run)
description = AddUploadedByGitClSplitToDescription(description)
description = AddUploadedByGitClSplitToDescription(
description, is_experimental=target_range)
comment = gclient_utils.FileRead(comment_file) if comment_file else None comment = gclient_utils.FileRead(comment_file) if comment_file else None
EnsureInGitRepository() EnsureInGitRepository()
@@ -443,6 +453,10 @@ def SplitCl(description_file, comment_file, changelist, cmd_upload, dry_run,
if from_file: if from_file:
cl_infos = LoadSplittingFromFile(from_file, files_on_disk=files) cl_infos = LoadSplittingFromFile(from_file, files_on_disk=files)
elif target_range:
min_files, max_files = target_range
cl_infos = GroupFilesByDirectory(cl, author, expect_owners_override,
files, min_files, max_files)
else: else:
files_split_by_reviewers = SelectReviewersForFiles( files_split_by_reviewers = SelectReviewersForFiles(
cl, author, files, max_depth) cl, author, files, max_depth)
@@ -783,6 +797,71 @@ def EditSplittingInteractively(
# Code for the clustering-based splitting algorithm. # Code for the clustering-based splitting algorithm.
################################################################################ ################################################################################
def GroupFilesByDirectory(cl, author: str, expect_owners_override: bool,
                          all_files: List[Tuple[str, str]], min_files: int,
                          max_files: int) -> List[CLInfo]:
    """Cluster files by directory and assign one reviewer per cluster.

    Group the contents of |all_files| into clusters of size between
    |min_files| and |max_files|, inclusive, based on their directory
    structure. Assign one reviewer to each group to create a CL. If
    |expect_owners_override| is true, consider only the directory structure
    of the files, ignoring ownership.

    May rarely create groups with fewer than |min_files| files, or assign
    multiple reviewers to a single CL.

    Args:
        cl: Changelist class instance, for calling owners methods
        author: Email of person running the script; never assigned as a
            reviewer
        all_files: (action, path) pairs describing the change
        min_files: Inclusive lower bound on the desired cluster size
        max_files: Inclusive upper bound on the desired cluster size

    Returns:
        One CLInfo per cluster.
    """
    # Record the actions associated with each file because the clustering
    # algorithm just takes filenames.
    actions_by_file = {}
    file_paths = []
    for (action, file) in all_files:
        actions_by_file[file] = action
        file_paths.append(file)

    reviewers_so_far = []
    cls = []

    # Go through the clusters by path length so that we're likely to choose
    # top-level owners earlier.
    for (directories, files) in sorted(
            ClusterFiles(expect_owners_override, file_paths, min_files,
                         max_files)):
        files_with_actions = [(actions_by_file[file], file) for file in files]

        # Try to find a reviewer. If some of the files have noparent set,
        # we'll likely get multiple reviewers. Don't consider reviewers we've
        # already assigned something to.
        # FIXME: Rather than excluding existing reviewers, it would be better
        # to just penalize them, but still choose them over reviewers who have
        # a worse score. At the moment, owners_client doesn't support anything
        # to do with the score.
        reviewers = cl.owners_client.SuggestMinimalOwners(
            files,
            exclude=[author, cl.owners_client.EVERYONE] + reviewers_so_far)

        # Retry without excluding existing reviewers if we couldn't find any.
        # This is very unlikely since there are many fallback owners.
        # Query |files| again (not the display-formatted directory names) so
        # the retry differs from the first attempt only in the exclusions.
        if not reviewers:
            reviewers = cl.owners_client.SuggestMinimalOwners(
                files, exclude=[author, cl.owners_client.EVERYONE])
        reviewers_so_far.extend(reviewers)

        # Use '/' as a path separator in the branch name and the CL
        # description and comment.
        directories = [
            directory.replace(os.path.sep, '/') for directory in directories
        ]
        cls.append(
            CLInfo(set(reviewers), files_with_actions,
                   FormatDirectoriesForPrinting(directories)))
    return cls
### Trie Code ### Trie Code
@@ -864,3 +943,137 @@ class DirectoryTrie():
for subdir in self.subdirectories.values(): for subdir in self.subdirectories.values():
files += subdir.ToList() files += subdir.ToList()
return files return files
### Clustering code

# Convenience type: a "bin" represents a collection of files:
# it tracks their prefix(es) and the list of files themselves.
# Both elements are string lists.
Bin = collections.namedtuple("Bin", "prefixes files")


def PackFiles(max_size: int, files_to_pack: List[Bin]) -> List[Bin]:
    """Consolidate small bins into as few larger bins as possible.

    Simple bin packing: each output bin holds at most |max_size| files.
    Input groups are placed largest-first, always into the currently
    least-filled bin, so files are spread roughly evenly. A single input
    group larger than |max_size| is split into chunks.

    Args:
        max_size: Maximum number of files per output bin; must be positive.
        files_to_pack: The bins to consolidate; may be empty.

    Returns:
        The packed bins; each is non-empty and holds at most |max_size|
        files.
    """
    total_files = sum(len(group.files) for group in files_to_pack)
    # Nothing to pack. Returning early also avoids a division by zero in
    # the size estimates below.
    if total_files == 0:
        return []

    # Guess how many bins we'll need ahead of time so we can spread things
    # between them. We'll add more bins later if necessary.
    expected_bins_needed = math.ceil(total_files / max_size)
    expected_avg_bin_size = math.ceil(total_files / expected_bins_needed)
    bins = [Bin([], []) for _ in range(expected_bins_needed)]

    # Sort by number of files, decreasing.
    sorted_by_num_files = sorted(files_to_pack,
                                 key=lambda group: -len(group.files))

    # Invariant: the least-filled bin is always the first element of |bins|.
    # This ensures we spread things between bins as much as possible.
    for (prefixes, files) in sorted_by_num_files:
        emptiest = bins[0]
        if len(emptiest.files) + len(files) <= max_size:
            emptiest.prefixes.extend(prefixes)
            emptiest.files.extend(files)
        elif len(files) > max_size:
            # Since the first bin is the emptiest, if we failed to fit in
            # that one we don't need to try any others. These files alone
            # are too large, so split them up into groups of size
            # |expected_avg_bin_size|.
            bins.extend(
                Bin(prefixes, files[i:i + expected_avg_bin_size])
                for i in range(0, len(files), expected_avg_bin_size))
        else:
            bins.append(Bin(prefixes, files))
        # Maintain the invariant.
        bins.sort(key=lambda group: len(group.files))
    return [group for group in bins if len(group.files) > 0]
def ClusterFiles(expect_owners_override: bool, files: List[str], min_files: int,
                 max_files: int) -> List[Bin]:
    """
    Group the entries of |files| into clusters of size between |min_files| and
    |max_files|, inclusive. Guarantees that the size does not exceed
    |max_files|, but the size may rarely be less than |min_files|. If
    |expect_owners_override| is true, don't consider ownership when clustering,
    only directory structure.

    Clustering strategy for a given directory:
    1. Try to group each subdirectory independently
    2. Group any remaining files as follows:
    2a. If there are less than |min_files| files and the folder has a parent,
        give up and let the parent folder handle it.
    2b. Otherwise, if there are at most |max_files| files, create one
        cluster.
    2c. Finally, if there are more than |max_files| files, create several
        clusters of size less than |max_files|.
    """
    # Build a trie of the files keyed on OS-path components, so each trie
    # node corresponds to one directory.
    trie = DirectoryTrie(expect_owners_override)
    trie.AddFiles([file.split(os.path.sep) for file in files])

    # Accumulates the finished clusters; appended to by ClusterDirectory.
    clusters: List[Bin] = []

    def ClusterDirectory(current_dir: DirectoryTrie) -> List[str]:
        """
        Attempt to cluster the files for a directory, by grouping them into
        Bins and appending the bins to |clusters|.

        Returns a list of files that weren't able to be clustered (because
        there weren't at least |min_files| files).
        """
        # Track all the files we need to handle in this directory
        unclustered_files: List[Bin] = []

        # Record any files that live in this directory directly
        if len(current_dir.files) > 0:
            unclustered_files.append(
                Bin([current_dir.prefix], current_dir.files))

        # Step 1: Try to cluster each subdirectory independently
        for subdir in current_dir.subdirectories.values():
            unclustered_files_in_subdir = ClusterDirectory(subdir)
            # If not all files were submitted, record them
            if len(unclustered_files_in_subdir) > 0:
                unclustered_files.append(
                    Bin([subdir.prefix], unclustered_files_in_subdir))

        # A flattened list containing just the names of all unclustered files
        unclustered_files_names_only = [
            file for bin in unclustered_files for file in bin.files
        ]

        if len(unclustered_files_names_only) == 0:
            return []

        # Step 2a: If we don't have enough files for a cluster and it's possible
        # to recurse upward, do so
        if (len(unclustered_files_names_only) < min_files
                and current_dir.has_parent):
            return unclustered_files_names_only

        # Step 2b, 2c: Create one or more clusters from the unclustered files
        # by appending to the |clusters| variable in the outer scope.
        # (nonlocal is required because the += below rebinds |clusters|.)
        nonlocal clusters
        if len(unclustered_files_names_only) <= max_files:
            clusters.append(
                Bin([current_dir.prefix], unclustered_files_names_only))
        else:
            clusters += PackFiles(max_files, unclustered_files)
        return []

    # The root has no parent, so ClusterDirectory should always cluster
    # everything; leftovers indicate a bug.
    unclustered_paths = ClusterDirectory(trie)
    if (len(unclustered_paths) > 0):
        EmitWarning(
            'Not all files were assigned to a CL!\n'
            'This should be impossible, file a bug.\n'
            f'{len(unclustered_paths)} Unassigned files: {unclustered_paths}')
    return clusters

View File

@@ -5706,39 +5706,40 @@ class CMDSplitTestCase(CMDTestCaseBase):
def setUp(self): def setUp(self):
super(CMDTestCaseBase, self).setUp() super(CMDTestCaseBase, self).setUp()
mock.patch('git_cl.Settings.GetRoot', return_value='root').start() self.mock_get_root = mock.patch('git_cl.Settings.GetRoot',
return_value='root').start()
self.mock_split_cl = mock.patch("split_cl.SplitCl",
return_value=0).start()
self.mock_parser_error = mock.patch(
"git_cl.OptionParser.error", side_effect=ParserErrorMock).start()
@mock.patch("split_cl.SplitCl", return_value=0) def testDescriptionFlagRequired(self):
@mock.patch("git_cl.OptionParser.error", side_effect=ParserErrorMock)
def testDescriptionFlagRequired(self, _, mock_split_cl):
# --description-file is mandatory... # --description-file is mandatory...
self.assertRaises(ParserErrorMock, git_cl.main, ['split']) self.assertRaises(ParserErrorMock, git_cl.main, ['split'])
self.assertEqual(mock_split_cl.call_count, 0) self.assertEqual(self.mock_split_cl.call_count, 0)
self.assertEqual(git_cl.main(['split', '--description=SomeFile.txt']), self.assertEqual(git_cl.main(['split', '--description=SomeFile.txt']),
0) 0)
self.assertEqual(mock_split_cl.call_count, 1) self.assertEqual(self.mock_split_cl.call_count, 1)
# ...unless we're doing a dry run # ...unless we're doing a dry run
mock_split_cl.reset_mock() self.mock_split_cl.reset_mock()
self.assertEqual(git_cl.main(['split', '-n']), 0) self.assertEqual(git_cl.main(['split', '-n']), 0)
self.assertEqual(mock_split_cl.call_count, 1) self.assertEqual(self.mock_split_cl.call_count, 1)
@mock.patch("split_cl.SplitCl", return_value=0) def testReviewerParsing(self):
@mock.patch("git_cl.OptionParser.error", side_effect=ParserErrorMock)
def testReviewerParsing(self, _, mock_split_cl):
"""Make sure we correctly parse various combinations of --reviewers""" """Make sure we correctly parse various combinations of --reviewers"""
# Helper function to pull out the reviewers arg and compare it # Helper function to pull out the reviewers arg and compare it
def testOneSetOfFlags(flags, expected): def testOneSetOfFlags(flags, expected):
self.assertEqual(git_cl.main(['split', '-n'] + flags), 0) self.assertEqual(git_cl.main(['split', '-n'] + flags), 0)
mock_split_cl.assert_called_once() self.mock_split_cl.assert_called_once()
# It's unfortunate that there's no better way to get the argument # It's unfortunate that there's no better way to get the argument
# than to hardcode its number, unless we switch to using keyword # than to hardcode its number, unless we switch to using keyword
# arguments everywhere or pass the options in directly. # arguments everywhere or pass the options in directly.
reviewers_arg = mock_split_cl.call_args.args[6] reviewers_arg = self.mock_split_cl.call_args.args[6]
self.assertEqual(reviewers_arg, expected) self.assertEqual(reviewers_arg, expected)
mock_split_cl.reset_mock() self.mock_split_cl.reset_mock()
# If no --reviewers flag is passed, we should get None # If no --reviewers flag is passed, we should get None
testOneSetOfFlags([], None) testOneSetOfFlags([], None)
@@ -5750,6 +5751,34 @@ class CMDSplitTestCase(CMDTestCaseBase):
testOneSetOfFlags(['--no-reviewers'], []) testOneSetOfFlags(['--no-reviewers'], [])
testOneSetOfFlags(['--reviewers', 'a@b.com', '--no-reviewers'], []) testOneSetOfFlags(['--reviewers', 'a@b.com', '--no-reviewers'], [])
def testTargetRangeParsing(self):
    """Make sure --target-range is parsed and validated correctly."""

    def checkAccepted(flags, expected):
        # Run the command, then pull the target_range argument back out of
        # the SplitCl call and compare it against what we expect.
        self.assertEqual(git_cl.main(['split', '-n'] + flags), 0)
        self.mock_split_cl.assert_called_once()
        # It's unfortunate that there's no better way to get the argument
        # than to hardcode its number, unless we switch to using keyword
        # arguments everywhere or pass the options in directly.
        self.assertEqual(self.mock_split_cl.call_args.args[11], expected)
        self.mock_split_cl.reset_mock()

    def checkRejected(flags):
        # Invalid flag combinations must make the option parser error out.
        self.assertRaises(ParserErrorMock, git_cl.main,
                          ['split', '-n'] + flags)

    checkAccepted([], None)
    checkAccepted(["--target-range", "3", "5"], (3, 5))
    checkAccepted(["--target-range", "3", "3"], (3, 3))
    # Can't have second arg larger than first
    checkRejected(["--target-range", "5", "3"])
    # Need exactly two args
    checkRejected(["--target-range"])
    checkRejected(["--target-range", "5"])
    # Only accepts int args
    checkRejected(["--target-range", "5", "six"])
if __name__ == '__main__': if __name__ == '__main__':
logging.basicConfig( logging.basicConfig(
level=logging.DEBUG if '-v' in sys.argv else logging.ERROR) level=logging.DEBUG if '-v' in sys.argv else logging.ERROR)

View File

@@ -33,17 +33,27 @@ class SplitClTest(unittest.TestCase):
footers = 'Bug: 12345' footers = 'Bug: 12345'
added_line = 'This CL was uploaded by git cl split.' added_line = 'This CL was uploaded by git cl split.'
experimental_lines = ("This CL was uploaded by an experimental version "
"of git cl split\n"
"(https://crbug.com/389069356).")
# Description without footers # Description without footers
self.assertEqual( self.assertEqual(
split_cl.AddUploadedByGitClSplitToDescription(description), split_cl.AddUploadedByGitClSplitToDescription(description),
description + added_line) description + added_line)
# Description with footers # Description with footers
self.assertEqual( self.assertEqual(
split_cl.AddUploadedByGitClSplitToDescription(description + split_cl.AddUploadedByGitClSplitToDescription(description +
footers), footers),
description + added_line + '\n\n' + footers) description + added_line + '\n\n' + footers)
# Description with footers and experimental flag
self.assertEqual(
split_cl.AddUploadedByGitClSplitToDescription(
description + footers, True),
description + experimental_lines + '\n\n' + footers)
@mock.patch("split_cl.EmitWarning") @mock.patch("split_cl.EmitWarning")
def testFormatDescriptionOrComment(self, mock_emit_warning): def testFormatDescriptionOrComment(self, mock_emit_warning):
description = "Converted use of X to Y in $description." description = "Converted use of X to Y in $description."
@@ -308,7 +318,7 @@ class SplitClTest(unittest.TestCase):
split_cl.SplitCl(description_file, None, mock.Mock(), mock.Mock(), split_cl.SplitCl(description_file, None, mock.Mock(), mock.Mock(),
dry_run, summarize, reviewers_override, False, dry_run, summarize, reviewers_override, False,
False, None, None, None, None) False, None, None, None, None, None, None)
# Save for re-use # Save for re-use
files_split_by_reviewers = { files_split_by_reviewers = {
@@ -555,7 +565,6 @@ class SplitClTest(unittest.TestCase):
self.assertEqual(contents, "".join(written_lines)) self.assertEqual(contents, "".join(written_lines))
mock_file_write.reset_mock() mock_file_write.reset_mock()
@mock.patch("os.path.isfile", return_value=False) @mock.patch("os.path.isfile", return_value=False)
def testDirectoryTrie(self, _): def testDirectoryTrie(self, _):
""" """
@@ -590,6 +599,60 @@ class SplitClTest(unittest.TestCase):
self.assertEqual(trie.subdirectories["a"].subdirectories["b"].prefix, self.assertEqual(trie.subdirectories["a"].subdirectories["b"].prefix,
os.path.join("a", "b")) os.path.join("a", "b"))
@mock.patch("os.path.isfile", return_value=False)
def testClusterFiles(self, _):
    """
    Make sure ClusterFiles returns sensible results for some sample inputs.
    """
    # The clustering code uses OS paths so we need to do the same here
    path_abc = os.path.join("a", "b", "c.cc")
    path_abd = os.path.join("a", "b", "d.h")
    path_aefgh = os.path.join("a", "e", "f", "g", "h.hpp")
    path_ijk = os.path.join("i", "j", "k.cc")
    path_ilm = os.path.join("i", "l", "m.cc")
    path_an = os.path.join("a", "n.cpp")
    path_top = os.path.join("top.gn")

    files = [
        path_abc, path_abd, path_aefgh, path_ijk, path_ilm, path_an,
        path_top
    ]

    def checkClustering(min_files, max_files, expected_groups):
        """
        Run ClusterFiles and verify it grouped files the way we expected,
        ignoring both cluster order and file order within a cluster.
        """
        clusters = split_cl.ClusterFiles(False, files, min_files, max_files)
        actual = sorted([sorted(bin.files) for bin in clusters])
        expected = sorted([sorted(grp) for grp in expected_groups])
        self.assertEqual(actual, expected)

    # Each file gets its own cluster
    checkClustering(1, 1, [[file] for file in files])

    # Put both entries of a/b in the same cluster, everything else alone
    ab_together = [[path_abc, path_abd], [path_aefgh], [path_ijk],
                   [path_ilm], [path_an], [path_top]]
    checkClustering(1, 2, ab_together)
    checkClustering(1, 100, ab_together)

    # Groups of 2: a/b, rest of a/, all of i/.
    a_two_groups = [[path_abc, path_abd], [path_aefgh, path_an],
                    [path_ijk, path_ilm], [path_top]]
    checkClustering(2, 2, a_two_groups)
    checkClustering(3, 3, a_two_groups)

    # Put all of a/ together and all of i/ together.
    # Don't combine top-level directories with things at the root
    by_top_level_dir = [[path_abc, path_abd, path_aefgh, path_an],
                        [path_ijk, path_ilm], [path_top]]
    checkClustering(3, 5, by_top_level_dir)
    checkClustering(100, 200, by_top_level_dir)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()