Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prefer direct conflicting causes when backtracking #12499

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions news/12459.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
When there are complex conflicting requirements use much faster backtracking choices
184 changes: 184 additions & 0 deletions src/pip/_internal/resolution/resolvelib/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
Dict,
Iterable,
Iterator,
List,
Mapping,
Sequence,
Set,
TypeVar,
Union,
)
Expand Down Expand Up @@ -75,6 +77,133 @@ def _get_with_identifier(
return default


def _extract_names_from_causes_and_parents(
causes: Iterable["PreferenceInformation"],
) -> Set[str]:
"""
Utility function to extract names from the causes and their parent packages

:params causes: An iterable of PreferenceInformation

Returns a set of strings, each representing the name of a requirement or
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Returns a set of strings, each representing the name of a requirement or
Returns a set of strings, representing the name of a requirement and

The code adds both the name and the parent's name if there's a parent. (There may be a better way of re-wording this, I don't think Github's "suggestion" mechanism allows for multi-line changes...)

its parent package that was in causes
"""
causes_names = set()
for cause in causes:
causes_names.add(cause.requirement.name)
if cause.parent:
causes_names.add(cause.parent.name)

return causes_names


def _causes_with_conflicting_parent(
causes: Iterable["PreferenceInformation"],
) -> List["PreferenceInformation"]:
"""
Identifies causes that conflict because their parent package requirements
are not satisfied by another cause, or vice versa.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm still struggling to understand the longic here. I'd suggest 2 improvements.

  1. Clarify the above sentence - I don't really understand what "their parent package requirements are not satisfied by another cause" means, or why the parent package requirements should be satisfied by another cause.
  2. Describe the data structure of "causes" here, explaining the elements that we're using. Something like "each cause contains a requirement specifier, and a "parent", which is the candidate which has that requirement. The list of causes only contains elements that have been detected by the resolver to be in conflict somehow." (I'm assuming that explanation is at least in part wrong, as it's not enough for me to be able to make sense of the function code...)


:params causes: An iterable sequence of PreferenceInformation

Returns a list of PreferenceInformation objects that represent the causes
where their parent conflicts
"""
# Avoid duplication by keeping track of already identified conflicting
# causes by their id
conflicting_causes_by_id: dict[int, "PreferenceInformation"] = {}
all_causes_by_id = {id(c): c for c in causes}

# Map cause IDs and parent packages by parent name for quick lookup
causes_ids_and_parents_by_parent_name: dict[
str, list[tuple[int, Candidate]]
] = collections.defaultdict(list)
for cause_id, cause in all_causes_by_id.items():
if cause.parent:
causes_ids_and_parents_by_parent_name[cause.parent.name].append(
(cause_id, cause.parent)
)

# Identify a cause's requirement conflicts with another cause's parent
for cause_id, cause in all_causes_by_id.items():
if cause_id in conflicting_causes_by_id:
continue

cause_id_and_parents = causes_ids_and_parents_by_parent_name.get(
cause.requirement.name
)
if not cause_id_and_parents:
continue

for other_cause_id, parent in cause_id_and_parents:
if not cause.requirement.is_satisfied_by(parent):
conflicting_causes_by_id[cause_id] = cause
conflicting_causes_by_id[other_cause_id] = all_causes_by_id[
other_cause_id
]

return list(conflicting_causes_by_id.values())


def _first_causes_with_no_candidates(
causes: Sequence["PreferenceInformation"],
candidates: Mapping[str, Iterator[Candidate]],
) -> List["PreferenceInformation"]:
"""
Checks for causes that have no possible candidates to satisfy their
requirements. Returns first causes found as iterating candidates can
be expensive due to downloading and building packages.

:params causes: A sequence of PreferenceInformation
:params candidates: A mapping of package names to iterators of their candidates

Returns a list containing the first pair of PreferenceInformation objects
that were found which had no satisfying candidates, else if all causes
had at least some satisfying candidate an empty list is returned.
"""
# Group causes by package name to reduce the comparison complexity.
causes_by_name: dict[str, list["PreferenceInformation"]] = collections.defaultdict(
list
)
for cause in causes:
causes_by_name[cause.requirement.project_name].append(cause)

# Check for cause pairs within the same package that have incompatible specifiers.
for cause_name, causes_list in causes_by_name.items():
if len(causes_list) < 2:
continue

while causes_list:
cause = causes_list.pop()
candidate = cause.requirement.get_candidate_lookup()[1]
if candidate is None:
continue

for other_cause in causes_list:
other_candidate = other_cause.requirement.get_candidate_lookup()[1]
if other_candidate is None:
continue

# Check if no candidate can match the combined specifier
combined_specifier = candidate.specifier & other_candidate.specifier
possible_candidates = candidates.get(cause_name)

# If no candidates have been provided then by default the
# causes have no candidates
if possible_candidates is None:
return [cause, other_cause]

# Use any and contains version here instead of filter so
# if a version is found that matches it will short curcuit
# iterating through possible candidates
if not any(
combined_specifier.contains(c.version) for c in possible_candidates
):
return [cause, other_cause]

return []


class PipProvider(_ProviderBase):
"""Pip's provider implementation for resolvelib.

Expand Down Expand Up @@ -253,3 +382,58 @@ def is_backtrack_cause(
if backtrack_cause.parent and identifier == backtrack_cause.parent.name:
return True
return False

def narrow_requirement_selection(
self,
identifiers: Iterable[str],
resolutions: Mapping[str, Candidate],
candidates: Mapping[str, Iterator[Candidate]],
information: Mapping[str, Iterable["PreferenceInformation"]],
backtrack_causes: Sequence["PreferenceInformation"],
) -> Iterable[str]:
"""
Narrows down the selection of requirements to consider for the next
resolution step.

This method uses principles of conflict-driven clause learning (CDCL)
to focus on the closest conflicts first.

:params identifiers: Iterable of requirement names currently under
consideration.
:params resolutions: Current mapping of resolved package identifiers
to their selected candidates.
:params candidates: Mapping of each package's possible candidates.
:params information: Mapping of requirement information for each package.
:params backtrack_causes: Sequence of requirements, if non-empty,
were the cause of the resolver backtracking

Returns:
An iterable of requirement names that the resolver will use to
limit the next step of resolution
"""

# If there are 2 or less causes then finding conflicts between
# them is not required as there will always be a minumum of two
# conflicts
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd reword this as "unless there are more than 2 conflicts, there's nothing to simplify (because a conflict always involves at least 2 causes)".

if len(backtrack_causes) < 3:
return identifiers

# First, try to resolve direct causes based on conflicting parent packages
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# First, try to resolve direct causes based on conflicting parent packages
# If some of the causes have conflicting parents, focus on them.

direct_causes = _causes_with_conflicting_parent(backtrack_causes)
if not direct_causes:
# If no conflicting parent packages found try to find some causes
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# If no conflicting parent packages found try to find some causes
# Otherwise, try to find some causes

# that share the same requirement name but no common candidate,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# that share the same requirement name but no common candidate,
# that share the same requirement name but no common candidate.

# we take the first one of these as iterating through candidates
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# we take the first one of these as iterating through candidates
# Stop after finding the first one of these as iterating through candidates

# is potentially expensive
direct_causes = _first_causes_with_no_candidates(
backtrack_causes, candidates
)
if direct_causes:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if direct_causes:
# If we found some causes worth focusing on, use them.
if direct_causes:

backtrack_causes = direct_causes

# Filter identifiers based on the narrowed down causes.
unsatisfied_causes_names = set(
identifiers
) & _extract_names_from_causes_and_parents(backtrack_causes)

return unsatisfied_causes_names if unsatisfied_causes_names else identifiers
52 changes: 52 additions & 0 deletions src/pip/_vendor/resolvelib/providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,55 @@ def resolve(self, requirements, **kwargs):
:raises: ``self.base_exception`` or its subclass.
"""
raise NotImplementedError

def narrow_requirement_selection(
self, identifiers, resolutions, candidates, information, backtrack_causes
):
"""
Narrows the selection of requirements being considered during
resolution.

The requirement selection is defined as "The possible requirements
that will be resolved next." If a requirement isn't part of the returned
iterable, it will not be considered during the next step of resolution.

:param identifiers: An iterable of `identifier` as returned by
``identify()``. These identify all requirements currently being
considered.
:param resolutions: Mapping of candidates currently pinned by the
resolver. Each key is an identifier, and the value is a candidate.
The candidate may conflict with requirements from ``information``.
:param candidates: Mapping of each dependency's possible candidates.
Each value is an iterator of candidates.
:param information: Mapping of requirement information of each package.
Each value is an iterator of *requirement information*.
:param backtrack_causes: Sequence of *requirement information* that are
the requirements that caused the resolver to most recently
backtrack.

A *requirement information* instance is a named tuple with two members:

* ``requirement`` specifies a requirement contributing to the current
list of candidates.
* ``parent`` specifies the candidate that provides (depended on) the
requirement, or ``None`` to indicate a root requirement.

Must return a non-empty subset of `identifiers`, with the simplest
implementation being to return `identifiers` unchanged.

Can be used by the provider to optimize the dependency resolution
process. `get_preference` will only be called on the identifiers
returned. If there is only one identifier returned, then `get_preference`
won't be called at all.

Serving a similar purpose as `get_preference`, this method allows the
provider to guide resolvelib through the resolution process. It should
be used instead of `get_preference` when the provider needs to consider
multiple identifiers simultaneously, or when the provider wants to skip
checking all identifiers, e.g., because the checks are prohibitively
expensive.

Returns:
Iterable[KT]: A non-empty subset of `identifiers`.
"""
return identifiers
42 changes: 37 additions & 5 deletions src/pip/_vendor/resolvelib/resolvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,12 @@ def _patch_criteria():
# No way to backtrack anymore.
return False

def _extract_causes(self, criteron):
"""Extract causes from list of criterion and deduplicate"""
return list(
{id(i): i for c in criteron for i in c.information}.values()
)

def resolve(self, requirements, max_rounds):
if self._states:
raise RuntimeError("already resolved")
Expand Down Expand Up @@ -422,12 +428,38 @@ def resolve(self, requirements, max_rounds):
unsatisfied_names
)

# Choose the most preferred unpinned criterion to try.
name = min(unsatisfied_names, key=self._get_preference)
failure_causes = self._attempt_to_pin_criterion(name)
if len(unsatisfied_names) > 1:
narrowed_unstatisfied_names = list(
self._p.narrow_requirement_selection(
identifiers=unsatisfied_names,
resolutions=self.state.mapping,
candidates=IteratorMapping(
self.state.criteria,
operator.attrgetter("candidates"),
),
information=IteratorMapping(
self.state.criteria,
operator.attrgetter("information"),
),
backtrack_causes=self.state.backtrack_causes,
)
)
else:
narrowed_unstatisfied_names = unsatisfied_names

# If there is only 1 unsatisfied name skip calling self._get_preference
if len(narrowed_unstatisfied_names) > 1:
# Choose the most preferred unpinned criterion to try.
name = min(
narrowed_unstatisfied_names, key=self._get_preference
)
else:
name = narrowed_unstatisfied_names[0]

failure_criterion = self._attempt_to_pin_criterion(name)

if failure_causes:
causes = [i for c in failure_causes for i in c.information]
if failure_criterion:
causes = self._extract_causes(failure_criterion)
# Backjump if pinning fails. The backjump process puts us in
# an unpinned state, so we can work on it in the next round.
self._r.resolving_conflicts(causes=causes)
Expand Down