Skip to content

Commit

Permalink
Add support for dynamic fields in facets
Browse files Browse the repository at this point in the history
Also add FSDSINET class facet, and "audio with problems" filter
  • Loading branch information
ffont committed Apr 12, 2024
1 parent 35777e2 commit 346f445
Show file tree
Hide file tree
Showing 7 changed files with 130 additions and 97 deletions.
4 changes: 4 additions & 0 deletions freesound/settings.py
Expand Up @@ -626,6 +626,10 @@
SEARCH_SOUNDS_FIELD_LICENSE_NAME: {'limit': 10},
}

# Extra sound-search facets, keyed by facet field name; each value is a dict of per-facet options
# (here, a 'limit' of 30 for the 'fsdsinet_detected_class' facet).
# NOTE(review): presumably these are only requested for the beta search options and 'limit' caps the
# number of facet values returned by the search engine -- confirm against the search view/backend.
SEARCH_SOUNDS_BETA_FACETS = {
'fsdsinet_detected_class': {'limit': 30},
}

SEARCH_FORUM_SORT_OPTION_THREAD_DATE_FIRST = "Thread creation (newest first)"
SEARCH_FORUM_SORT_OPTION_DATE_NEW_FIRST = "Post creation (newest first)"
SEARCH_FORUM_SORT_OPTIONS_WEB = [
Expand Down
31 changes: 20 additions & 11 deletions search/templatetags/search.py
Expand Up @@ -24,23 +24,32 @@

from sounds.models import License
from utils.search import search_query_processor_options
from utils.search.backends.solr555pysolr import FIELD_NAMES_MAP
from utils.tags import annotate_tags

register = template.Library()


@register.inclusion_tag('search/facet.html', takes_context=True)
def display_facet(context, facet_name):
def display_facet(context, facet_name, facet_title=None, facet_type='list'):
sqp = context['sqp']
facets = context['facets']
facet_type = {'tag': 'cloud', 'username': 'cloud'}.get(facet_name, 'list')
facet_title = {
'tag': 'Related tags',
'username': 'Related users',
'grouping_pack': 'Packs',
'license': 'Licenses'
}.get(facet_name, facet_name.capitalize())
if facet_title is None:
facet_title = facet_name.capitalize()

solr_fieldname = FIELD_NAMES_MAP.get(facet_name, facet_name)

if facet_name in facets:
# If a facet contains a value which is already used in a filter (this can happen with facets with multiple values like
# tags), then we remove it from the list of options so we don't show redundant information
facet_values_to_skip = []
for field_name_value in sqp.get_active_filters():
if field_name_value.startswith(solr_fieldname + ':'):
facet_values_to_skip.append(field_name_value.split(':')[1].replace('"', ''))
if facet_values_to_skip:
facets[facet_name] = [f for f in facets[facet_name] if f[0] not in facet_values_to_skip]

# Annotate facet elements with size values used in the tag cloud (this is not useful for all facets)
facet = annotate_tags([dict(value=f[0], count=f[1]) for f in facets[facet_name] if f[0] != "0"],
sort="value", small_size=0.7, large_size=2.0)
else:
Expand Down Expand Up @@ -90,13 +99,13 @@ def display_facet(context, facet_name):
# Set the URL to add facet values as filters
if element["value"].startswith('('):
# If the filter value is a "complex" operation , don't wrap it in quotes
filter_str = f'{facet_name}:{element["value"]}'
filter_str = f'{solr_fieldname}:{element["value"]}'
elif element["value"].isdigit():
# If the filter value is a digit, also don't wrap it in quotes
filter_str = f'{facet_name}:{element["value"]}'
filter_str = f'{solr_fieldname}:{element["value"]}'
else:
# Otherwise wrap in quotes
filter_str = f'{facet_name}:"{element["value"]}"'
filter_str = f'{solr_fieldname}:"{element["value"]}"'
element['add_filter_url'] = sqp.get_url(add_filters=[filter_str])

# We sort the facets by count. Also, we apply an opacity filter on "cloud" type facets
Expand Down
6 changes: 0 additions & 6 deletions search/views.py
Expand Up @@ -153,12 +153,6 @@ def search_view_helper(request):
'query_time': results.q_time
}))

# For the facets of fields that could have mulitple values (i.e. currently, only "tags" facet), make
# sure to remove the filters for the corresponding facet field that are already active (so we remove
# redundant information)
if 'tag' in results.facets:
results.facets['tag'] = [(tag, count) for tag, count in results.facets['tag'] if tag not in sqp.get_tags_in_filters()]

# Compile template variables
return {
'sqp': sqp,
Expand Down
17 changes: 9 additions & 8 deletions templates/search/search.html
Expand Up @@ -165,13 +165,14 @@ <h3>
</div>
<div class="row">
<div class="col-4 v-padding-1">
<div class="v-spacing-top-1">{% display_search_option "include_audio_problems" %}</div>
<div class="v-spacing-top-1">{% display_search_option "compute_clusters" %}</div>
<div class="v-spacing-top-1">{% display_search_option "similar_to" %}</div>
</div>
<div class="col-4 v-padding-1">
<div class="v-spacing-top-1">{% display_search_option "include_audio_problems" %}</div>
<div class="col-8 v-padding-1">
<div class="v-spacing-top-1 v-spacing-negative-1 text-grey">FSDSINET class:</div>
{% display_facet "fsdsinet_detected_class" "" "cloud" %}
</div>

</div>
{% endif %}
</div>
Expand Down Expand Up @@ -210,19 +211,19 @@ <h3>
{% comment %}facets{% endcomment%}
<aside class="col-md-4 col-lg-3 collapsable-block md-max-h-100" id="collapsable-filters">
{% if sqp.tags_mode_active %}
{% display_facet "tag" %}
{% display_facet "tags" "Tags" "cloud" %}
{% endif %}
{% display_facet "license" %}
{% display_facet "license" "Licenses" %}
{% if not sqp.tags_mode_active %}
{% display_facet "tag" %}
{% display_facet "tags" "Tags" "cloud" %}
{% endif %}
{% display_facet "type" %}
{% display_facet "samplerate" %}
{% display_facet "channels" %}
{% if not sqp.display_as_packs_active %}
{% display_facet "grouping_pack" %}
{% display_facet "pack_grouping" "Packs" %}
{% endif %}
{% display_facet "username" %}
{% display_facet "username" "Users" "cloud" %}
{% display_facet "bitdepth" %}
{% display_facet "bitrate" %}
</aside>
Expand Down
149 changes: 85 additions & 64 deletions utils/search/backends/solr555pysolr.py
Expand Up @@ -57,6 +57,8 @@
settings.SEARCH_SOUNDS_FIELD_LICENSE_NAME: 'license'
}

# Inverse lookup of FIELD_NAMES_MAP (mapped value -> original key). If several keys of FIELD_NAMES_MAP
# shared the same value, only the last one iterated would be kept here.
REVERSE_FIELD_NAMES_MAP = {value: key for key, value in FIELD_NAMES_MAP.items()}


# Map "web" sorting options to solr sorting options
SORT_OPTIONS_MAP = {
Expand Down Expand Up @@ -309,11 +311,10 @@ def convert_post_to_search_engine_document(self, post):
"has_posts": False if post.thread.num_posts == 0 else True
}
return document

def add_solr_suffix_to_dynamic_fieldname(self, fieldname):
"""Add the corresponding SOLR dynamic field suffix to the given fieldname. If the fieldname does not correspond
to a dynamic field, leave it unchanged. See docstring in 'add_solr_suffix_to_dynamic_fieldnames_in_filter' for
more information"""

def get_dynamic_fields_map(self):
if hasattr(self, '_dynamic_fields_map'):
return self._dynamic_fields_map
dynamic_fields_map = {}
for analyzer, analyzer_data in settings.ANALYZERS_CONFIGURATION.items():
if 'descriptors_map' in analyzer_data:
Expand All @@ -322,7 +323,14 @@ def add_solr_suffix_to_dynamic_fieldname(self, fieldname):
if descriptor_type is not None:
dynamic_fields_map[db_descriptor_key] = '{}{}'.format(
db_descriptor_key, SOLR_DYNAMIC_FIELDS_SUFFIX_MAP[descriptor_type])
return dynamic_fields_map.get(fieldname, fieldname)
self._dynamic_fields_map = dynamic_fields_map
return dynamic_fields_map

def add_solr_suffix_to_dynamic_fieldname(self, fieldname):
    """Return the SOLR dynamic-field name for ``fieldname``, e.g. 'ac_tonality' -> 'ac_tonality_s'.

    The name is looked up in the mapping returned by ``get_dynamic_fields_map()``. Names that are not
    present in that mapping are returned unchanged. See the docstring of
    'add_solr_suffix_to_dynamic_fieldnames_in_filter' for more information about the suffixes.
    """
    suffixed_names = self.get_dynamic_fields_map()
    if fieldname in suffixed_names:
        return suffixed_names[fieldname]
    return fieldname

def add_solr_suffix_to_dynamic_fieldnames_in_filter(self, query_filter):
"""Processes a filter string containing field names and replaces the occurrences of fieldnames that match with
Expand All @@ -331,16 +339,25 @@ def add_solr_suffix_to_dynamic_fieldnames_in_filter(self, query_filter):
fields which need to end with a specific suffix that SOLR uses to learn about the type of the field and how it
should treat it.
"""
for analyzer, analyzer_data in settings.ANALYZERS_CONFIGURATION.items():
if 'descriptors_map' in analyzer_data:
descriptors_map = settings.ANALYZERS_CONFIGURATION[analyzer]['descriptors_map']
for _, db_descriptor_key, descriptor_type in descriptors_map:
if descriptor_type is not None:
query_filter = query_filter.replace(
f'{db_descriptor_key}:','{}{}:'.format(
db_descriptor_key, SOLR_DYNAMIC_FIELDS_SUFFIX_MAP[descriptor_type]))
for raw_fieldname, solr_fieldname in self.get_dynamic_fields_map().items():
query_filter = query_filter.replace(
f'{raw_fieldname}:', f'{solr_fieldname}:')
return query_filter


def remove_solr_suffix_from_dynamic_fieldname(self, fieldname):
    """Strip a SOLR dynamic-field suffix from ``fieldname``, e.g. 'ac_tonality_s' -> 'ac_tonality'.

    The suffixes are the values of SOLR_DYNAMIC_FIELDS_SUFFIX_MAP, tried in the map's iteration order;
    only the first one that ``fieldname`` ends with is removed. The check is purely textual, so any
    name ending in one of the suffixes is shortened. Names matching no suffix are returned unchanged.
    """
    matching_suffixes = (
        suffix for suffix in SOLR_DYNAMIC_FIELDS_SUFFIX_MAP.values() if fieldname.endswith(suffix)
    )
    first_match = next(matching_suffixes, None)
    if first_match is None:
        return fieldname
    return fieldname[:-len(first_match)]

def get_solr_fieldname(self, fieldname):
    """Translate ``fieldname`` into the field name used in SOLR.

    The name is first replaced by its FIELD_NAMES_MAP entry (kept as-is when it has none), and the
    result is then passed through ``add_solr_suffix_to_dynamic_fieldname``.
    """
    mapped_name = FIELD_NAMES_MAP.get(fieldname, fieldname)
    return self.add_solr_suffix_to_dynamic_fieldname(mapped_name)

def get_original_fieldname(self, solr_fieldname):
    """Translate a SOLR field name back to the original field name.

    The dynamic-field suffix (if any) is removed with ``remove_solr_suffix_from_dynamic_fieldname``,
    and the resulting name is then replaced by its REVERSE_FIELD_NAMES_MAP entry, or kept as-is when
    it has none.
    """
    base_name = self.remove_solr_suffix_from_dynamic_fieldname(solr_fieldname)
    return REVERSE_FIELD_NAMES_MAP.get(base_name, base_name)

def search_process_sort(self, sort, forum=False):
"""Translates sorting criteria to solr sort criteria and add extra criteria if sorting by ratings.
Expand Down Expand Up @@ -382,51 +399,51 @@ def search_filter_make_intersection(self, query_filter):
return query_filter

def search_process_filter(self, query_filter, only_sounds_within_ids=False, only_sounds_with_pack=False):
"""Process the filter to make a number of adjustments
1) Add type suffix to human-readable audio analyzer descriptor names (needed for dynamic solr field names).
2) If only sounds with pack should be returned, add such a filter.
3) Add filter for sound IDs if only_sounds_within_ids is passed.
4) Rewrite geotag bounding box queries to use solr 5+ syntax
Step 1) is used for the dynamic field names used in Solr (e.g. ac_tonality -> ac_tonality_s, ac_tempo ->
ac_tempo_i). The dynamic field names we define in Solr schema are '*_b' (for bool), '*_d' (for float),
'*_i' (for integer) and '*_s' (for string). At indexing time, we append these suffixes to the analyzer
descriptor names that need to be indexed so Solr can treat the types properly. Now we automatically append the
suffices to the filter names so users do not need to deal with that and Solr understands recognizes the field name.
Args:
query_filter (str): query filter string.
only_sounds_with_pack (bool, optional): whether to only include sounds that belong to a pack
only_sounds_within_ids (List[int], optional): restrict search results to sounds with these IDs
Returns:
str: processed filter query string.
"""
# Add type suffix to human-readable audio analyzer descriptor names which is needed for solr dynamic fields
query_filter = self.add_solr_suffix_to_dynamic_fieldnames_in_filter(query_filter)

# If we only want sounds with packs and there is no pack filter, add one
if only_sounds_with_pack and not 'pack:' in query_filter:
query_filter += ' pack:*'

if 'geotag:"Intersects(' in query_filter:
# Replace geotag:"Intersects(<MINIMUM_LONGITUDE> <MINIMUM_LATITUDE> <MAXIMUM_LONGITUDE> <MAXIMUM_LATITUDE>)"
# with geotag:["<MINIMUM_LATITUDE>, <MINIMUM_LONGITUDE>" TO "<MAXIMUM_LONGITUDE> <MAXIMUM_LATITUDE>"]
query_filter = re.sub('geotag:"Intersects\((.+?) (.+?) (.+?) (.+?)\)"', r'geotag:["\2,\1" TO "\4,\3"]', query_filter)

query_filter = self.search_filter_make_intersection(query_filter)

# When calculating results form clustering, the "only_sounds_within_ids" argument is passed and we filter
# our query to the sounds in that list of IDs.
if only_sounds_within_ids:
sounds_within_ids_filter = ' OR '.join(['id:{}'.format(sound_id) for sound_id in only_sounds_within_ids])
if query_filter:
query_filter += ' AND ({})'.format(sounds_within_ids_filter)
else:
query_filter = '({})'.format(sounds_within_ids_filter)
"""Process the filter to make a number of adjustments
1) Add type suffix to human-readable audio analyzer descriptor names (needed for dynamic solr field names).
2) If only sounds with pack should be returned, add such a filter.
3) Add filter for sound IDs if only_sounds_within_ids is passed.
4) Rewrite geotag bounding box queries to use solr 5+ syntax
Step 1) is used for the dynamic field names used in Solr (e.g. ac_tonality -> ac_tonality_s, ac_tempo ->
ac_tempo_i). The dynamic field names we define in Solr schema are '*_b' (for bool), '*_d' (for float),
'*_i' (for integer) and '*_s' (for string). At indexing time, we append these suffixes to the analyzer
descriptor names that need to be indexed so Solr can treat the types properly. Now we automatically append the
suffices to the filter names so users do not need to deal with that and Solr understands recognizes the field name.
Args:
query_filter (str): query filter string.
only_sounds_with_pack (bool, optional): whether to only include sounds that belong to a pack
only_sounds_within_ids (List[int], optional): restrict search results to sounds with these IDs
Returns:
str: processed filter query string.
"""
# Add type suffix to human-readable audio analyzer descriptor names which is needed for solr dynamic fields
query_filter = self.add_solr_suffix_to_dynamic_fieldnames_in_filter(query_filter)

return query_filter
# If we only want sounds with packs and there is no pack filter, add one
if only_sounds_with_pack and not 'pack:' in query_filter:
query_filter += ' pack:*'

if 'geotag:"Intersects(' in query_filter:
# Replace geotag:"Intersects(<MINIMUM_LONGITUDE> <MINIMUM_LATITUDE> <MAXIMUM_LONGITUDE> <MAXIMUM_LATITUDE>)"
# with geotag:["<MINIMUM_LATITUDE>, <MINIMUM_LONGITUDE>" TO "<MAXIMUM_LONGITUDE> <MAXIMUM_LATITUDE>"]
query_filter = re.sub('geotag:"Intersects\((.+?) (.+?) (.+?) (.+?)\)"', r'geotag:["\2,\1" TO "\4,\3"]', query_filter)

query_filter = self.search_filter_make_intersection(query_filter)

# When calculating results form clustering, the "only_sounds_within_ids" argument is passed and we filter
# our query to the sounds in that list of IDs.
if only_sounds_within_ids:
sounds_within_ids_filter = ' OR '.join(['id:{}'.format(sound_id) for sound_id in only_sounds_within_ids])
if query_filter:
query_filter += ' AND ({})'.format(sounds_within_ids_filter)
else:
query_filter = '({})'.format(sounds_within_ids_filter)

return query_filter

def force_sounds(self, query_dict):
# Add an extra filter to the query parameters to make sure these return sound documents only
Expand Down Expand Up @@ -509,11 +526,11 @@ def search_sounds(self, textual_query='', query_fields=None, query_filter='', fi
# If no fields provided, use the default
query_fields = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS
if isinstance(query_fields, list):
query_fields = [self.add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)) for field in query_fields]
query_fields = [self.get_solr_fieldname(field_name) for field_name in query_fields]
elif isinstance(query_fields, dict):
# Also remove fields with weight <= 0
query_fields = [(self.add_solr_suffix_to_dynamic_fieldname(FIELD_NAMES_MAP.get(field, field)), weight)
for field, weight in query_fields.items() if weight > 0]
query_fields = [(self.get_solr_fieldname(field_name), weight)
for field_name, weight in query_fields.items() if weight > 0]

# Set main query options
query.set_dismax_query(textual_query, query_fields=query_fields)
Expand Down Expand Up @@ -583,15 +600,15 @@ def search_sounds(self, textual_query='', query_fields=None, query_filter='', fi
# Configure facets
if facets is not None:
json_facets = {}
facet_fields = [FIELD_NAMES_MAP[field_name] for field_name, _ in facets.items()]
facet_fields = [self.get_solr_fieldname(field_name) for field_name, _ in facets.items()]
for field in facet_fields:
json_facets[field] = SOLR_SOUND_FACET_DEFAULT_OPTIONS.copy()
json_facets[field]['field'] = field
if similar_to is not None:
# In similarity search we need to set the "domain" facet option to apply them to the parent documents of the child documents we will match
json_facets[field]['domain'] = {'blockParent': f'content_type:{SOLR_DOC_CONTENT_TYPES["sound"]}'}
for field_name, extra_options in facets.items():
json_facets[FIELD_NAMES_MAP[field_name]].update(extra_options)
json_facets[self.get_solr_fieldname(field_name)].update(extra_options)
query.set_facet_json_api(json_facets)

# Configure grouping
Expand Down Expand Up @@ -620,6 +637,10 @@ def search_sounds(self, textual_query='', query_fields=None, query_filter='', fi
results = self.get_sounds_index().search(
**(self.force_sounds(query.as_kwargs()) if similar_to is None else query.as_kwargs()))

# Facets returned in results use the corresponding solr fieldnames as keys. We want to convert them to the
# original fieldnames so that the rest of the code can use them without knowing about the solr fieldnames.
results.facets = {self.get_original_fieldname(facet_name): data for facet_name, data in results.facets.items()}

# Solr uses a string for the id field, but django uses an int. Convert the id in all results to int
# before use to avoid issues
for d in results.docs:
Expand Down

0 comments on commit 346f445

Please sign in to comment.