Skip to content
This repository was archived by the owner on Sep 30, 2024. It is now read-only.

Commit 949a538

Browse files
authored
feat(search): Support Magik language file filter in search (#63110)
Adds support to the lang file filter for Magik language (`lang: Magik`). In order to do that we add wrappers around go-enry and update search code to use them. This provides flexibility for us in the future to support other languages that are not in Linguist as well.
1 parent 79773e5 commit 949a538

File tree

11 files changed

+211
-53
lines changed

11 files changed

+211
-53
lines changed

client/common/src/languages.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,20 @@ export const POPULAR_LANGUAGES: string[] = [
4646
'Zig',
4747
]
4848

49+
/**
50+
* Languages that are not present in GitHub Linguist
51+
* and so are not in go-enry either
52+
*/
53+
const LANGUAGES_NOT_IN_ENRY = [
54+
// Not in Linguist, and unlikely to be added
55+
'Magik',
56+
57+
// Added to Linguist on 6/7/24;
58+
// can remove once go-enry package updates
59+
// to that linguist version
60+
'Pkl',
61+
]
62+
4963
/**
5064
* A list of all supported languages, ranking the popular languages higher by
5165
* including them first.
@@ -721,4 +735,6 @@ export const ALL_LANGUAGES = uniq(
721735
'wisp',
722736
'xBase',
723737
])
738+
.concat(LANGUAGES_NOT_IN_ENRY)
739+
.sort()
724740
)

internal/search/job/jobutil/BUILD.bazel

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,10 @@ go_library(
5757
"//internal/telemetry/telemetryrecorder",
5858
"//internal/trace",
5959
"//internal/usagestats",
60+
"//lib/codeintel/languages",
6061
"//lib/errors",
6162
"//lib/iterator",
6263
"//schema",
63-
"@com_github_go_enry_go_enry_v2//:go-enry",
6464
"@com_github_grafana_regexp//:regexp",
6565
"@com_github_sourcegraph_conc//pool",
6666
"@com_github_sourcegraph_log//:log",

internal/search/job/jobutil/job.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ import (
55
"strings"
66
"time"
77

8-
"github.com/go-enry/go-enry/v2"
98
"github.com/grafana/regexp"
109

1110
zoektquery "github.com/sourcegraph/zoekt/query"
@@ -28,6 +27,7 @@ import (
2827
"github.com/sourcegraph/sourcegraph/internal/search/structural"
2928
"github.com/sourcegraph/sourcegraph/internal/search/zoekt"
3029
"github.com/sourcegraph/sourcegraph/internal/searcher/protocol"
30+
"github.com/sourcegraph/sourcegraph/lib/codeintel/languages"
3131
"github.com/sourcegraph/sourcegraph/lib/errors"
3232
"github.com/sourcegraph/sourcegraph/schema"
3333
)
@@ -722,7 +722,7 @@ func toTextPatternInfo(b query.Basic, resultTypes result.Types, feat *search.Fea
722722
func toLangFilters(aliases []string) []string {
723723
var filters []string
724724
for _, alias := range aliases {
725-
lang, _ := enry.GetLanguageByAlias(alias) // Invariant: lang is valid.
725+
lang, _ := languages.GetLanguageByAlias(alias) // Invariant: lang is valid.
726726
if !slices.Contains(filters, lang) {
727727
filters = append(filters, lang)
728728
}

internal/search/job/jobutil/job_test.go

Lines changed: 60 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1221,6 +1221,54 @@ func TestNewPlanJob(t *testing.T) {
12211221
(repoNamePatterns . [(?i)foo bar baz])))
12221222
REPOSCOMPUTEEXCLUDED
12231223
NOOP))))))
1224+
`),
1225+
},
1226+
1227+
// This test shows that we can handle languages not in Linguist
1228+
{
1229+
query: `context:global repo:sourcegraph/.* lang:magik`,
1230+
protocol: search.Streaming,
1231+
searchType: query.SearchTypeKeyword,
1232+
want: autogold.Expect(`
1233+
(LOG
1234+
(ALERT
1235+
(features . error decoding features)
1236+
(protocol . Streaming)
1237+
(onSourcegraphDotCom . true)
1238+
(query . )
1239+
(originalQuery . )
1240+
(patternType . keyword)
1241+
(TIMEOUT
1242+
(timeout . 20s)
1243+
(LIMIT
1244+
(limit . 10000)
1245+
(PARALLEL
1246+
(REPOPAGER
1247+
(containsRefGlobs . false)
1248+
(repoOpts.repoFilters . [sourcegraph/.*])
1249+
(repoOpts.searchContextSpec . global)
1250+
(PARTIALREPOS
1251+
(ZOEKTREPOSUBSETTEXTSEARCH
1252+
(fileMatchLimit . 10000)
1253+
(select . )
1254+
(zoektQueryRegexps . [(?i)(?im:\.MAGIK$)])
1255+
(query . file_regex:"(?i:\\.MAGIK)(?m:$)")
1256+
(type . text))))
1257+
(REPOPAGER
1258+
(containsRefGlobs . false)
1259+
(repoOpts.repoFilters . [sourcegraph/.*])
1260+
(repoOpts.searchContextSpec . global)
1261+
(PARTIALREPOS
1262+
(SEARCHERTEXTSEARCH
1263+
(useFullDeadline . true)
1264+
(patternInfo . TextPatternInfo{//,filematchlimit:10000,lang:magik,f:"(?i)\\.magik$"})
1265+
(numRepos . 0)
1266+
(pathRegexps . [(?i)\.magik$])
1267+
(indexed . false))))
1268+
(REPOSCOMPUTEEXCLUDED
1269+
(repoOpts.repoFilters . [sourcegraph/.*])
1270+
(repoOpts.searchContextSpec . global))
1271+
NOOP)))))
12241272
`),
12251273
},
12261274
}
@@ -1278,6 +1326,7 @@ func TestToTextPatternInfo(t *testing.T) {
12781326
cases := []struct {
12791327
input string
12801328
output autogold.Value
1329+
feat search.Features
12811330
}{{
12821331
input: `type:repo archived`,
12831332
output: autogold.Expect(`{"Query":{"Value":"archived","IsNegated":false,"IsRegExp":false},"IsStructuralPat":false,"CombyRule":"","IsCaseSensitive":false,"FileMatchLimit":30,"Index":"yes","Select":[],"IncludePaths":null,"ExcludePaths":"","IncludeLangs":null,"ExcludeLangs":null,"PathPatternsAreCaseSensitive":false,"PatternMatchesContent":false,"PatternMatchesPath":false,"Languages":null}`),
@@ -1410,9 +1459,17 @@ func TestToTextPatternInfo(t *testing.T) {
14101459
}, {
14111460
input: `repo:^github\.com/sgtest/sourcegraph-typescript$ file:^README\.md "basic :[_] access :[_]" patterntype:structural`,
14121461
output: autogold.Expect(`{"Query":{"Value":"\"basic :[_] access :[_]\"","IsNegated":false,"IsRegExp":false},"IsStructuralPat":true,"CombyRule":"","IsCaseSensitive":false,"FileMatchLimit":30,"Index":"yes","Select":[],"IncludePaths":["^README\\.md"],"ExcludePaths":"","IncludeLangs":null,"ExcludeLangs":null,"PathPatternsAreCaseSensitive":false,"PatternMatchesContent":true,"PatternMatchesPath":true,"Languages":null}`),
1462+
}, {
1463+
input: `sgtest lang:magik type:file`,
1464+
feat: search.Features{ContentBasedLangFilters: true},
1465+
output: autogold.Expect(`{"Query":{"Value":"sgtest","IsNegated":false,"IsRegExp":false},"IsStructuralPat":false,"CombyRule":"","IsCaseSensitive":false,"FileMatchLimit":30,"Index":"yes","Select":[],"IncludePaths":null,"ExcludePaths":"","IncludeLangs":["Magik"],"ExcludeLangs":null,"PathPatternsAreCaseSensitive":false,"PatternMatchesContent":true,"PatternMatchesPath":false,"Languages":["magik"]}`),
1466+
}, {
1467+
input: `sgtest lang:magik type:file`,
1468+
feat: search.Features{ContentBasedLangFilters: false},
1469+
output: autogold.Expect(`{"Query":{"Value":"sgtest","IsNegated":false,"IsRegExp":false},"IsStructuralPat":false,"CombyRule":"","IsCaseSensitive":false,"FileMatchLimit":30,"Index":"yes","Select":[],"IncludePaths":["(?i)\\.magik$"],"ExcludePaths":"","IncludeLangs":null,"ExcludeLangs":null,"PathPatternsAreCaseSensitive":false,"PatternMatchesContent":true,"PatternMatchesPath":false,"Languages":["magik"]}`),
14131470
}}
14141471

1415-
test := func(input string) string {
1472+
test := func(input string, feat search.Features) string {
14161473
searchType := overrideSearchType(input, query.SearchTypeLiteral)
14171474
plan, err := query.Pipeline(query.Init(input, searchType))
14181475
if err != nil {
@@ -1423,14 +1480,14 @@ func TestToTextPatternInfo(t *testing.T) {
14231480
}
14241481
b := plan[0]
14251482
resultTypes := computeResultTypes(b, query.SearchTypeLiteral, defaultResultTypes)
1426-
p := toTextPatternInfo(b, resultTypes, &search.Features{}, limits.DefaultMaxSearchResults)
1483+
p := toTextPatternInfo(b, resultTypes, &feat, limits.DefaultMaxSearchResults)
14271484
v, _ := json.Marshal(p)
14281485
return string(v)
14291486
}
14301487

14311488
for _, tc := range cases {
14321489
t.Run(tc.input, func(t *testing.T) {
1433-
tc.output.Equal(t, test(tc.input))
1490+
tc.output.Equal(t, test(tc.input, tc.feat))
14341491
})
14351492
}
14361493
}

internal/search/query/BUILD.bazel

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ go_library(
2727
"//internal/gitserver/gitdomain",
2828
"//internal/search/filter",
2929
"//internal/search/limits",
30+
"//lib/codeintel/languages",
3031
"//lib/errors",
31-
"@com_github_go_enry_go_enry_v2//:go-enry",
3232
"@com_github_go_enry_go_enry_v2//data",
3333
"@com_github_grafana_regexp//:regexp",
3434
"@com_github_grafana_regexp//syntax",

internal/search/query/helpers.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,10 @@ import (
44
"sort"
55
"strings"
66

7-
"github.com/go-enry/go-enry/v2"
87
"github.com/go-enry/go-enry/v2/data"
98
"github.com/grafana/regexp"
9+
10+
"github.com/sourcegraph/sourcegraph/lib/codeintel/languages"
1011
)
1112

1213
// UnionRegExps separates values with a | operator to create a string
@@ -57,8 +58,8 @@ var filenamesFromLanguage = func() map[string][]string {
5758
// LangToFileRegexp converts a lang: parameter to its corresponding file
5859
// patterns for file filters. The lang value must be valid, cf. validate.go
5960
func LangToFileRegexp(lang string) string {
60-
lang, _ = enry.GetLanguageByAlias(lang) // Invariant: lang is valid.
61-
extensions := enry.GetLanguageExtensions(lang)
61+
lang, _ = languages.GetLanguageByAlias(lang) // Invariant: lang is valid.
62+
extensions := languages.GetLanguageExtensions(lang)
6263
patterns := make([]string, len(extensions))
6364
for i, e := range extensions {
6465
// Add `\.ext$` pattern to match files with the given extension.

internal/search/query/validate.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@ import (
55
"strings"
66
"time"
77

8-
"github.com/go-enry/go-enry/v2"
98
"github.com/grafana/regexp"
109

1110
"github.com/sourcegraph/sourcegraph/internal/gitserver/gitdomain"
1211
"github.com/sourcegraph/sourcegraph/internal/search/filter"
12+
"github.com/sourcegraph/sourcegraph/lib/codeintel/languages"
1313
"github.com/sourcegraph/sourcegraph/lib/errors"
1414
)
1515

@@ -199,7 +199,7 @@ func validateField(field, value string, negated bool, seen map[string]struct{})
199199
}
200200

201201
isLanguage := func() error {
202-
_, ok := enry.GetLanguageByAlias(value)
202+
_, ok := languages.GetLanguageByAlias(value)
203203
if !ok {
204204
return errors.Errorf("unknown language: %q", value)
205205
}

internal/search/zoekt/BUILD.bazel

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ go_library(
3131
"//internal/search/zoektquery",
3232
"//internal/trace",
3333
"//internal/types",
34+
"//lib/codeintel/languages",
3435
"//lib/errors",
35-
"@com_github_go_enry_go_enry_v2//:go-enry",
3636
"@com_github_grafana_regexp//:regexp",
3737
"@com_github_roaringbitmap_roaring//:roaring",
3838
"@com_github_sourcegraph_log//:log",

internal/search/zoekt/query.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,11 @@ package zoekt
33
import (
44
"regexp/syntax" //nolint:depguard // using the grafana fork of regexp clashes with zoekt, which uses the std regexp/syntax.
55

6-
"github.com/go-enry/go-enry/v2"
7-
86
"github.com/sourcegraph/sourcegraph/internal/search"
97
"github.com/sourcegraph/sourcegraph/internal/search/query"
108
"github.com/sourcegraph/sourcegraph/internal/search/result"
119
"github.com/sourcegraph/sourcegraph/internal/search/zoektquery"
10+
"github.com/sourcegraph/sourcegraph/lib/codeintel/languages"
1211
"github.com/sourcegraph/sourcegraph/lib/errors"
1312

1413
zoekt "github.com/sourcegraph/zoekt/query"
@@ -82,7 +81,7 @@ func QueryToZoektQuery(b query.Basic, resultTypes result.Types, feat *search.Fea
8281
}
8382

8483
func toLangFilter(lang string) zoekt.Q {
85-
lang, _ = enry.GetLanguageByAlias(lang) // Invariant: lang is valid.
84+
lang, _ = languages.GetLanguageByAlias(lang) // Invariant: lang is valid.
8685
return &zoekt.Language{Language: lang}
8786
}
8887

lib/codeintel/languages/extensions.go

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,32 @@ package languages
22

33
import (
44
"path/filepath"
5+
"strings"
56

67
"github.com/go-enry/go-enry/v2"
78
)
89

10+
// GetLanguageByAlias is a replacement for enry.GetLanguageByAlias.
11+
// It supports languages that are missing in go-enry
12+
func GetLanguageByAlias(alias string) (lang string, ok bool) {
13+
normalizedAlias := strings.ToLower(alias)
14+
if lang, ok = unsupportedByEnryAliasMap[normalizedAlias]; ok {
15+
return lang, true
16+
}
17+
18+
return enry.GetLanguageByAlias(normalizedAlias)
19+
}
20+
21+
// GetLanguageExtensions is a replacement for enry.GetLanguageExtensions.
22+
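// It supports languages that are missing in go-enry. Despite the parameter name, the argument must be a canonical language name (e.g. "Magik"), not an alias.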
23+
func GetLanguageExtensions(alias string) []string {
24+
if lang, ok := unsupportedByEnryNameToExtensionMap[alias]; ok {
25+
return []string{lang}
26+
}
27+
28+
return enry.GetLanguageExtensions(alias)
29+
}
30+
931
// getLanguagesByExtension is a replacement for enry.GetLanguagesByExtension
1032
// to work around the following limitations:
1133
// - For some extensions which are overwhelmingly used by a certain file type
@@ -17,7 +39,7 @@ func getLanguagesByExtension(path string) (candidates []string, isLikelyBinaryFi
1739
if ext == "" {
1840
return nil, false
1941
}
20-
if lang, ok := unsupportedByEnryExtensionsMap[ext]; ok {
42+
if lang, ok := unsupportedByEnryExtensionToNameMap[ext]; ok {
2143
return []string{lang}, false
2244
}
2345
if _, ok := commonBinaryFileExtensions[ext[1:]]; ok {
@@ -62,13 +84,33 @@ var overrideAmbiguousExtensionsMap = map[string]string{
6284
// for other variants of YAML, hence only 'YAML' is picked by enry.
6385
}
6486

65-
var unsupportedByEnryExtensionsMap = map[string]string{
87+
var unsupportedByEnryExtensionToNameMap = map[string]string{
6688
// Pkl Configuration Language (https://pkl-lang.org/)
89+
// NOTE: Added to Linguist on 6/7/24;
90+
// can remove once go-enry package updates
91+
// to that linguist version
6792
".pkl": "Pkl",
6893
// Magik Language
6994
".magik": "Magik",
7095
}
7196

97+
var unsupportedByEnryNameToExtensionMap = reverseMap(unsupportedByEnryExtensionToNameMap)
98+
99+
var unsupportedByEnryAliasMap = map[string]string{
100+
// Pkl Configuration Language (https://pkl-lang.org/)
101+
"pkl": "Pkl",
102+
// Magik Language
103+
"magik": "Magik",
104+
}
105+
106+
func reverseMap(m map[string]string) map[string]string {
107+
n := make(map[string]string, len(m))
108+
for k, v := range m {
109+
n[v] = k
110+
}
111+
return n
112+
}
113+
72114
// Source: https://github.com/sindresorhus/binary-extensions/blob/main/binary-extensions.json
73115
// License: https://github.com/sindresorhus/binary-extensions/blob/main/license
74116
// Replace the contents with

0 commit comments

Comments
 (0)