/
spell.c
4434 lines (4021 loc) · 103 KB
/
spell.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* vi:set ts=8 sts=4 sw=4 noet:
*
* VIM - Vi IMproved by Bram Moolenaar
*
* Do ":help uganda" in Vim to read copying and usage conditions.
* Do ":help credits" in Vim to see a list of people who contributed.
* See README.txt for an overview of the Vim source code.
*/
/*
* spell.c: code for spell checking
*
* See spellfile.c for the Vim spell file format.
*
* The spell checking mechanism uses a tree (aka trie). Each node in the tree
* has a list of bytes that can appear (siblings). For each byte there is a
* pointer to the node with the byte that follows in the word (child).
*
* A NUL byte is used where the word may end. The bytes are sorted, so that
* binary searching can be used and the NUL bytes are at the start. The
* number of possible bytes is stored before the list of bytes.
*
* The tree uses two arrays: "byts" stores the characters, "idxs" stores
* either the next index or flags. The tree starts at index 0. For example,
* to lookup "vi" this sequence is followed:
* i = 0
* len = byts[i]
* n = where "v" appears in byts[i + 1] to byts[i + len]
* i = idxs[n]
* len = byts[i]
* n = where "i" appears in byts[i + 1] to byts[i + len]
* i = idxs[n]
* len = byts[i]
* find that byts[i + 1] is 0, idxs[i + 1] has flags for "vi".
*
* There are two word trees: one with case-folded words and one with words in
* original case. The second one is only used for keep-case words and is
* usually small.
*
* There is one additional tree for when not all prefixes are applied when
* generating the .spl file. This tree stores all the possible prefixes, as
* if they were words. At each word (prefix) end the prefix nr is stored, the
* following word must support this prefix nr. And the condition nr is
* stored, used to lookup the condition that the word must match with.
*
* Thanks to Olaf Seibert for providing an example implementation of this tree
* and the compression mechanism.
* LZ trie ideas:
* http://www.irb.hr/hr/home/ristov/papers/RistovLZtrieRevision1.pdf
* More papers: http://www-igm.univ-mlv.fr/~laporte/publi_en.html
*
* Matching involves checking the caps type: Onecap ALLCAP KeepCap.
*
* Why doesn't Vim use aspell/ispell/myspell/etc.?
* See ":help develop-spell".
*/
#define IN_SPELL_C
#include "vim.h"
#if defined(FEAT_SPELL) || defined(PROTO)
#ifndef UNIX // it's in os_unix.h for Unix
# include <time.h> // for time_t
#endif
#define REGION_ALL 0xff // word valid in all regions
// Result values. Lower number is accepted over higher one.
// When comparing candidate matches, find_word() replaces a stored result
// with a numerically smaller one ("mi_result > res"), which is why the
// ordering of these values matters.
#define SP_BANNED (-1)
#define SP_OK 0
#define SP_RARE 1
#define SP_LOCAL 2
#define SP_BAD 3
/*
* Structure to store info for word matching.
* spell_check() fills one of these in and passes it to find_word() and
* find_prefix(), which update the result fields when they find a match.
*/
typedef struct matchinf_S
{
langp_T *mi_lp; // info for language and region
// pointers to original text to be checked
char_u *mi_word; // start of word being checked
char_u *mi_end; // end of matching word so far
char_u *mi_fend; // next char to be added to mi_fword
char_u *mi_cend; // char after what was used for
// mi_capflags
// case-folded text
char_u mi_fword[MAXWLEN + 1]; // mi_word case-folded
int mi_fwordlen; // nr of valid bytes in mi_fword
// for when checking word after a prefix
int mi_prefarridx; // index in sl_pidxs with list of
// affixID/condition
int mi_prefcnt; // number of entries at mi_prefarridx
int mi_prefixlen; // byte length of prefix
int mi_cprefixlen; // byte length of prefix in original
// case
// for when checking a compound word
int mi_compoff; // start of following word offset
char_u mi_compflags[MAXWLEN]; // flags for compound words used
int mi_complen; // nr of compound words used
int mi_compextra; // nr of COMPOUNDROOT words
// others
int mi_result; // result so far: SP_BAD, SP_OK, etc.
int mi_capflags; // WF_ONECAP WF_ALLCAP WF_KEEPCAP
win_T *mi_win; // window whose buffer is being checked
// for NOBREAK
int mi_result2; // "mi_result" without following word
char_u *mi_end2; // "mi_end" without following word
} matchinf_T;
// Forward declarations of file-local (static) helpers.
static int spell_mb_isword_class(int cl, win_T *wp);
// mode values for find_word
// The mode selects which tree find_word() walks (case-folded or keep-case)
// and whether it starts after a prefix or after previously found compound
// word(s).
#define FIND_FOLDWORD 0 // find word case-folded
#define FIND_KEEPWORD 1 // find keep-case word
#define FIND_PREFIX 2 // find word after prefix
#define FIND_COMPOUND 3 // find case-folded compound word
#define FIND_KEEPCOMPOUND 4 // find keep-case compound word
static void find_word(matchinf_T *mip, int mode);
static void find_prefix(matchinf_T *mip, int mode);
static int fold_more(matchinf_T *mip);
static void spell_load_cb(char_u *fname, void *cookie);
static int count_syllables(slang_T *slang, char_u *word);
static void clear_midword(win_T *buf);
static void use_midword(slang_T *lp, win_T *buf);
static int find_region(char_u *rp, char_u *region);
static void spell_soundfold_sofo(slang_T *slang, char_u *inword, char_u *res);
static void spell_soundfold_sal(slang_T *slang, char_u *inword, char_u *res);
static void spell_soundfold_wsal(slang_T *slang, char_u *inword, char_u *res);
static void dump_word(slang_T *slang, char_u *word, char_u *pat, int *dir, int round, int flags, linenr_T lnum);
static linenr_T dump_prefixes(slang_T *slang, char_u *word, char_u *pat, int *dir, int round, int flags, linenr_T startlnum);
/*
 * Main spell-checking function.
 *
 * "wp" is the window whose spell settings (wp->w_s) are used.
 * "ptr" points to a character that could be the start of a word.
 * "*attrp" is set to the highlight index for a badly spelled word (HLF_SPB),
 * a rare word (HLF_SPR), a word from another region (HLF_SPL) or a word that
 * should have started with a capital (HLF_SPC).  For a non-word or when the
 * word is OK it remains unchanged.
 * This must only be called when 'spelllang' is not empty.
 *
 * "capcol" is used to check for a Capitalised word after the end of a
 * sentence.  If "*capcol" is zero on entry then perform the check.  On
 * return "*capcol" is -1 when no sentence end was found, otherwise the
 * offset from "ptr" at which the match of wp->w_s->b_cap_prog ends, which is
 * where to check next.  If "capcol" is NULL then don't worry.
 *
 * When "docount" is non-zero a word that is found to be OK is counted once,
 * in the first language where it is OK, using count_common_word().
 *
 * Returns the length of the word in bytes, also when it's OK, so that the
 * caller can skip over the word.  Always returns at least 1 for the early
 * exits (space/control character, no languages loaded).
 */
int
spell_check(
    win_T       *wp,            // current window
    char_u      *ptr,
    hlf_T       *attrp,
    int         *capcol,        // column to check for Capital
    int         docount)        // count good words
{
    matchinf_T  mi;             // Most things are put in "mi" so that it can
                                // be passed to functions quickly.
    int         nrlen = 0;      // found a number first
    int         c;
    int         wrongcaplen = 0;
    int         lpi;
    int         count_word = docount;
    int         use_camel_case = *wp->w_s->b_p_spo != NUL;
    int         camel_case = 0;

    // A word never starts at a space or a control character.  Return quickly
    // then, skipping over the character.
    if (*ptr <= ' ')
        return 1;

    // Return here when loading language files failed.
    if (wp->w_s->b_langp.ga_len == 0)
        return 1;

    CLEAR_FIELD(mi);

    // A number is always OK.  Also skip hexadecimal numbers 0xFF99 and
    // 0X99FF.  But always do check spelling to find "3GPP" and "11
    // julifeest".
    if (*ptr >= '0' && *ptr <= '9')
    {
        if (*ptr == '0' && (ptr[1] == 'b' || ptr[1] == 'B'))
            mi.mi_end = skipbin(ptr + 2);
        else if (*ptr == '0' && (ptr[1] == 'x' || ptr[1] == 'X'))
            mi.mi_end = skiphex(ptr + 2);
        else
            mi.mi_end = skipdigits(ptr);
        nrlen = (int)(mi.mi_end - ptr);
    }

    // Find the normal end of the word (until the next non-word character).
    mi.mi_word = ptr;
    mi.mi_fend = ptr;
    if (spell_iswordp(mi.mi_fend, wp))
    {
        int     prev_upper;
        int     this_upper = FALSE;     // init for gcc

        if (use_camel_case)
        {
            c = PTR2CHAR(mi.mi_fend);
            this_upper = SPELL_ISUPPER(c);
        }

        do
        {
            MB_PTR_ADV(mi.mi_fend);
            if (use_camel_case)
            {
                // A lower-to-upper transition ends the word early, so that
                // each part of a camelCase word is checked on its own.
                prev_upper = this_upper;
                c = PTR2CHAR(mi.mi_fend);
                this_upper = SPELL_ISUPPER(c);
                camel_case = !prev_upper && this_upper;
            }
        } while (*mi.mi_fend != NUL && spell_iswordp(mi.mi_fend, wp)
                                                          && !camel_case);

        if (capcol != NULL && *capcol == 0 && wp->w_s->b_cap_prog != NULL)
        {
            // Check word starting with capital letter.
            c = PTR2CHAR(ptr);
            if (!SPELL_ISUPPER(c))
                wrongcaplen = (int)(mi.mi_fend - ptr);
        }
    }
    if (capcol != NULL)
        *capcol = -1;

    // We always use the characters up to the next non-word character,
    // also for bad words.
    mi.mi_end = mi.mi_fend;

    // Check caps type later.
    mi.mi_capflags = 0;
    mi.mi_cend = NULL;
    mi.mi_win = wp;

    // case-fold the word with one non-word character, so that we can check
    // for the word end.
    if (*mi.mi_fend != NUL)
        MB_PTR_ADV(mi.mi_fend);

    (void)spell_casefold(wp, ptr, (int)(mi.mi_fend - ptr), mi.mi_fword,
                                                             MAXWLEN + 1);
    mi.mi_fwordlen = (int)STRLEN(mi.mi_fword);

    if (camel_case && mi.mi_fwordlen > 0)
        // Introduce a fake word end space into the folded word.
        mi.mi_fword[mi.mi_fwordlen - 1] = ' ';

    // The word is bad unless we recognize it.
    mi.mi_result = SP_BAD;
    mi.mi_result2 = SP_BAD;

    /*
     * Loop over the languages specified in 'spelllang'.
     * We check them all, because a word may be matched longer in another
     * language.
     */
    for (lpi = 0; lpi < wp->w_s->b_langp.ga_len; ++lpi)
    {
        mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, lpi);

        // If reloading fails the language is still in the list but everything
        // has been cleared.
        if (mi.mi_lp->lp_slang->sl_fidxs == NULL)
            continue;

        // Check for a matching word in case-folded words.
        find_word(&mi, FIND_FOLDWORD);

        // Check for a matching word in keep-case words.
        find_word(&mi, FIND_KEEPWORD);

        // Check for matching prefixes.
        find_prefix(&mi, FIND_FOLDWORD);

        // For a NOBREAK language, may want to use a word without a following
        // word as a backup.
        if (mi.mi_lp->lp_slang->sl_nobreak && mi.mi_result == SP_BAD
                                               && mi.mi_result2 != SP_BAD)
        {
            mi.mi_result = mi.mi_result2;
            mi.mi_end = mi.mi_end2;
        }

        // Count the word in the first language where it's found to be OK.
        if (count_word && mi.mi_result == SP_OK)
        {
            count_common_word(mi.mi_lp->lp_slang, ptr,
                                               (int)(mi.mi_end - ptr), 1);
            count_word = FALSE;
        }
    }

    if (mi.mi_result != SP_OK)
    {
        // If we found a number skip over it.  Allows for "42nd".  Do flag
        // rare and local words, e.g., "3GPP".
        if (nrlen > 0)
        {
            if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED)
                return nrlen;
        }

        // When we are at a non-word character there is no error, just
        // skip over the character (try looking for a word after it).
        else if (!spell_iswordp_nmw(ptr, wp))
        {
            if (capcol != NULL && wp->w_s->b_cap_prog != NULL)
            {
                regmatch_T      regmatch;
                int             r;

                // Check for end of sentence.
                regmatch.regprog = wp->w_s->b_cap_prog;
                regmatch.rm_ic = FALSE;
                r = vim_regexec(&regmatch, ptr, 0);
                // Store the program back: the regexp call is handed a
                // pointer to "regmatch" and the (possibly updated) regprog
                // must not be lost.
                wp->w_s->b_cap_prog = regmatch.regprog;
                if (r)
                    *capcol = (int)(regmatch.endp[0] - ptr);
            }

            if (has_mbyte)
                return (*mb_ptr2len)(ptr);
            return 1;
        }
        else if (mi.mi_end == ptr)
            // Always include at least one character.  Required for when there
            // is a mixup in "midword".
            MB_PTR_ADV(mi.mi_end);
        else if (mi.mi_result == SP_BAD
                && LANGP_ENTRY(wp->w_s->b_langp, 0)->lp_slang->sl_nobreak)
        {
            char_u      *p, *fp;
            int         save_result = mi.mi_result;

            // First language in 'spelllang' is NOBREAK.  Find first position
            // at which any word would be valid.
            mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, 0);
            if (mi.mi_lp->lp_slang->sl_fidxs != NULL)
            {
                p = mi.mi_word;
                fp = mi.mi_fword;
                for (;;)
                {
                    MB_PTR_ADV(p);
                    MB_PTR_ADV(fp);
                    if (p >= mi.mi_end)
                        break;
                    mi.mi_compoff = (int)(fp - mi.mi_fword);
                    find_word(&mi, FIND_COMPOUND);
                    if (mi.mi_result != SP_BAD)
                    {
                        mi.mi_end = p;
                        break;
                    }
                }
                // The probing above only served to find "mi_end"; the word
                // itself keeps its original (bad) result.
                mi.mi_result = save_result;
            }
        }

        if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED)
            *attrp = HLF_SPB;
        else if (mi.mi_result == SP_RARE)
            *attrp = HLF_SPR;
        else
            *attrp = HLF_SPL;
    }

    if (wrongcaplen > 0 && (mi.mi_result == SP_OK || mi.mi_result == SP_RARE))
    {
        // Report SpellCap only when the word isn't badly spelled.
        *attrp = HLF_SPC;
        return wrongcaplen;
    }

    return (int)(mi.mi_end - ptr);
}
/*
* Check if the word at "mip->mi_word" is in the tree.
* When "mode" is FIND_FOLDWORD check in fold-case word tree.
* When "mode" is FIND_KEEPWORD check in keep-case word tree.
* When "mode" is FIND_PREFIX check for word after prefix in fold-case word
* tree.
* When "mode" is FIND_COMPOUND check in fold-case word tree, starting after
* the previously found word(s) at "mip->mi_compoff".
* When "mode" is FIND_KEEPCOMPOUND do the same in the keep-case word tree.
*
* The function works in two phases: first the tree is walked as far as the
* text matches, remembering every position where a word could end; then
* those possible endings are verified, longest first.
*
* For a match mip->mi_result is updated (and mip->mi_end with it). For a
* NOBREAK language a match without a valid following word is stored in
* mip->mi_result2 / mip->mi_end2 instead.
* This function calls itself recursively to check the following word of a
* compound or NOBREAK word.
*/
static void
find_word(matchinf_T *mip, int mode)
{
idx_T arridx = 0;
int endlen[MAXWLEN]; // length at possible word endings
idx_T endidx[MAXWLEN]; // possible word endings
int endidxcnt = 0;
int len;
int wlen = 0;
int flen;
int c;
char_u *ptr;
idx_T lo, hi, m;
char_u *s;
char_u *p;
int res = SP_BAD;
slang_T *slang = mip->mi_lp->lp_slang;
unsigned flags;
char_u *byts;
idx_T *idxs;
int word_ends;
int prefix_found;
int nobreak_result;
if (mode == FIND_KEEPWORD || mode == FIND_KEEPCOMPOUND)
{
// Check for word with matching case in keep-case tree.
ptr = mip->mi_word;
flen = 9999; // no case folding, always enough bytes
byts = slang->sl_kbyts;
idxs = slang->sl_kidxs;
if (mode == FIND_KEEPCOMPOUND)
// Skip over the previously found word(s).
wlen += mip->mi_compoff;
}
else
{
// Check for case-folded in case-folded tree.
ptr = mip->mi_fword;
flen = mip->mi_fwordlen; // available case-folded bytes
byts = slang->sl_fbyts;
idxs = slang->sl_fidxs;
if (mode == FIND_PREFIX)
{
// Skip over the prefix.
wlen = mip->mi_prefixlen;
flen -= mip->mi_prefixlen;
}
else if (mode == FIND_COMPOUND)
{
// Skip over the previously found word(s).
wlen = mip->mi_compoff;
flen -= mip->mi_compoff;
}
}
if (byts == NULL)
return; // array is empty
/*
* Repeat advancing in the tree until:
* - there is a byte that doesn't match,
* - we reach the end of the tree,
* - or we reach the end of the line.
*/
for (;;)
{
// Out of case-folded bytes but the original text continues: fold more.
if (flen <= 0 && *mip->mi_fend != NUL)
flen = fold_more(mip);
// The count of sibling bytes is stored just before the siblings.
len = byts[arridx++];
// If the first possible byte is a zero the word could end here.
// Remember this index, we first check for the longest word.
if (byts[arridx] == 0)
{
if (endidxcnt == MAXWLEN)
{
// Must be a corrupted spell file.
emsg(_(e_format_error_in_spell_file));
return;
}
endlen[endidxcnt] = wlen;
endidx[endidxcnt++] = arridx++;
--len;
// Skip over the zeros, there can be several flag/region
// combinations.
while (len > 0 && byts[arridx] == 0)
{
++arridx;
--len;
}
if (len == 0)
break; // no children, word must end here
}
// Stop looking at end of the line.
if (ptr[wlen] == NUL)
break;
// Perform a binary search in the list of accepted bytes.
c = ptr[wlen];
if (c == TAB) // <Tab> is handled like <Space>
c = ' ';
// Search range is byts[arridx] .. byts[arridx + len - 1], inclusive.
lo = arridx;
hi = arridx + len - 1;
while (lo < hi)
{
m = (lo + hi) / 2;
if (byts[m] > c)
hi = m - 1;
else if (byts[m] < c)
lo = m + 1;
else
{
lo = hi = m;
break;
}
}
// Stop if there is no matching byte.
if (hi < lo || byts[lo] != c)
break;
// Continue at the child (if there is one).
arridx = idxs[lo];
++wlen;
--flen;
// One space in the good word may stand for several spaces in the
// checked word.
if (c == ' ')
{
for (;;)
{
if (flen <= 0 && *mip->mi_fend != NUL)
flen = fold_more(mip);
if (ptr[wlen] != ' ' && ptr[wlen] != TAB)
break;
++wlen;
--flen;
}
}
}
/*
* Verify that one of the possible endings is valid. Try the longest
* first.
*/
while (endidxcnt > 0)
{
--endidxcnt;
arridx = endidx[endidxcnt];
wlen = endlen[endidxcnt];
if ((*mb_head_off)(ptr, ptr + wlen) > 0)
continue; // not at first byte of character
if (spell_iswordp(ptr + wlen, mip->mi_win))
{
// A word character follows: only acceptable when the language can
// compound or is NOBREAK, and then another word must follow.
if (slang->sl_compprog == NULL && !slang->sl_nobreak)
continue; // next char is a word character
word_ends = FALSE;
}
else
word_ends = TRUE;
// The prefix flag is before compound flags. Once a valid prefix flag
// has been found we try compound flags.
prefix_found = FALSE;
if (mode != FIND_KEEPWORD && has_mbyte)
{
// Compute byte length in original word, length may change
// when folding case. This can be slow, take a shortcut when the
// case-folded word is equal to the keep-case word.
// From here on "wlen" is a length in mip->mi_word, while
// endlen[endidxcnt] still holds the length in "ptr".
p = mip->mi_word;
if (STRNCMP(ptr, p, wlen) != 0)
{
for (s = ptr; s < ptr + wlen; MB_PTR_ADV(s))
MB_PTR_ADV(p);
wlen = (int)(p - mip->mi_word);
}
}
// Check flags and region. For FIND_PREFIX check the condition and
// prefix ID.
// Repeat this if there are more flags/region alternatives until there
// is a match.
res = SP_BAD;
for (len = byts[arridx - 1]; len > 0 && byts[arridx] == 0;
--len, ++arridx)
{
// For a word-end (zero) byte the idxs[] entry holds the flags.
flags = idxs[arridx];
// For the fold-case tree check that the case of the checked word
// matches with what the word in the tree requires.
// For keep-case tree the case is always right. For prefixes we
// don't bother to check.
if (mode == FIND_FOLDWORD)
{
if (mip->mi_cend != mip->mi_word + wlen)
{
// mi_capflags was set for a different word length, need
// to do it again.
mip->mi_cend = mip->mi_word + wlen;
mip->mi_capflags = captype(mip->mi_word, mip->mi_cend);
}
if (mip->mi_capflags == WF_KEEPCAP
|| !spell_valid_case(mip->mi_capflags, flags))
continue;
}
// When mode is FIND_PREFIX the word must support the prefix:
// check the prefix ID and the condition. Do that for the list at
// mip->mi_prefarridx that find_prefix() filled.
else if (mode == FIND_PREFIX && !prefix_found)
{
c = valid_word_prefix(mip->mi_prefcnt, mip->mi_prefarridx,
flags,
mip->mi_word + mip->mi_cprefixlen, slang,
FALSE);
if (c == 0)
continue;
// Use the WF_RARE flag for a rare prefix.
if (c & WF_RAREPFX)
flags |= WF_RARE;
prefix_found = TRUE;
}
if (slang->sl_nobreak)
{
if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND)
&& (flags & WF_BANNED) == 0)
{
// NOBREAK: found a valid following word. That's all we
// need to know, so return.
mip->mi_result = SP_OK;
break;
}
}
else if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND
|| !word_ends))
{
// If there is no compound flag or the word is shorter than
// COMPOUNDMIN reject it quickly.
// Makes you wonder why someone puts a compound flag on a word
// that's too short... Myspell compatibility requires this
// anyway.
// The compound flag is kept in the top byte of "flags".
if (((unsigned)flags >> 24) == 0
|| wlen - mip->mi_compoff < slang->sl_compminlen)
continue;
// For multi-byte chars check character length against
// COMPOUNDMIN.
if (has_mbyte
&& slang->sl_compminlen > 0
&& mb_charlen_len(mip->mi_word + mip->mi_compoff,
wlen - mip->mi_compoff) < slang->sl_compminlen)
continue;
// Limit the number of compound words to COMPOUNDWORDMAX if no
// maximum for syllables is specified.
if (!word_ends && mip->mi_complen + mip->mi_compextra + 2
> slang->sl_compmax
&& slang->sl_compsylmax == MAXWLEN)
continue;
// Don't allow compounding on a side where an affix was added,
// unless COMPOUNDPERMITFLAG was used.
if (mip->mi_complen > 0 && (flags & WF_NOCOMPBEF))
continue;
if (!word_ends && (flags & WF_NOCOMPAFT))
continue;
// Quickly check if compounding is possible with this flag.
if (!byte_in_str(mip->mi_complen == 0
? slang->sl_compstartflags
: slang->sl_compallflags,
((unsigned)flags >> 24)))
continue;
// If there is a match with a CHECKCOMPOUNDPATTERN rule
// discard the compound word.
if (match_checkcompoundpattern(ptr, wlen, &slang->sl_comppat))
continue;
if (mode == FIND_COMPOUND)
{
int capflags;
// Need to check the caps type of the appended compound
// word.
if (has_mbyte && STRNCMP(ptr, mip->mi_word,
mip->mi_compoff) != 0)
{
// case folding may have changed the length
p = mip->mi_word;
for (s = ptr; s < ptr + mip->mi_compoff; MB_PTR_ADV(s))
MB_PTR_ADV(p);
}
else
p = mip->mi_word + mip->mi_compoff;
capflags = captype(p, mip->mi_word + wlen);
if (capflags == WF_KEEPCAP || (capflags == WF_ALLCAP
&& (flags & WF_FIXCAP) != 0))
continue;
if (capflags != WF_ALLCAP)
{
// When the character before the word is a word
// character we do not accept a Onecap word. We do
// accept a no-caps word, even when the dictionary
// word specifies ONECAP.
MB_PTR_BACK(mip->mi_word, p);
if (spell_iswordp_nmw(p, mip->mi_win)
? capflags == WF_ONECAP
: (flags & WF_ONECAP) != 0
&& capflags != WF_ONECAP)
continue;
}
}
// If the word ends the sequence of compound flags of the
// words must match with one of the COMPOUNDRULE items and
// the number of syllables must not be too large.
mip->mi_compflags[mip->mi_complen] = ((unsigned)flags >> 24);
mip->mi_compflags[mip->mi_complen + 1] = NUL;
if (word_ends)
{
char_u fword[MAXWLEN];
// NOTE: "fword" is left unset when sl_compsylmax == MAXWLEN;
// can_compound() reads "word" only under the same
// "sl_compsylmax < MAXWLEN" condition.
if (slang->sl_compsylmax < MAXWLEN)
{
// "fword" is only needed for checking syllables.
if (ptr == mip->mi_word)
(void)spell_casefold(mip->mi_win,
ptr, wlen, fword, MAXWLEN);
else
vim_strncpy(fword, ptr, endlen[endidxcnt]);
}
if (!can_compound(slang, fword, mip->mi_compflags))
continue;
}
else if (slang->sl_comprules != NULL
&& !match_compoundrule(slang, mip->mi_compflags))
// The compound flags collected so far do not match any
// COMPOUNDRULE, discard the compounded word.
continue;
}
// Check NEEDCOMPOUND: can't use word without compounding.
else if (flags & WF_NEEDCOMP)
continue;
nobreak_result = SP_OK;
if (!word_ends)
{
// Saved so that the recursive calls below can be undone.
int save_result = mip->mi_result;
char_u *save_end = mip->mi_end;
langp_T *save_lp = mip->mi_lp;
int lpi;
// Check that a valid word follows. If there is one and we
// are compounding, it will set "mi_result", thus we are
// always finished here. For NOBREAK we only check that a
// valid word follows.
// Recursive!
if (slang->sl_nobreak)
mip->mi_result = SP_BAD;
// Find following word in case-folded tree.
mip->mi_compoff = endlen[endidxcnt];
if (has_mbyte && mode == FIND_KEEPWORD)
{
// Compute byte length in case-folded word from "wlen":
// byte length in keep-case word. Length may change when
// folding case. This can be slow, take a shortcut when
// the case-folded word is equal to the keep-case word.
p = mip->mi_fword;
if (STRNCMP(ptr, p, wlen) != 0)
{
for (s = ptr; s < ptr + wlen; MB_PTR_ADV(s))
MB_PTR_ADV(p);
mip->mi_compoff = (int)(p - mip->mi_fword);
}
}
#if 0 // Disabled, see below
c = mip->mi_compoff;
#endif
// Count this word as part of the compound while recursing; undone
// after the loop.
++mip->mi_complen;
if (flags & WF_COMPROOT)
++mip->mi_compextra;
// For NOBREAK we need to try all NOBREAK languages, at least
// to find the ".add" file(s).
for (lpi = 0; lpi < mip->mi_win->w_s->b_langp.ga_len; ++lpi)
{
if (slang->sl_nobreak)
{
mip->mi_lp = LANGP_ENTRY(mip->mi_win->w_s->b_langp, lpi);
if (mip->mi_lp->lp_slang->sl_fidxs == NULL
|| !mip->mi_lp->lp_slang->sl_nobreak)
continue;
}
find_word(mip, FIND_COMPOUND);
// When NOBREAK any word that matches is OK. Otherwise we
// need to find the longest match, thus try with keep-case
// and prefix too.
if (!slang->sl_nobreak || mip->mi_result == SP_BAD)
{
// Find following word in keep-case tree.
mip->mi_compoff = wlen;
find_word(mip, FIND_KEEPCOMPOUND);
#if 0 // Disabled, a prefix must not appear halfway a compound word,
// unless the COMPOUNDPERMITFLAG is used and then it can't be a
// postponed prefix.
if (!slang->sl_nobreak || mip->mi_result == SP_BAD)
{
// Check for following word with prefix.
mip->mi_compoff = c;
find_prefix(mip, FIND_COMPOUND);
}
#endif
}
// Without NOBREAK only the current language is tried.
if (!slang->sl_nobreak)
break;
}
--mip->mi_complen;
if (flags & WF_COMPROOT)
--mip->mi_compextra;
mip->mi_lp = save_lp;
if (slang->sl_nobreak)
{
// Keep the outcome for the following word separately and restore
// the state for this word.
nobreak_result = mip->mi_result;
mip->mi_result = save_result;
mip->mi_end = save_end;
}
else
{
// Compounding: the recursive call has set "mi_result" if a valid
// word follows, nothing more to do for this alternative.
if (mip->mi_result == SP_OK)
break;
continue;
}
}
// Translate the word flags into a result value.
if (flags & WF_BANNED)
res = SP_BANNED;
else if (flags & WF_REGION)
{
// Check region.
if ((mip->mi_lp->lp_region & (flags >> 16)) != 0)
res = SP_OK;
else
res = SP_LOCAL;
}
else if (flags & WF_RARE)
res = SP_RARE;
else
res = SP_OK;
// Always use the longest match and the best result. For NOBREAK
// we separately keep the longest match without a following good
// word as a fall-back.
if (nobreak_result == SP_BAD)
{
if (mip->mi_result2 > res)
{
mip->mi_result2 = res;
mip->mi_end2 = mip->mi_word + wlen;
}
else if (mip->mi_result2 == res
&& mip->mi_end2 < mip->mi_word + wlen)
mip->mi_end2 = mip->mi_word + wlen;
}
else if (mip->mi_result > res)
{
mip->mi_result = res;
mip->mi_end = mip->mi_word + wlen;
}
else if (mip->mi_result == res && mip->mi_end < mip->mi_word + wlen)
mip->mi_end = mip->mi_word + wlen;
if (mip->mi_result == SP_OK)
break;
}
// An OK result cannot be improved upon, stop trying shorter endings.
if (mip->mi_result == SP_OK)
break;
}
}
/*
 * Check the word of "wlen" bytes at "ptr" against the CHECKCOMPOUNDPATTERN
 * rules, assuming another word is going to be concatenated to it.
 *
 * "gap" (&sl_comppat) holds the rules as pairs of strings: entry i is the
 * first part, entry i + 1 the second part.
 * A rule matches when its second part matches the text right after the word
 * (at ptr + wlen) and its first part matches the end of the word.
 *
 * Returns TRUE when any rule matches, FALSE otherwise.
 */
int
match_checkcompoundpattern(
    char_u      *ptr,
    int         wlen,
    garray_T    *gap)  // &sl_comppat
{
    char_u      **patterns = (char_u **)gap->ga_data;
    char_u      *word_end = ptr + wlen;
    int         pair;

    for (pair = 0; pair + 1 < gap->ga_len; pair += 2)
    {
        char_u  *tail_pat = patterns[pair + 1];
        char_u  *head_pat;
        int     head_len;

        // The second part must match at the start of the following word.
        if (STRNCMP(word_end, tail_pat, STRLEN(tail_pat)) != 0)
            continue;

        // The first part must fit in, and match at the end of, this word.
        head_pat = patterns[pair];
        head_len = (int)STRLEN(head_pat);
        if (head_len > wlen)
            continue;
        if (STRNCMP(word_end - head_len, head_pat, head_len) == 0)
            return TRUE;
    }

    return FALSE;
}
/*
 * Decide whether the compound flags in "flags" form a sequence accepted by
 * the compound regexp of "slang" (sl_compprog), and whether "word" stays
 * within the syllable limit.
 *
 * "word" is only looked at when slang->sl_compsylmax is below MAXWLEN.
 *
 * Returns FALSE when there is no compound program or the flags do not match
 * it.  When the syllable limit is exceeded the result is TRUE only if the
 * number of flags is still below slang->sl_compmax.  Otherwise returns TRUE.
 */
int
can_compound(slang_T *slang, char_u *word, char_u *flags)
{
    char_u      uflags[MAXWLEN * 2];
    char_u      *match_text = flags;

    if (slang->sl_compprog == NULL)
        return FALSE;

    if (enc_utf8)
    {
        // The regexp works on characters: turn every single-byte flag into
        // its utf-8 representation first.
        char_u  *dst = uflags;
        char_u  *src;

        for (src = flags; *src != NUL; ++src)
            dst += utf_char2bytes(*src, dst);
        *dst = NUL;
        match_text = uflags;
    }

    if (!vim_regexec_prog(&slang->sl_compprog, FALSE, match_text, 0))
        return FALSE;

    // Counting syllables may be slow, thus it is done last and only when a
    // syllable maximum was specified.
    if (slang->sl_compsylmax >= MAXWLEN)
        return TRUE;
    if (count_syllables(slang, word) <= slang->sl_compsylmax)
        return TRUE;

    // Too many syllables: compounding is still allowed as long as the number
    // of compound words is below COMPOUNDWORDMAX.
    return (int)STRLEN(flags) < slang->sl_compmax;
}
/*
* Return TRUE if the compound flags in compflags[] match the start of any
* compound rule. This is used to stop trying a compound if the flags
* collected so far can't possibly match any compound rule.
* Caller must check that slang->sl_comprules is not NULL.
*/
int
match_compoundrule(slang_T *slang, char_u *compflags)
{
char_u *p;
int i;
int c;
// loop over all the COMPOUNDRULE entries
for (p = slang->sl_comprules; *p != NUL; ++p)
{
// loop over the flags in the compound word we have made, match
// them against the current rule entry
for (i = 0; ; ++i)
{
c = compflags[i];
if (c == NUL)
// found a rule that matches for the flags we have so far
return TRUE;
if (*p == '/' || *p == NUL)
break; // end of rule, it's too short
if (*p == '[')
{
int match = FALSE;
// compare against all the flags in []