Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Missing C Runtime Functions #5

Open
mingodad opened this issue Jul 28, 2021 · 5 comments
Open

Missing C Runtime Functions #5

mingodad opened this issue Jul 28, 2021 · 5 comments

Comments

@mingodad
Copy link

While trying to port Lua's regular-expression-like pattern matching to Jancy, I needed the following C runtime functions, which are currently missing and which I think would benefit others:

isalpha
iscntrl
isdigit
isgraph
islower
ispunct
isspace
isupper
isalnum
isxdigit
@mingodad
Copy link
Author

Also missing are the math functions and the limits of the integer types, i.e. the macro constants defined in the header <limits.h>:

CHAR_BIT
 
number of bits in a byte 
(macro constant)

MB_LEN_MAX
 
maximum number of bytes in a multibyte character 
(macro constant)

CHAR_MIN
 
minimum value of char 
(macro constant)

CHAR_MAX
 
maximum value of char 
(macro constant)

SCHAR_MIN
SHRT_MIN
INT_MIN
LONG_MIN
LLONG_MIN
  
(C99)
 
minimum value of signed char, short, int, long and long long respectively 
(macro constant)

SCHAR_MAX
SHRT_MAX
INT_MAX
LONG_MAX
LLONG_MAX
  
(C99)
 
maximum value of signed char, short, int, long and long long respectively 
(macro constant)

UCHAR_MAX
USHRT_MAX
UINT_MAX
ULONG_MAX
ULLONG_MAX
  
(C99)
 
maximum value of unsigned char, unsigned short, unsigned int,
unsigned long and unsigned long long respectively 
(macro constant)
...

@mingodad
Copy link
Author

Here is my lua-regex.jnc so far (to show what I'm trying to achieve); compiling it produces these errors:

jancy "lua-regex.jnc"
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(303,50): binary '+' cannot be applied to 'char [9]' and 'char*'
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(182,48): unexpected 'identifier' in 'literal'
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(189,36): undeclared identifier 'INT_MAX'
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(189,36): unable to recover from previous error(s)
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(42,1172): binary '+' cannot be applied to 'char [2]' and 'char const*'
5 error(s); compilation failed

lua-regex.jnc

/*
** maximum number of captures that a pattern can do during
** pattern-matching. This limit is arbitrary.
*/

alias ptrdiff_t = intptr_t;

/* Constants shared by the pattern matcher. */
exposed enum LuaRegexConsts
{
    LUA_REGEX_MAXCAPTURES = 32,  /* size of LuaMatchState.capture; start_capture refuses to go beyond it */
    CAP_UNFINISHED = -1,  /* stored in a capture's `len' while the capture is still open */
    CAP_POSITION = -2,  /* stored in a capture's `len' for a `()' position capture */
    L_ESC = '%',  /* escape character inside patterns */
}

/* One capture slot recorded while matching. */
struct LuaCapture {
    const char *init;  /* where the captured text starts inside the source string */
    intptr_t len;  /* captured length, or one of the markers CAP_UNFINISHED / CAP_POSITION */
};

/* State shared by all matcher functions during one str_find_aux call. */
struct LuaMatchState {
  const char *src_init;  /* init of source string */
  const char *src_end;  /* end ('\0') of source string */
  const char *p_end;  /* end ('\0') of pattern */
  size_t   start_pos;  /* pattern match start position */
  size_t   end_pos;    /* pattern match end position */
  const char *error;  /* null, or a message set when the pattern is malformed */
  int level;  /* total number of captures (finished or unfinished) */
  LuaCapture capture[LUA_REGEX_MAXCAPTURES];  /* capture slots; the first `level' entries are in use */
};

/* Callback type accepted by str_find_aux. It is invoked after a successful
   match with the match state and the caller's `udata' (str_find_aux passes
   null as the third argument); a non-zero result asks str_find_aux to keep
   searching past the current match. */
typedef int luaregex_func_param(LuaMatchState *ms, const void *udata, void **b);


/* Reinterpret a (possibly signed) char as its unsigned value. */
static unsigned char uchar(char c)
{
    unsigned char as_unsigned = (unsigned char)c;
    return as_unsigned;
}

//static const char L_ESC = '%';
/* Characters with a special meaning in a pattern; nospecials() scans a
   pattern for any of them. */
static const char SPECIALS[]  =  "^$*+?.([%-";

/* Intended to return `x' wrapped in single quotes (presumably a function
   stand-in for Lua's LUA_QL macro).
   NOTE(review): the compiler output quoted with this listing rejects the `+'
   below ("binary '+' cannot be applied to 'char [2]' and 'char const*'");
   pointers cannot be added, so the string has to be built another way -- TODO. */
static char *LUA_QL(const char *x)  {return  "'" + x + "'"; }
/* Returns LUA_QL("%s"), i.e. a single-quoted "%s" placeholder. */
static char *LUA_QS() {return LUA_QL("%s");}

/*
** Resolve a possibly relative string position: a negative `pos' counts back
** from the end of a string of length `len'. The result is clamped so it is
** never negative.
*/
static intptr_t posrelat (intptr_t pos, size_t len) {
  intptr_t resolved = pos;
  if (resolved < 0) {
    resolved += len;  /* negative: measure from the end */
  }
  if (resolved < 0) {
    return 0;  /* still before the start: clamp */
  }
  return resolved;
}

/*
** Verify that none of the ms->level recorded captures is still open.
** Returns 1 when all are closed; otherwise sets ms->error and returns 0.
*/
static int check_capture_all_closed (LuaMatchState *ms) {
  int idx = 0;
  while (idx < ms->level) {
    if (ms->capture[idx].len == CAP_UNFINISHED) {
      ms->error = "unfinished capture";
      return 0;
    }
    idx++;
  }
  return 1;
}

/*
** Check that `l' names an existing capture that has already been closed.
** Returns 1 if so; otherwise sets ms->error and returns 0.
*/
static int check_capture_is_closed (LuaMatchState *ms, int l) {
  int in_range = (l >= 0 && l < ms->level);
  if (!in_range) {
    ms->error = "invalid capture index";
    return 0;
  }
  if (ms->capture[l].len != CAP_UNFINISHED) {
    return 1;
  }
  ms->error = "unfinished capture";
  return 0;
}

/*
** Convert the digit character held in *l_out ('1' maps to index 0) into a
** capture index, store the index back through l_out, and validate it with
** check_capture_is_closed. Returns that function's result.
*/
static int check_capture (LuaMatchState *ms, int *l_out) {
  int idx = *l_out - '1';
  *l_out = idx;
  return check_capture_is_closed(ms, idx);
}

/*
** Find the most recently opened capture that is still unfinished.
** On success stores its index in *level_out and returns 1; if there is none,
** sets ms->error and returns 0.
*/
static int capture_to_close (LuaMatchState *ms, int *level_out) {
  int idx = ms->level - 1;
  while (idx >= 0) {
    if (ms->capture[idx].len == CAP_UNFINISHED) {
      *level_out = idx;
      return 1;
    }
    idx--;
  }
  ms->error = "invalid pattern capture";
  return 0;
}

/*
** Find the end of the single pattern item that starts at `p' (an escaped
** character, a `[...]' set, or a plain character).
** On success stores the position just past the item in *result and returns 1.
** On a malformed pattern sets ms->error and returns 0.
*/
static int classend (LuaMatchState *ms, const char *p, const char **result) {
  switch (*p++) {
    case L_ESC: {
      if (p == ms->p_end){
          /* plain literal: a LUA_QL() call cannot be spliced between string
             literals the way the original C macro could */
          ms->error = "malformed pattern (ends with '%%')";
          return 0;
      }
      *result = p+1;
      return 1;
    }
    case '[': {
      if (*p == '^') p++;
      do {  /* look for a `]' */
        if (p == ms->p_end){
            ms->error = "malformed pattern (missing ']')";
            return 0;
        }
        if (*(p++) == L_ESC && p < ms->p_end)
          p++;  /* skip escapes (e.g. `%]') */
      } while (*p != ']');
      *result = p+1;
      return 1;
    }
    default: {
      *result = p;
      return 1;
    }
  }
}

/*
** ASCII-only stand-in for the C library isalpha(): returns 1 when `c' is a
** letter in 'a'..'z' or 'A'..'Z', and 0 otherwise.
*/
static int isalpha(int c) {
  int is_lower = (c >= 'a' && c <= 'z');
  /* the bounds of this range used to be inverted, so no upper-case letter matched */
  int is_upper = (c >= 'A' && c <= 'Z');
  return is_lower || is_upper;
}

/*
** Test character `c' against the class letter `cl' (the character following
** the escape in a pattern). A lower-case class letter yields the class test
** itself; an upper-case one yields its negation. A `cl' that is not a known
** class letter is compared literally with `c'.
*/
static int match_class (int c, int cl) {
  int hit;
  int kind = tolower(cl);

  if (kind == 'a') hit = isalpha(c);
  else if (kind == 'c') hit = iscntrl(c);
  else if (kind == 'd') hit = isdigit(c);
  else if (kind == 'g') hit = isgraph(c);
  else if (kind == 'l') hit = islower(c);
  else if (kind == 'p') hit = ispunct(c);
  else if (kind == 's') hit = isspace(c);
  else if (kind == 'u') hit = isupper(c);
  else if (kind == 'w') hit = isalnum(c);
  else if (kind == 'x') hit = isxdigit(c);
  else if (kind == 'z') hit = (c == 0);  /* deprecated option */
  else return (cl == c);  /* not a class letter: literal comparison */

  if (islower(cl)) {
    return hit;
  }
  return !hit;  /* upper-case class letter negates the test */
}

/*
** Test character `c' against a bracket set. `p' points at the opening `['
** and `ec' at the closing `]'. A leading `^' inverts the result.
** Returns non-zero when `c' is accepted by the set.
*/
static int matchbracketclass (int c, const char *p, const char *ec) {
  int on_hit = 1;  /* value returned when some member of the set matches */
  if (*(p+1) == '^') {
    on_hit = 0;
    p++;  /* skip the `^' */
  }
  for (p++; p < ec; p++) {
    if (*p == L_ESC) {
      p++;  /* move to the class letter after the escape */
      if (match_class(c, uchar(*p)))
        return on_hit;
    }
    else if ((*(p+1) == '-') && (p+2 < ec)) {
      /* range `lo-hi' */
      int lo = uchar(*p);
      int hi = uchar(*(p+2));
      p += 2;
      if (lo <= c && c <= hi)
        return on_hit;
    }
    else if (uchar(*p) == c) {
      return on_hit;
    }
  }
  return !on_hit;
}

/*
** Test character `c' against the single pattern item that starts at `p' and
** ends just before `ep'. Returns non-zero on a match.
*/
static int singlematch (int c, const char *p, const char *ep) {
  char head = *p;
  if (head == '.')
    return 1;  /* matches any char */
  if (head == L_ESC)
    return match_class(c, uchar(*(p+1)));
  if (head == '[')
    return matchbracketclass(c, p, ep-1);
  return (uchar(head) == c);
}

//static const char *match (LuaMatchState *ms, const char *s, const char *p);

/*
** Match a balanced substring for `%b' / `%B'.
** The escape-character form (`%B') is an extension taken from
** https://github.com/jcgoble3/lua-matchext.
**
** `p' points at the arguments that follow the `b'/`B': for `%b' the opening
** and closing delimiters; for `%B' the opening delimiter, the escape
** character and the closing delimiter.
** Returns the position just past the balancing closing delimiter, or null
** when `s' does not start with the opening delimiter, the text is not
** balanced, or the arguments are missing (ms->error is set in that case).
*/
static const char *matchbalance (LuaMatchState *ms, const char *s,
                                   const char *p) {
  int escaped = (*(p-1) == 'B'); /* EXT */
  if (p >= ms->p_end - 1 - escaped){
    /* plain literal: a LUA_QL() call cannot be spliced between string literals */
    ms->error = "malformed pattern (missing arguments to '%%b')";
    return null;
  }
  if (*s != *p) return null;
  else {
    int b = *p;
    int e = *(p + (escaped ? 2 : 1));  /* EXT */
    /* EXT: only consulted when `escaped' is set, so no INT_MAX sentinel
       (which is not declared here) is needed */
    int esc = escaped ? *(p + 1) : 0;
    int cont = 1;
    while (++s < ms->src_end) {
      if (escaped && *s == esc) s++; /* EXT: skip the escaped character */
      else if (*s == e) {
        if (--cont == 0) return s+1;
      }
      else if (*s == b) cont++;
    }
  }
  return null;  /* string ends out of balance */
}


/*
** Greedy repetition: consume as many characters matching the item [p, ep) as
** possible, then try to match the rest of the pattern (after the suffix at
** `ep'), giving back one character at a time on failure.
** Returns the end of the overall match, or null.
*/
static const char *max_expand (LuaMatchState *ms, const char *s,
                                 const char *p, const char *ep) {
  ptrdiff_t count = 0;  /* how many repetitions of the item were consumed */
  while ((s + count) < ms->src_end && singlematch(uchar(*(s + count)), p, ep)) {
    count++;
  }
  /* try the longest candidate first, then shorter ones */
  for (; count >= 0; count--) {
    const char *res = match(ms, (s + count), ep+1);
    if (res) {
      return res;
    }
  }
  return null;
}


/*
** Lazy repetition: try to match the rest of the pattern (after the suffix at
** `ep') first, and consume one more character matching the item [p, ep) only
** when that fails.
** Returns the end of the overall match, or null.
*/
static const char *min_expand (LuaMatchState *ms, const char *s,
                                 const char *p, const char *ep) {
  const char *res = match(ms, s, ep+1);
  while (res == null) {
    if (!(s < ms->src_end && singlematch(uchar(*s), p, ep))) {
      return null;  /* cannot extend any further */
    }
    s++;  /* try with one more repetition */
    res = match(ms, s, ep+1);
  }
  return res;
}


/*
** Open a new capture at `s' (its `len' is set to `what') and continue
** matching at `p'. The capture is dropped again when the rest fails.
** Returns the end of the match, or null (ms->error is set when the capture
** limit is reached).
*/
static const char *start_capture (LuaMatchState *ms, const char *s,
                                    const char *p, int what) {
  const char *res;
  int slot = ms->level;
  if (slot >= LUA_REGEX_MAXCAPTURES) {
    ms->error = "too many captures";
    return null;
  }
  ms->capture[slot].init = s;
  ms->capture[slot].len = what;
  ms->level = slot + 1;
  res = match(ms, s, p);
  if (res == null) {
    ms->level--;  /* match failed: undo capture */
  }
  return res;
}


/*
** Close the innermost open capture at `s' and continue matching at `p'.
** The capture is reopened when the rest of the pattern fails.
** Returns the end of the match, or null.
*/
static const char *end_capture (LuaMatchState *ms, const char *s,
                                  const char *p) {
  int open_idx;
  const char *res;
  if (!capture_to_close(ms, &open_idx)) {
    return null;
  }
  ms->capture[open_idx].len = s - ms->capture[open_idx].init;  /* close capture */
  res = match(ms, s, p);
  if (res == null) {
    ms->capture[open_idx].len = CAP_UNFINISHED;  /* match failed: undo capture */
  }
  return res;
}


/*
** Match a back-reference: `l' is the digit character naming the capture.
** Succeeds when the text at `s' equals the text of that (closed) capture.
** Returns the position after the matched text, or null.
*/
static const char *match_capture (LuaMatchState *ms, const char *s, int l) {
  size_t len;
  size_t remaining;
  if (!check_capture(ms, &l)) {
    return null;
  }
  len = ms->capture[l].len;
  remaining = (size_t)(ms->src_end - s);
  if (remaining < len) {
    return null;  /* not enough source text left */
  }
  if (memcmp(ms->capture[l].init, s, len) != 0) {
    return null;
  }
  return s + len;
}


/*
** Core matcher: try to match the pattern starting at `p' against the source
** text starting at `s'.
** Returns the position just past the matched text, or null when there is no
** match (ms->error is set by the helpers when the pattern is malformed).
** Cases that would be tail calls are handled by updating `s'/`p' and
** restarting the loop.
*/
static const char *match (LuaMatchState *ms, const char *s, const char *p) {
  for(;;) {
  if (p == ms->p_end)  /* end of pattern? */
    return s;  /* match succeeded */
  switch (*p) {
    case '(': {  /* start capture */
      if (*(p+1) == ')')  /* position capture? */
        return start_capture(ms, s, p+2, CAP_POSITION);
      else
        return start_capture(ms, s, p+1, CAP_UNFINISHED);
    }
    case ')': {  /* end capture */
      return end_capture(ms, s, p+1);
    }
    case '$': {
      if ((p+1) == ms->p_end)  /* is the `$' the last char in pattern? */
        return (s == ms->src_end) ? s : null;  /* check end of string */
      else break;  /* treat as an ordinary item below */
    }
    case L_ESC: {  /* escaped sequences not in the format class[*+?-]? */
      switch (*(p+1)) {
        case 'b': case 'B': { /* balanced string? */ /* EXT */
          s = matchbalance(ms, s, p+2);
          if (s == null) return null;
          p += (*(p + 1) == 'b') ? 4 : 5; /* EXT */
          continue;  /* i.e. match(ms, s, p+4) or match(ms, s, p+5) */
        }
        case 'f': {  /* frontier? */
          const char *ep; char previous;
          p += 2;
          if (*p != '['){
            /* one plain literal: string pointers cannot be joined with `+' */
            ms->error = "missing '[' after '%%f' in pattern";
            return null;
          }
          if(!classend(ms, p, &ep)) return null;  /* points to what is next */
          previous = (s == ms->src_init) ? '\0' : *(s-1);
          if (matchbracketclass(uchar(previous), p, ep-1) ||
             !matchbracketclass(uchar(*s), p, ep-1)) return null;
          p=ep; continue;  /* i.e. match(ms, s, ep) */
        }
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case '8': case '9': {  /* capture results (%0-%9)? */
          s = match_capture(ms, s, uchar(*(p+1)));
          if (s == null) return null;
          p+=2; continue;  /* i.e. match(ms, s, p+2) */
        }
        /* any other escaped character is an ordinary item: fall out of both
           switches into the block below */
      }
    }
  }
    {  /* pattern class plus optional suffix */
      const char *ep;
      int m;
      if(!classend(ms, p, &ep)) return null;  /* points to what is next */
      m = s < ms->src_end && singlematch(uchar(*s), p, ep);
      switch (*ep) {
        case '?': {  /* optional */
          const char *res;
          if (m && ((res=match(ms, s+1, ep+1)) != null))
            return res;
          p=ep+1; continue;  /* i.e. match(ms, s, ep+1) */
        }
        case '*': {  /* 0 or more repetitions */
          return max_expand(ms, s, p, ep);
        }
        case '+': {  /* 1 or more repetitions */
          return (m ? max_expand(ms, s+1, p, ep) : null);
        }
        case '-': {  /* 0 or more repetitions (minimum) */
          return min_expand(ms, s, p, ep);
        }
        default: {
          if (!m) return null;
          s++; p=ep; continue;  /* i.e. match(ms, s+1, ep) */
        }
      }
    }
  break;
  }
}


/*
** Plain (non-pattern) search: find the first occurrence of the `l2' bytes at
** `s2' inside the `l1' bytes at `s1'.
** Returns a pointer to the occurrence, `s1' itself when `l2' is 0, or null
** when there is none.
*/
static const char *lmemfind (const char *s1, size_t l1,
                               const char *s2, size_t l2) {
  size_t tail;    /* bytes of `s2' after its first one */
  size_t window;  /* how many start positions are still worth trying */

  if (l2 == 0) return s1;  /* empty strings are everywhere */
  if (l2 > l1) return null;  /* needle longer than haystack */

  tail = l2 - 1;  /* 1st char will be checked by `memchr' */
  window = l1 - tail;  /* `s2' cannot start after that */
  while (window > 0) {
    const char *hit = (const char *)memchr(s1, *s2, window);
    const char *after;
    if (hit == null) {
      break;
    }
    after = hit + 1;  /* 1st char is already checked */
    if (memcmp(after, s2+1, tail) == 0) {
      return hit;
    }
    /* shrink the window and resume right after the failed candidate */
    window -= after - s1;
    s1 = after;
  }
  return null;  /* not found */
}


/*
** Check whether the pattern `p' of length `l' is free of the characters in
** SPECIALS. The pattern may contain embedded '\0' bytes, so each
** '\0'-terminated piece is scanned in turn.
** Returns 1 when no special character is found, 0 otherwise.
*/
static int nospecials (const char *p, size_t l) {
  size_t offset = 0;
  for (;;) {
    if (strpbrk(p + offset, SPECIALS))
      return 0;  /* pattern has a special character */
    offset += strlen(p + offset) + 1;  /* may have more after \0 */
    if (offset > l)
      break;
  }
  return 1;  /* no special chars found */
}


/*
** Search the source string `s' for the pattern `p', starting at `init'.
**
**   ms       match state, filled in by this call
**   find     non-zero allows a plain substring search when possible
**   s, ls    source string and its length (ls < 0: use strlen(s))
**   p, lp    pattern and its length (lp < 0: use strlen(p))
**   init     0-based start position; negative counts back from the end
**   raw_find non-zero forces a plain search (only consulted when `find' is set)
**   fp       optional callback invoked after each successful match; a
**            non-zero result makes the search continue past that match
**   udata    passed through to `fp'
**
** Returns ms->start_pos when the last match ended at a position > 0,
** otherwise the raw result (-1 when nothing was found). Returns 0 when
** `init' lies past the end of the source or a capture was left unfinished.
*/
static ptrdiff_t str_find_aux (LuaMatchState *ms, int find, const char *s, ptrdiff_t ls,
                         const char *p, ptrdiff_t lp, ptrdiff_t init, int raw_find,
                         luaregex_func_param *fp, void *udata) {
  ptrdiff_t result;
  ms->error = null;
  if(ls < 0) ls = strlen(s);
  assert(ls >= 0);
  if(lp < 0) lp = strlen(p);
  assert(lp >= 0);
  init = posrelat(init, ls);  /* never negative */
  /* Positions are 0-based here, so `ls' itself is the last valid start (empty
     match at the end). The former bound `ls + 1' let `ls - init' below become
     -1, which turns into a huge unsigned search length. */
  if (init > ls) {  /* start after string's end? */
    return 0; /* cannot find anything */
  }
  ms->src_init = s;
  ms->src_end = s + ls;

for(;;) {  /* one iteration per search attempt; restarted via `continue' */
  result = -1; /* not found */
  /* explicit request or no special characters? */
  if (find && (raw_find || nospecials(p, lp))) {
    /* do a plain search */
    const char *s2 = lmemfind(s + init, ls - init, p, lp);
    if (s2) {
      ms->start_pos = ((int)(s2 - s));
      result = ms->end_pos = ms->start_pos+lp;
      ms->level = 0;
    }
  }
  else {
    const char *s1 = s + init;
    int anchor = (*p == '^');
    if (anchor) {
      p++; lp--;  /* skip anchor character */
    }
    ms->p_end = p + lp;
    do {
      const char *res;
      ms->level = 0;
      if ((res=match(ms, s1, p)) != null) {
          ms->start_pos = s1-s;
          result = ms->end_pos = res-s;
          break;  /* found: leave the scan loop */
      }
    } while (s1++ < ms->src_end && !anchor);
  }

  if(result >= 0){
      if(!check_capture_all_closed(ms)) return 0;
      if(fp && fp(ms, udata, null)) {
          init = result;
          if (init == ms->start_pos) ++init;  /* empty match? go at least one position */
          if (init < ls) continue;  /* search again after this match */
      }
  }

  break;
}

  return result > 0 ? ms->start_pos : result; //returning the start position
}

/* Small smoke test that exercises a few of the helpers and prints the results. */
int main ()
{
	LuaMatchState ms;

	printf ("lua-regex!\n");
	/* "%d" expects an int, so pointer-sized results are cast explicitly */
	printf("%d\n", (int)posrelat(-10, 12));

	printf("match_class : %d\n", match_class('f', 'x'));

	char const* p1 = " foo bar 100 baz";
	const char * p2 = "baz";
	const char *found = lmemfind(p1, strlen(p1), p2, strlen(p2));

	printf("found : %s\n", found);

	/* plain (raw) search for p2 inside p1, starting at position 0, no callback */
	ptrdiff_t dt = str_find_aux(&ms, 1, p1, strlen(p1), p2, strlen(p2), 0, 1, null, null);
	printf("found : %d\n", (int)dt);

	return 0;
}

@mingodad
Copy link
Author

And here is my initial implementation of isalpha, ...:

diff --git a/src/jnc_ext/jnc_std/jnc/std_globals.jnc b/src/jnc_ext/jnc_std/jnc/std_globals.jnc
index 951cc88a..079a7e3c 100644
--- a/src/jnc_ext/jnc_std/jnc/std_globals.jnc
+++ b/src/jnc_ext/jnc_std/jnc/std_globals.jnc
@@ -442,6 +442,17 @@ intptr_t cdecl printf(
 	...
 	);
 
+bool isalpha(uint32_t c);
+bool iscntrl(uint32_t c);
+bool isdigit(uint32_t c);
+bool isgraph(uint32_t c);
+bool islower(uint32_t c);
+bool ispunct(uint32_t c);
+bool isspace(uint32_t c);
+bool isupper(uint32_t c);
+bool isalnum(uint32_t c);
+bool isxdigit(uint32_t c);
+
 //! @}
 
 namespace std {
diff --git a/src/jnc_ext/jnc_std/jnc_std_StdLib.cpp b/src/jnc_ext/jnc_std/jnc_std_StdLib.cpp
index 92bdfa16..003a871f 100644
--- a/src/jnc_ext/jnc_std/jnc_std_StdLib.cpp
+++ b/src/jnc_ext/jnc_std/jnc_std_StdLib.cpp
@@ -148,6 +148,56 @@ strtoul(
 	return strtot<uint64_t>(::_strtoui64, ptr, endPtr, radix);
 }
 
+bool isAlpha(uint32_t c)
+{
+    return enc::utfIsLetter(c);
+}
+
+bool isCntrl(uint32_t c)
+{
+    return iscntrl(c);
+}
+
+bool isDigit(uint32_t c)
+{
+    return enc::utfIsDigit(c);
+}
+
+bool isGraph(uint32_t c)
+{
+    return isgraph(c);
+}
+
+bool isLower(uint32_t c)
+{
+    return enc::utfIsLowerCase(c);
+}
+
+bool isPunct(uint32_t c)
+{
+    return enc::utfIsPunctuation(c);
+}
+
+bool isSpace(uint32_t c)
+{
+    return enc::utfIsSpace(c);
+}
+
+bool isUpper(uint32_t c)
+{
+    return enc::utfIsUpperCase(c);
+}
+
+bool isAlnum(uint32_t c)
+{
+    return enc::utfIsLetterOrDigit(c);
+}
+
+bool isXdigit(uint32_t c)
+{
+    return isxdigit(c);
+}
+
 uint32_t
 toUpper(uint32_t c)
 {
@@ -679,6 +729,17 @@ JNC_BEGIN_LIB_FUNCTION_MAP(jnc_StdLib)
 	JNC_MAP_OVERLOAD(setError_1)
 	JNC_MAP_FUNCTION("std.format",       format)
 
+	JNC_MAP_FUNCTION("isalpha",   isAlpha)
+	JNC_MAP_FUNCTION("iscntrl",   isCntrl)
+	JNC_MAP_FUNCTION("isdigit",   isDigit)
+	JNC_MAP_FUNCTION("isgraph",   isGraph)
+	JNC_MAP_FUNCTION("islower",   isLower)
+	JNC_MAP_FUNCTION("ispunct",   isPunct)
+	JNC_MAP_FUNCTION("isspace",   isSpace)
+	JNC_MAP_FUNCTION("isupper",   isUpper)
+	JNC_MAP_FUNCTION("isalnum",   isAlnum)
+	JNC_MAP_FUNCTION("isxdigit",  isXdigit)
+
 	JNC_MAP_FUNCTION("strlen",   jnc::strLen)
 	JNC_MAP_FUNCTION("strcmp",   strCmp)
 	JNC_MAP_FUNCTION("strncmp",  strnCmp)

@vovkos
Copy link
Owner

vovkos commented Jul 31, 2021

My lua-regex.jnc so far (to show what I'm trying to achieve) that has this error:

jancy "lua-regex.jnc"
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(303,50): binary '+' cannot be applied to 'char [9]' and 'char*'
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(182,48): unexpected 'identifier' in 'literal'
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(189,36): undeclared identifier 'INT_MAX'
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(189,36): unable to recover from previous error(s)
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(42,1172): binary '+' cannot be applied to 'char [2]' and 'char const*'
5 error(s); compilation failed

You are trying to add char pointers/char arrays, and that doesn't work -- just like in C.

For building strings please use:

@vovkos
Copy link
Owner

vovkos commented Jul 31, 2021

And here is my initial implementation of isalpha, ...:

A PR with those standard C runtime functions would be very welcome.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants