Patch for the mc git version (4.8.0, 2011-04-03):
- enables the use of escape sequences inside regex replacement strings;
- enables UTF-8 caseless search with PCRE.
Supported escape sequences: \DEC, \xHEX, \0OCT, \n, \t, \v,
\b, \r, \f, \a. Any of them may be enclosed in braces, e.g. \{n} or \{x41}.
diff --git a/lib/search/regex.c b/lib/search/regex.c
index 6917972..e63f3cc 100644
a
|
b
|
static int |
378 | 378 | mc_search_regex__process_replace_str (const GString * replace_str, const gsize current_pos, |
379 | 379 | gsize * skip_len, replace_transform_type_t * replace_flags) |
380 | 380 | { |
381 | | int ret = -1; |
| 381 | int ret = -1; /* nothing special */ |
382 | 382 | char *tmp_str; |
383 | 383 | const char *curr_str = &(replace_str->str[current_pos]); |
384 | 384 | |
… |
… |
mc_search_regex__process_replace_str (const GString * replace_str, const gsize c |
387 | 387 | |
388 | 388 | *skip_len = 0; |
389 | 389 | |
390 | | if (*curr_str == '$' && *(curr_str + 1) == '{' && (*(curr_str + 2) & (char) 0xf0) == 0x30) |
| 390 | if (replace_str->len > current_pos + 2 && |
| 391 | *curr_str == '$' && |
| 392 | *(curr_str + 1) == '{' && |
| 393 | (*(curr_str + 2) & (char) 0xf0) == 0x30) |
391 | 394 | { |
392 | 395 | if (strutils_is_char_escaped (replace_str->str, curr_str)) |
393 | 396 | { |
394 | 397 | *skip_len = 1; |
395 | | return -1; |
| 398 | return -1; /* nothing special */ |
396 | 399 | } |
397 | 400 | |
398 | 401 | for (*skip_len = 0; |
… |
… |
mc_search_regex__process_replace_str (const GString * replace_str, const gsize c |
400 | 403 | && (*(curr_str + 2 + *skip_len) & (char) 0xf0) == 0x30; (*skip_len)++); |
401 | 404 | |
402 | 405 | if (*(curr_str + 2 + *skip_len) != '}') |
403 | | return -1; |
| 406 | return -1; /* nothing special */ |
404 | 407 | |
405 | 408 | tmp_str = g_strndup (curr_str + 2, *skip_len); |
406 | 409 | if (tmp_str == NULL) |
407 | | return -1; |
| 410 | return -1; /* nothing special */ |
408 | 411 | |
409 | 412 | ret = atoi (tmp_str); |
410 | 413 | g_free (tmp_str); |
411 | 414 | |
412 | 415 | *skip_len += 3; /* ${} */ |
413 | | return ret; |
| 416 | return ret; /* capture buffer index >= 0 */ |
414 | 417 | } |
415 | 418 | |
416 | | if (*curr_str == '\\') |
| 419 | if (replace_str->len > current_pos + 1 && |
| 420 | *curr_str == '\\') |
417 | 421 | { |
| 422 | char next_char; |
418 | 423 | if (strutils_is_char_escaped (replace_str->str, curr_str)) |
419 | 424 | { |
420 | 425 | *skip_len = 1; |
421 | | return -1; |
| 426 | return -1; /* nothing special */ |
422 | 427 | } |
423 | 428 | |
| 429 | next_char = *(curr_str + 1); |
424 | 430 | if (g_ascii_isdigit (*(curr_str + 1))) |
425 | 431 | { |
426 | | ret = g_ascii_digit_value (*(curr_str + 1)); |
| 432 | ret = g_ascii_digit_value (*(curr_str + 1)); /* capture buffer index >= 0 */ |
427 | 433 | *skip_len = 2; /* \\ and one digit */ |
428 | 434 | return ret; |
429 | 435 | } |
430 | | ret = -2; |
| 436 | |
| 437 | if (replace_str->len > current_pos+2) |
| 438 | { |
| 439 | if (next_char == '{') |
| 440 | { |
| 441 | for (*skip_len = 2; /* \{ */ |
| 442 | current_pos + *skip_len < replace_str->len |
| 443 | && (*(curr_str + *skip_len)) != '}'; (*skip_len)++); |
| 444 | if (current_pos + *skip_len < replace_str->len) /* } */ |
| 445 | (*skip_len)++; |
| 446 | return -3; /* escape sequence */ |
| 447 | } |
| 448 | |
| 449 | if (next_char == 'x') |
| 450 | { |
| 451 | *skip_len = 2; /* \x */ |
| 452 | next_char = *(curr_str + 2); |
| 453 | if (next_char == '{') |
| 454 | { |
| 455 | for (*skip_len = 3; /* \x{ */ |
| 456 | current_pos + *skip_len < replace_str->len |
| 457 | && (*(curr_str + *skip_len)) != '}'; (*skip_len)++); |
| 458 | if (current_pos + *skip_len < replace_str->len) |
| 459 | (*skip_len)++; |
| 460 | return -3; /* escape sequence */ |
| 461 | } |
| 462 | else if ((next_char < '0' || next_char > '9') && |
| 463 | (next_char < 'a' || next_char > 'f') && |
| 464 | (next_char < 'A' || next_char > 'F')) |
| 465 | { |
| 466 | *skip_len = 2; /* \x without number behind */ |
| 467 | return -1; /* nothing special */ |
| 468 | } |
| 469 | else |
| 470 | { |
| 471 | next_char = *(curr_str + 3); |
| 472 | if ((next_char < '0' || next_char > '9') && |
| 473 | (next_char < 'a' || next_char > 'f') && |
| 474 | (next_char < 'A' || next_char > 'F')) |
| 475 | *skip_len = 3; /* \xH */ |
| 476 | else |
| 477 | *skip_len = 4; /* \xHH */ |
| 478 | return -3; /* escape sequence */ |
| 479 | } |
| 480 | } |
| 481 | } |
| 482 | |
| 483 | if (next_char == 'n' || next_char == 't' || |
| 484 | next_char == 'v' || next_char == 'b' || |
| 485 | next_char == 'r' || next_char == 'f' || |
| 486 | next_char == 'a') |
| 487 | { |
| 488 | *skip_len = 2; |
| 489 | return -3; /* escape sequence */ |
| 490 | } |
| 491 | |
| 492 | ret = -2; /* replace flag */ |
431 | 493 | *skip_len += 2; |
432 | | switch (*(curr_str + 1)) |
| 494 | switch (next_char) |
433 | 495 | { |
434 | 496 | case 'U': |
435 | 497 | *replace_flags |= REPLACE_T_UPP_TRANSFORM; |
… |
… |
mc_search_regex__process_replace_str (const GString * replace_str, const gsize c |
449 | 511 | *replace_flags = REPLACE_T_NO_TRANSFORM; |
450 | 512 | break; |
451 | 513 | default: |
452 | | ret = -1; |
| 514 | ret = -1; /* nothing special */ |
453 | 515 | break; |
454 | 516 | } |
455 | 517 | } |
… |
… |
mc_search_regex__process_append_str (GString * dest_str, const char *from, gsize |
517 | 579 | |
518 | 580 | } |
519 | 581 | |
| 582 | static void /* Decode one escape sequence (given without its leading backslash) and append the resulting single byte to dest_str. */ |
| 583 | mc_search_regex__process_escape_sequence (GString * dest_str, const char *from, gsize len, /* from, len: text of the sequence; len equal to (gsize) -1 means the length is taken with strlen */ |
| 584 | replace_transform_type_t * replace_flags) /* replace_flags is only forwarded to mc_search_regex__process_append_str for unrecognized sequences */ |
| 585 | { |
| 586 | gsize i = 0; /* read position inside from */ |
| 587 | char c = 0; /* decoded byte; stays 0 when no digit is consumed */ |
| 588 | if (len == (gsize) -1) /* sentinel length: measure the NUL-terminated string instead */ |
| 589 | len = strlen (from); |
| 590 | if (len <= 0) /* NOTE(review): gsize is presumably unsigned, which makes this equivalent to len == 0 - confirm */ |
| 591 | return; /* empty input: nothing is appended */ |
| 592 | if (from[i] == '{') /* optional opening brace of the braced form */ |
| 593 | i++; |
| 594 | if (i >= len) /* input was only an opening brace: nothing is appended */ |
| 595 | return; |
| 596 | if (from[i] == 'x') /* hexadecimal form */ |
| 597 | { |
| 598 | i++; |
| 599 | if (i < len && from[i] == '{') /* optional opening brace after the x */ |
| 600 | i++; |
| 601 | for (; i < len; i++) /* accumulate hex digits until the first non-hex character or the end of input */ |
| 602 | { |
| 603 | if (from[i] >= '0' && from[i] <= '9') |
| 604 | c = c*16 + from[i] - '0'; /* NOTE(review): c is a plain char, so values that do not fit a char are truncated - confirm this is intended */ |
| 605 | else if (from[i] >= 'a' && from[i] <= 'f') |
| 606 | c = c*16 + 10 + from[i] - 'a'; |
| 607 | else if (from[i] >= 'A' && from[i] <= 'F') |
| 608 | c = c*16 + 10 + from[i] - 'A'; |
| 609 | else |
| 610 | break; /* stops at any other character, e.g. a closing brace; the rest of the input is ignored */ |
| 611 | } |
| 612 | } /* NOTE(review): if no hex digit followed, c is still 0 and a NUL byte is appended below */ |
| 613 | else if (from[i] >= '0' && from[i] <= '9') /* numeric form: entered for any decimal digit */ |
| 614 | for (; i < len && from[i] >= '0' && from[i] <= '7'; i++) /* but only octal digits are consumed. NOTE(review): a leading 8 or 9 leaves c at 0, and decimal values are not parsed here - confirm intent */ |
| 615 | c = c*8 + from[i] - '0'; /* base-8 accumulation */ |
| 616 | else |
| 617 | { |
| 618 | switch (from[i]) /* single-letter escapes map to the matching C character escape */ |
| 619 | { |
| 620 | case 'n': c = '\n'; break; |
| 621 | case 't': c = '\t'; break; |
| 622 | case 'v': c = '\v'; break; |
| 623 | case 'b': c = '\b'; break; |
| 624 | case 'r': c = '\r'; break; |
| 625 | case 'f': c = '\f'; break; |
| 626 | case 'a': c = '\a'; break; |
| 627 | default: /* unrecognized sequence */ |
| 628 | mc_search_regex__process_append_str(dest_str, from, len, replace_flags); /* from and len are passed unchanged, so a skipped opening brace is included in what is handed over */ |
| 629 | return; /* no decoded byte is appended on this path */ |
| 630 | } |
| 631 | } |
| 632 | g_string_append_len (dest_str, &c, 1); /* append exactly one byte, even when it is 0 */ |
| 633 | } |
| 634 | |
520 | 635 | /*** public functions ****************************************************************************/ |
521 | 636 | |
522 | 637 | void |
… |
… |
mc_search__cond_struct_new_init_regex (const char *charset, mc_search_t * lc_mc_ |
526 | 641 | #ifdef SEARCH_TYPE_GLIB |
527 | 642 | GError *error = NULL; |
528 | 643 | #else /* SEARCH_TYPE_GLIB */ |
| 644 | int utf8 = 0; |
529 | 645 | const char *error; |
530 | 646 | int erroffset; |
531 | 647 | #endif /* SEARCH_TYPE_GLIB */ |
532 | 648 | |
| 649 | #ifdef SEARCH_TYPE_GLIB |
533 | 650 | if (!lc_mc_search->is_case_sensitive) |
534 | 651 | { |
535 | 652 | GString *tmp; |
… |
… |
mc_search__cond_struct_new_init_regex (const char *charset, mc_search_t * lc_mc_ |
538 | 655 | mc_search_cond->str = mc_search__cond_struct_new_regex_ci_str (charset, tmp); |
539 | 656 | g_string_free (tmp, TRUE); |
540 | 657 | } |
541 | | #ifdef SEARCH_TYPE_GLIB |
542 | 658 | mc_search_cond->regex_handle = |
543 | 659 | g_regex_new (mc_search_cond->str->str, G_REGEX_OPTIMIZE | G_REGEX_RAW | G_REGEX_DOTALL, 0, |
544 | 660 | &error); |
… |
… |
mc_search__cond_struct_new_init_regex (const char *charset, mc_search_t * lc_mc_ |
551 | 667 | return; |
552 | 668 | } |
553 | 669 | #else /* SEARCH_TYPE_GLIB */ |
554 | | mc_search_cond->regex_handle = |
555 | | pcre_compile (mc_search_cond->str->str, PCRE_EXTRA, &error, &erroffset, NULL); |
| 670 | if (charset && !strncasecmp(charset, "utf-8", 6)) |
| 671 | utf8 = 1; |
| 672 | mc_search_cond->regex_handle = pcre_compile (mc_search_cond->str->str, |
| 673 | (!mc_search->is_case_sentitive ? PCRE_CASELESS : 0) | |
| 674 | PCRE_MULTILINE | |
| 675 | (utf8 ? PCRE_UTF8 : 0) | |
| 676 | PCRE_EXTRA, &error, &erroffset, NULL); |
556 | 677 | if (mc_search_cond->regex_handle == NULL) |
557 | 678 | { |
558 | 679 | lc_mc_search->error = MC_SEARCH_E_REGEX_COMPILE; |
… |
… |
mc_search_regex_prepare_replace_str (mc_search_t * lc_mc_search, GString * repla |
708 | 829 | { |
709 | 830 | lc_index = mc_search_regex__process_replace_str (replace_str, loop, &len, &replace_flags); |
710 | 831 | |
| 832 | /* nothing special */ |
711 | 833 | if (lc_index == -1) |
712 | 834 | { |
713 | 835 | if (len != 0) |
… |
… |
mc_search_regex_prepare_replace_str (mc_search_t * lc_mc_search, GString * repla |
723 | 845 | continue; |
724 | 846 | } |
725 | 847 | |
| 848 | /* replace flag (transform) */ |
726 | 849 | if (lc_index == -2) |
727 | 850 | { |
728 | 851 | if (loop) |
… |
… |
mc_search_regex_prepare_replace_str (mc_search_t * lc_mc_search, GString * repla |
734 | 857 | continue; |
735 | 858 | } |
736 | 859 | |
| 860 | /* escape sequence */ |
| 861 | if (lc_index == -3) { |
| 862 | mc_search_regex__process_append_str (ret, prev_str, |
| 863 | replace_str->str + loop - prev_str, |
| 864 | &replace_flags); |
| 865 | /* call process_escape_sequence without starting '\\' */ |
| 866 | mc_search_regex__process_escape_sequence (ret, replace_str->str + loop + 1, len - 1, |
| 867 | &replace_flags); |
| 868 | prev_str = replace_str->str + loop + len; |
| 869 | loop += len - 1; |
| 870 | continue; |
| 871 | } |
| 872 | |
| 873 | /* invalid capture buffer number */ |
737 | 874 | if (lc_index > lc_mc_search->num_results) |
738 | 875 | { |
739 | 876 | g_string_free (ret, TRUE); |