Ruby 3.2.1p31 (2023-02-08 revision 31819e82c88c6f8ecfaeb162519bfa26a14b21fd)
string.c
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "gc.h"
27#include "id.h"
28#include "internal.h"
29#include "internal/array.h"
30#include "internal/compar.h"
31#include "internal/compilers.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/re.h"
39#include "internal/sanitizers.h"
40#include "internal/string.h"
41#include "internal/transcode.h"
42#include "probes.h"
43#include "ruby/encoding.h"
44#include "ruby/re.h"
45#include "ruby/util.h"
46#include "ruby_assert.h"
47#include "vm_sync.h"
48
49#if defined HAVE_CRYPT_R
50# if defined HAVE_CRYPT_H
51# include <crypt.h>
52# endif
53#elif !defined HAVE_CRYPT
54# include "missing/crypt.h"
55# define HAVE_CRYPT_R 1
56#endif
57
58#define BEG(no) (regs->beg[(no)])
59#define END(no) (regs->end[(no)])
60
61#undef rb_str_new
62#undef rb_usascii_str_new
63#undef rb_utf8_str_new
64#undef rb_enc_str_new
65#undef rb_str_new_cstr
66#undef rb_usascii_str_new_cstr
67#undef rb_utf8_str_new_cstr
68#undef rb_enc_str_new_cstr
69#undef rb_external_str_new_cstr
70#undef rb_locale_str_new_cstr
71#undef rb_str_dup_frozen
72#undef rb_str_buf_new_cstr
73#undef rb_str_buf_cat
74#undef rb_str_buf_cat2
75#undef rb_str_cat2
76#undef rb_str_cat_cstr
77#undef rb_fstring_cstr
78
81
82/* FLAGS of RString
83 *
84 * 1: RSTRING_NOEMBED
85 * 2: STR_SHARED (== ELTS_SHARED)
86 * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
87 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
88 * other strings that rely on this string's buffer)
89 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
90 * early, specific to rb_str_tmp_frozen_{acquire,release})
91 * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
92 * such as read(2). Any modification and realloc is prohibited)
93 *
94 * 8-9: ENC_CODERANGE (2 bits)
95 * 10-16: ENCODING (7 bits == 128)
96 * 17: RSTRING_FSTR
97 * 18: STR_NOFREE (do not free this string's buffer when a String is freed.
98 * used for a string object based on C string literal)
99 * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
100 * object header is temporarily allocated on C stack)
101 */
102
/* NOTE(review): presumably the maximum byte length of one character in any
 * supported encoding -- not established by this part of the file; confirm. */
103#define RUBY_MAX_CHAR_LEN 16
/* String-specific aliases for the per-object user flag bits.  The bit
 * numbers match the "FLAGS of RString" table in the comment above. */
104#define STR_SHARED_ROOT FL_USER5
105#define STR_BORROWED FL_USER6
106#define STR_TMPLOCK FL_USER7
107#define STR_NOFREE FL_USER18
108#define STR_FAKESTR FL_USER19
109
/* Flag `str` as not embedded.  With USE_RVARGC the STR_SHARED,
 * STR_SHARED_ROOT and STR_BORROWED flags are cleared as well; otherwise the
 * embedded-length field is reset to 0 (per the flag table above, that field
 * shares flag bits 2-6 with the sharing flags). */
110#define STR_SET_NOEMBED(str) do {\
111 FL_SET((str), STR_NOEMBED);\
112 if (USE_RVARGC) {\
113 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
114 }\
115 else {\
116 STR_SET_EMBED_LEN((str), 0);\
117 }\
118} while (0)
/* Flag `str` as embedded: clears both STR_NOEMBED and STR_NOFREE. */
119#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
/* STR_SET_EMBED_LEN stores the length of an embedded string.  With
 * USE_RVARGC it lives in as.embed.len (and must be smaller than the embed
 * capacity); otherwise it is packed into the flags word. */
120#if USE_RVARGC
121# define STR_SET_EMBED_LEN(str, n) do { \
122 assert(str_embed_capa(str) > (n));\
123 RSTRING(str)->as.embed.len = (n);\
124} while (0)
125#else
126# define STR_SET_EMBED_LEN(str, n) do { \
127 long tmp_n = (n);\
128 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
129 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
130} while (0)
131#endif
132
/* Store `n` as the length of `str`, writing to the embedded or the heap
 * representation depending on STR_EMBED_P(str). */
133#define STR_SET_LEN(str, n) do { \
134 if (STR_EMBED_P(str)) {\
135 STR_SET_EMBED_LEN((str), (n));\
136 }\
137 else {\
138 RSTRING(str)->as.heap.len = (n);\
139 }\
140} while (0)
141
/* Decrease the length of `str` by one, for either representation.
 * No check against a zero length is made here. */
142#define STR_DEC_LEN(str) do {\
143 if (STR_EMBED_P(str)) {\
144 long n = RSTRING_LEN(str);\
145 n--;\
146 STR_SET_EMBED_LEN((str), n);\
147 }\
148 else {\
149 RSTRING(str)->as.heap.len--;\
150 }\
151} while (0)
152
153static inline bool
154str_enc_fastpath(VALUE str)
155{
156 // The overwhelming majority of strings are in one of these 3 encodings.
157 switch (ENCODING_GET_INLINED(str)) {
158 case ENCINDEX_ASCII_8BIT:
159 case ENCINDEX_UTF_8:
160 case ENCINDEX_US_ASCII:
161 return true;
162 default:
163 return false;
164 }
165}
166
/* Terminator width in bytes for `str`: 1 for the fast-path encodings,
 * otherwise the minimum character length of the string's encoding. */
167#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write a terminator of `termlen` zero bytes at `ptr`.  The first byte is
 * always stored directly; memset is only used for wider terminators.
 * Both arguments are evaluated exactly once. */
168#define TERM_FILL(ptr, termlen) do {\
169 char *const term_fill_ptr = (ptr);\
170 const int term_fill_len = (termlen);\
171 *term_fill_ptr = '\0';\
172 if (UNLIKELY(term_fill_len > 1))\
173 memset(term_fill_ptr, 0, term_fill_len);\
174} while (0)
175
/* Resize the buffer of `str` to `capacity`, using the terminator width
 * computed from the string's own encoding (see RESIZE_CAPA_TERM). */
176#define RESIZE_CAPA(str,capacity) do {\
177 const int termlen = TERM_LEN(str);\
178 RESIZE_CAPA_TERM(str,capacity,termlen);\
179} while (0)
/*
 * Give `str` room for `capacity` content bytes plus `termlen` terminator
 * bytes.
 *
 * - Embedded string that already fits: nothing happens.
 * - Embedded string that does not fit: a heap buffer is allocated, the
 *   current RSTRING_LEN(str) bytes are copied over and the string is
 *   switched to the heap representation.
 * - Heap string: must not be shared (asserted); the buffer is reallocated
 *   in place and aux.capa is updated.
 *
 * The macro arguments are parenthesized wherever they take part in an
 * arithmetic expression so that compound argument expressions are safe.
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
199
/* Make `str` share the buffer of `shared_str`.  Nothing is done for fake
 * strings (STR_FAKESTR).  Otherwise: asserts that str's pointer lies within
 * shared_str's buffer, records shared_str in aux.shared through the write
 * barrier macro, sets STR_SHARED on str and STR_SHARED_ROOT on shared_str,
 * and additionally sets STR_BORROWED on a shared_str whose class is 0. */
200#define STR_SET_SHARED(str, shared_str) do { \
201 if (!FL_TEST(str, STR_FAKESTR)) { \
202 assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
203 assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
204 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
205 FL_SET((str), STR_SHARED); \
206 FL_SET((shared_str), STR_SHARED_ROOT); \
207 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
208 FL_SET_RAW((shared_str), STR_BORROWED); \
209 } \
210} while (0)
211
/* Raw heap buffer pointer of a non-embedded string. */
212#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Allocated size of the heap buffer: capacity plus terminator width
 * (aux.capa itself does not include the terminator). */
213#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
214/* TODO: include the terminator size in capa. */
215
/* Encoding object of `str`. */
216#define STR_ENC_GET(str) get_encoding(str)
217
/* SHARABLE_SUBSTRING_P decides whether a substring [beg, beg+len) of a
 * buffer ending at `end` is considered sharable.  Unless
 * SHARABLE_MIDDLE_SUBSTRING is enabled (default 0), only substrings that
 * reach the end of the buffer qualify. */
218#if !defined SHARABLE_MIDDLE_SUBSTRING
219# define SHARABLE_MIDDLE_SUBSTRING 0
220#endif
221#if !SHARABLE_MIDDLE_SUBSTRING
222#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
223#else
224#define SHARABLE_SUBSTRING_P(beg, len, end) 1
225#endif
226
227
228static inline long
229str_embed_capa(VALUE str)
230{
231#if USE_RVARGC
232 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
233#else
234 return RSTRING_EMBED_LEN_MAX + 1;
235#endif
236}
237
238bool
239rb_str_reembeddable_p(VALUE str)
240{
241 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
242}
243
244static inline size_t
245rb_str_embed_size(long capa)
246{
247 return offsetof(struct RString, as.embed.ary) + capa;
248}
249
250size_t
251rb_str_size_as_embedded(VALUE str)
252{
253 size_t real_size;
254#if USE_RVARGC
255 if (STR_EMBED_P(str)) {
256 real_size = rb_str_embed_size(RSTRING(str)->as.embed.len) + TERM_LEN(str);
257 }
258 /* if the string is not currently embedded, but it can be embedded, how
259 * much space would it require */
260 else if (rb_str_reembeddable_p(str)) {
261 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
262 }
263 else {
264#endif
265 real_size = sizeof(struct RString);
266#if USE_RVARGC
267 }
268#endif
269 return real_size;
270}
271
272static inline bool
273STR_EMBEDDABLE_P(long len, long termlen)
274{
275#if USE_RVARGC
276 return rb_gc_size_allocatable_p(rb_str_embed_size(len + termlen));
277#else
278 return len <= RSTRING_EMBED_LEN_MAX + 1 - termlen;
279#endif
280}
281
282static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
283static VALUE str_new_frozen(VALUE klass, VALUE orig);
284static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
285static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
286static VALUE str_new(VALUE klass, const char *ptr, long len);
287static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
288static inline void str_modifiable(VALUE str);
289static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
290
291static inline void
292str_make_independent(VALUE str)
293{
294 long len = RSTRING_LEN(str);
295 int termlen = TERM_LEN(str);
296 str_make_independent_expand((str), len, 0L, termlen);
297}
298
299static inline int str_dependent_p(VALUE str);
300
301void
302rb_str_make_independent(VALUE str)
303{
304 if (str_dependent_p(str)) {
305 str_make_independent(str);
306 }
307}
308
/*
 * Convert the heap string `str` into an embedded one, in place.
 * The caller must pass a string that is re-embeddable and not already
 * embedded (both are asserted).
 */
309void
310rb_str_make_embedded(VALUE str)
311{
312 RUBY_ASSERT(rb_str_reembeddable_p(str));
313 RUBY_ASSERT(!STR_EMBED_P(str));
314
/* Read the heap fields first; they are needed after the flag switch.
 * NOTE(review): presumably the embed area overlays these fields -- confirm. */
315 char *buf = RSTRING(str)->as.heap.ptr;
316 long len = RSTRING(str)->as.heap.len;
317
318 STR_SET_EMBED(str);
319 STR_SET_EMBED_LEN(str, len);
320
/* The old buffer is copied and released only for a non-empty string.
 * NOTE(review): for len == 0 the heap buffer is not freed here -- confirm
 * that this is intended. */
321 if (len > 0) {
322 memcpy(RSTRING_PTR(str), buf, len);
323 ruby_xfree(buf);
324 }
325
/* Terminate the embedded data with the encoding's terminator width. */
326 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
327}
328
/*
 * Re-point the heap pointer of the shared string `str` after its shared
 * root changed from `old_root` to `new_root`.  The byte offset of str's
 * pointer inside old_root's embedded array is preserved relative to
 * new_root's embedded array.
 */
329void
330rb_str_update_shared_ary(VALUE str, VALUE old_root, VALUE new_root)
331{
332 // if the root location hasn't changed, we don't need to update
333 if (new_root == old_root) {
334 return;
335 }
336
337 // if the root string isn't embedded, we don't need to touch the pointer.
338 // it already points to the same shared buffer
339 if (!STR_EMBED_P(new_root)) {
340 return;
341 }
342
/* Offset of str's data from the start of the old root's embedded array. */
343 size_t offset = (size_t)((uintptr_t)RSTRING(str)->as.heap.ptr - (uintptr_t)RSTRING(old_root)->as.embed.ary);
344
345 RUBY_ASSERT(RSTRING(str)->as.heap.ptr >= RSTRING(old_root)->as.embed.ary);
346 RSTRING(str)->as.heap.ptr = RSTRING(new_root)->as.embed.ary + offset;
347}
348
/*
 * Debugging aid: print a warning to stderr saying that `func` is about to
 * return NULL.  Performs no other action.
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    FILE *const out = stderr;

    fprintf(out,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
358
359/* symbols for [up|down|swap]case/capitalize options */
360static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
361
362static rb_encoding *
363get_encoding(VALUE str)
364{
365 return rb_enc_from_index(ENCODING_GET(str));
366}
367
368static void
369mustnot_broken(VALUE str)
370{
371 if (is_broken_string(str)) {
372 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
373 }
374}
375
376static void
377mustnot_wchar(VALUE str)
378{
379 rb_encoding *enc = STR_ENC_GET(str);
380 if (rb_enc_mbminlen(enc) > 1) {
381 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
382 }
383}
384
385static int fstring_cmp(VALUE a, VALUE b);
386
387static VALUE register_fstring(VALUE str, bool copy);
388
389const struct st_hash_type rb_fstring_hash_type = {
390 fstring_cmp,
392};
393
/* True when `str` has no FL_EXIVAR flag set and its class is exactly
 * rb_cString. */
394#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
395
397 VALUE fstr;
398 bool copy;
399};
400
/*
 * st_update() callback used by register_fstring().
 *
 * `data` points to a struct fstr_update_arg.  When the key already exists,
 * the stored string is reported through arg->fstr (or Qundef plus
 * ST_DELETE if that entry is a garbage object).  When it does not exist, a
 * frozen string suitable for the table is prepared, flagged RSTRING_FSTR,
 * and stored as both key and value.
 */
401static int
402fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
403{
404
405 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
406 VALUE str = (VALUE)*key;
407
408 if (existing) {
409 /* because of lazy sweep, str may be unmarked already and swept
410 * at next time */
411
412 if (rb_objspace_garbage_object_p(str)) {
/* Qundef tells the caller to run the lookup again. */
413 arg->fstr = Qundef;
414 return ST_DELETE;
415 }
416
417 arg->fstr = str;
418 return ST_STOP;
419 }
420 else {
/* A fake string cannot be stored itself: create a real string, either
 * copying the bytes (arg->copy) or referencing them as static memory. */
421 if (FL_TEST_RAW(str, STR_FAKESTR)) {
422 if (arg->copy) {
423 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->as.heap.len);
424 rb_enc_copy(new_str, str);
425 str = new_str;
426 }
427 else {
428 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
429 RSTRING(str)->as.heap.len,
430 ENCODING_GET(str));
431 }
432 OBJ_FREEZE_RAW(str);
433 }
434 else {
/* Real string: ensure it is frozen, not a shared substring, and a bare
 * String (see BARE_STRING_P), creating frozen copies as needed. */
435 if (!OBJ_FROZEN(str))
436 str = str_new_frozen(rb_cString, str);
437 if (STR_SHARED_P(str)) { /* str should not be shared */
438 /* shared substring */
439 str_make_independent(str);
440 assert(OBJ_FROZEN(str));
441 }
442 if (!BARE_STRING_P(str)) {
443 str = str_new_frozen(rb_cString, str);
444 }
445 }
446 RBASIC(str)->flags |= RSTRING_FSTR;
447
448 *key = *value = arg->fstr = str;
449 return ST_CONTINUE;
450 }
451}
452
/*
 * Return the fstring-table representative for `str` (which must be a
 * T_STRING).
 *
 * - A string already flagged RSTRING_FSTR is returned as is.
 * - A non-bare string (see BARE_STRING_P) is never replaced by another
 *   object: it is frozen and returned itself, after being re-pointed at the
 *   registered string's buffer when it is neither embedded nor already a
 *   shared root.
 * - A bare string yields the registered table entry.
 */
453RUBY_FUNC_EXPORTED
454VALUE
455rb_fstring(VALUE str)
456{
457 VALUE fstr;
458 int bare;
459
460 Check_Type(str, T_STRING);
461
462 if (FL_TEST(str, RSTRING_FSTR))
463 return str;
464
465 bare = BARE_STRING_P(str);
466 if (!bare) {
467 if (STR_EMBED_P(str)) {
468 OBJ_FREEZE_RAW(str);
469 return str;
470 }
/* Non-embedded shared root that is not itself shared: returned unchanged. */
471 if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
472 assert(OBJ_FROZEN(str));
473 return str;
474 }
475 }
476
/* Resize an unfrozen string to its own current length before registering.
 * NOTE(review): presumably to drop spare capacity -- confirm. */
477 if (!OBJ_FROZEN(str))
478 rb_str_resize(str, RSTRING_LEN(str));
479
480 fstr = register_fstring(str, FALSE);
481
482 if (!bare) {
483 str_replace_shared_without_enc(str, fstr);
484 OBJ_FREEZE_RAW(str);
485 return str;
486 }
487 return fstr;
488}
489
/*
 * Look up or insert `str` in the VM's fstring table and return the table's
 * string.  `copy` is forwarded to fstr_update_callback() and selects
 * whether a fake string's bytes are copied.  The table update runs between
 * RB_VM_LOCK_ENTER() and RB_VM_LOCK_LEAVE().
 */
490static VALUE
491register_fstring(VALUE str, bool copy)
492{
493 struct fstr_update_arg args;
494 args.copy = copy;
495
496 RB_VM_LOCK_ENTER();
497 {
498 st_table *frozen_strings = rb_vm_fstring_table();
/* The callback reports Qundef after deleting a garbage entry; repeat the
 * update until it yields a usable string. */
499 do {
500 args.fstr = str;
501 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
502 } while (UNDEF_P(args.fstr));
503 }
504 RB_VM_LOCK_LEAVE();
505
/* Sanity checks on the result: frozen, real, without FL_EXIVAR, and of class String. */
506 assert(OBJ_FROZEN(args.fstr));
507 assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
508 assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
509 assert(RBASIC_CLASS(args.fstr) == rb_cString);
510 return args.fstr;
511}
512
513static VALUE
514setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
515{
516 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
517 /* SHARED to be allocated by the callback */
518
519 if (!name) {
520 RUBY_ASSERT_ALWAYS(len == 0);
521 name = "";
522 }
523
524 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
525
526 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
527 fake_str->as.heap.len = len;
528 fake_str->as.heap.ptr = (char *)name;
529 fake_str->as.heap.aux.capa = len;
530 return (VALUE)fake_str;
531}
532
533/*
534 * set up a fake string which refers a static string literal.
535 */
536VALUE
537rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
538{
539 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
540}
541
542/*
543 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
544 * shared string which refers a static string literal. `ptr` must
545 * point a constant string.
546 */
547MJIT_FUNC_EXPORTED VALUE
548rb_fstring_new(const char *ptr, long len)
549{
550 struct RString fake_str;
551 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
552}
553
554VALUE
555rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
556{
557 struct RString fake_str;
558 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
559}
560
561VALUE
562rb_fstring_cstr(const char *ptr)
563{
564 return rb_fstring_new(ptr, strlen(ptr));
565}
566
567static int
568fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
569{
570 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
571 return ST_CONTINUE;
572}
573
574static int
575fstring_cmp(VALUE a, VALUE b)
576{
577 long alen, blen;
578 const char *aptr, *bptr;
579 RSTRING_GETMEM(a, aptr, alen);
580 RSTRING_GETMEM(b, bptr, blen);
581 return (alen != blen ||
582 ENCODING_GET(a) != ENCODING_GET(b) ||
583 memcmp(aptr, bptr, alen) != 0);
584}
585
586static inline int
587single_byte_optimizable(VALUE str)
588{
589 rb_encoding *enc;
590
591 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
593 return 1;
594
595 enc = STR_ENC_GET(str);
596 if (rb_enc_mbmaxlen(enc) == 1)
597 return 1;
598
599 /* Conservative. Possibly single byte.
600 * "\xa1" in Shift_JIS for example. */
601 return 0;
602}
603
605
/*
 * Find the first byte with the high bit (0x80) set in the range [p, e).
 * Returns a pointer to that byte, or NULL when every byte is below 0x80.
 *
 * The range is scanned in three phases: leading bytes up to a word
 * boundary (only when unaligned word access is not allowed), whole words
 * tested against NONASCII_MASK, and finally the remaining tail bytes.
 */
606static inline const char *
607search_nonascii(const char *p, const char *e)
608{
609 const uintptr_t *s, *t;
610
/* NONASCII_MASK has 0x80 set in every byte of a uintptr_t. */
611#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
612# if SIZEOF_UINTPTR_T == 8
613# define NONASCII_MASK UINT64_C(0x8080808080808080)
614# elif SIZEOF_UINTPTR_T == 4
615# define NONASCII_MASK UINT32_C(0x80808080)
616# else
617# error "don't know what to do."
618# endif
619#else
620# if SIZEOF_UINTPTR_T == 8
621# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
622# elif SIZEOF_UINTPTR_T == 4
623# define NONASCII_MASK 0x80808080UL /* or...? */
624# else
625# error "don't know what to do."
626# endif
627#endif
628
/* Word-wise scanning is used when unaligned access is allowed or the
 * range is at least one pointer-size long. */
629 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
630#if !UNALIGNED_WORD_ACCESS
/* Advance p to the next word boundary and test the l skipped bytes one by
 * one; the cases deliberately fall through from the first skipped byte to
 * the last. */
631 if ((uintptr_t)p % SIZEOF_VOIDP) {
632 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
633 p += l;
634 switch (l) {
635 default: UNREACHABLE;
636#if SIZEOF_VOIDP > 4
637 case 7: if (p[-7]&0x80) return p-7;
638 case 6: if (p[-6]&0x80) return p-6;
639 case 5: if (p[-5]&0x80) return p-5;
640 case 4: if (p[-4]&0x80) return p-4;
641#endif
642 case 3: if (p[-3]&0x80) return p-3;
643 case 2: if (p[-2]&0x80) return p-2;
644 case 1: if (p[-1]&0x80) return p-1;
645 case 0: break;
646 }
647 }
648#endif
649#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
650#define aligned_ptr(value) \
651 __builtin_assume_aligned((value), sizeof(uintptr_t))
652#else
653#define aligned_ptr(value) (uintptr_t *)(value)
654#endif
655 s = aligned_ptr(p);
/* t is the limit for starting a full word read that stays inside [p, e). */
656 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
657#undef aligned_ptr
658 for (;s < t; s++) {
659 if (*s & NONASCII_MASK) {
/* Turn the position of the first set mask bit into a byte offset (>>3).
 * NOTE(review): nlz/ntz presumably count leading/trailing zero bits. */
660#ifdef WORDS_BIGENDIAN
661 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
662#else
663 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
664#endif
665 }
666 }
667 p = (const char *)s;
668 }
669
/* Test the remaining e - p tail bytes, again with fallthrough cases. */
670 switch (e - p) {
671 default: UNREACHABLE;
672#if SIZEOF_VOIDP > 4
673 case 7: if (e[-7]&0x80) return e-7;
674 case 6: if (e[-6]&0x80) return e-6;
675 case 5: if (e[-5]&0x80) return e-5;
676 case 4: if (e[-4]&0x80) return e-4;
677#endif
678 case 3: if (e[-3]&0x80) return e-3;
679 case 2: if (e[-2]&0x80) return e-2;
680 case 1: if (e[-1]&0x80) return e-1;
681 case 0: return NULL;
682 }
683}
684
685static int
686coderange_scan(const char *p, long len, rb_encoding *enc)
687{
688 const char *e = p + len;
689
690 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
691 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
692 p = search_nonascii(p, e);
694 }
695
696 if (rb_enc_asciicompat(enc)) {
697 p = search_nonascii(p, e);
698 if (!p) return ENC_CODERANGE_7BIT;
699 for (;;) {
700 int ret = rb_enc_precise_mbclen(p, e, enc);
702 p += MBCLEN_CHARFOUND_LEN(ret);
703 if (p == e) break;
704 p = search_nonascii(p, e);
705 if (!p) break;
706 }
707 }
708 else {
709 while (p < e) {
710 int ret = rb_enc_precise_mbclen(p, e, enc);
712 p += MBCLEN_CHARFOUND_LEN(ret);
713 }
714 }
715 return ENC_CODERANGE_VALID;
716}
717
718long
719rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
720{
721 const char *p = s;
722
723 if (*cr == ENC_CODERANGE_BROKEN)
724 return e - s;
725
726 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
727 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
728 if (*cr == ENC_CODERANGE_VALID) return e - s;
729 p = search_nonascii(p, e);
731 return e - s;
732 }
733 else if (rb_enc_asciicompat(enc)) {
734 p = search_nonascii(p, e);
735 if (!p) {
736 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
737 return e - s;
738 }
739 for (;;) {
740 int ret = rb_enc_precise_mbclen(p, e, enc);
741 if (!MBCLEN_CHARFOUND_P(ret)) {
743 return p - s;
744 }
745 p += MBCLEN_CHARFOUND_LEN(ret);
746 if (p == e) break;
747 p = search_nonascii(p, e);
748 if (!p) break;
749 }
750 }
751 else {
752 while (p < e) {
753 int ret = rb_enc_precise_mbclen(p, e, enc);
754 if (!MBCLEN_CHARFOUND_P(ret)) {
756 return p - s;
757 }
758 p += MBCLEN_CHARFOUND_LEN(ret);
759 }
760 }
762 return e - s;
763}
764
765static inline void
766str_enc_copy(VALUE str1, VALUE str2)
767{
768 rb_enc_set_index(str1, ENCODING_GET(str2));
769}
770
771static void
772rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
773{
774 /* this function is designed for copying encoding and coderange
775 * from src to new string "dest" which is made from the part of src.
776 */
777 str_enc_copy(dest, src);
778 if (RSTRING_LEN(dest) == 0) {
779 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
781 else
783 return;
784 }
785 switch (ENC_CODERANGE(src)) {
788 break;
790 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
791 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
793 else
795 break;
796 default:
797 break;
798 }
799}
800
801static void
802rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
803{
804 str_enc_copy(dest, src);
806}
807
808static int
809enc_coderange_scan(VALUE str, rb_encoding *enc)
810{
811 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
812}
813
814int
815rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
816{
817 return enc_coderange_scan(str, enc);
818}
819
820int
822{
823 int cr = ENC_CODERANGE(str);
824
825 if (cr == ENC_CODERANGE_UNKNOWN) {
826 cr = enc_coderange_scan(str, get_encoding(str));
827 ENC_CODERANGE_SET(str, cr);
828 }
829 return cr;
830}
831
832int
834{
835 rb_encoding *enc = STR_ENC_GET(str);
836
837 if (!rb_enc_asciicompat(enc))
838 return FALSE;
839 else if (is_ascii_string(str))
840 return TRUE;
841 return FALSE;
842}
843
844static inline void
845str_mod_check(VALUE s, const char *p, long len)
846{
847 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
848 rb_raise(rb_eRuntimeError, "string modified");
849 }
850}
851
852static size_t
853str_capacity(VALUE str, const int termlen)
854{
855 if (STR_EMBED_P(str)) {
856#if USE_RVARGC
857 return str_embed_capa(str) - termlen;
858#else
859 return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
860#endif
861 }
862 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
863 return RSTRING(str)->as.heap.len;
864 }
865 else {
866 return RSTRING(str)->as.heap.aux.capa;
867 }
868}
869
870size_t
872{
873 return str_capacity(str, TERM_LEN(str));
874}
875
876static inline void
877must_not_null(const char *ptr)
878{
879 if (!ptr) {
880 rb_raise(rb_eArgError, "NULL pointer given");
881 }
882}
883
884static inline VALUE
885str_alloc_embed(VALUE klass, size_t capa)
886{
887 size_t size = rb_str_embed_size(capa);
888 assert(size > 0);
889 assert(rb_gc_size_allocatable_p(size));
890#if !USE_RVARGC
891 assert(size <= sizeof(struct RString));
892#endif
893
894 RVARGC_NEWOBJ_OF(str, struct RString, klass,
896
897 return (VALUE)str;
898}
899
900static inline VALUE
901str_alloc_heap(VALUE klass)
902{
903 RVARGC_NEWOBJ_OF(str, struct RString, klass,
904 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString));
905
906 return (VALUE)str;
907}
908
909static inline VALUE
910empty_str_alloc(VALUE klass)
911{
912 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
913 VALUE str = str_alloc_embed(klass, 0);
914 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
915 return str;
916}
917
918static VALUE
919str_new0(VALUE klass, const char *ptr, long len, int termlen)
920{
921 VALUE str;
922
923 if (len < 0) {
924 rb_raise(rb_eArgError, "negative string size (or size too big)");
925 }
926
927 RUBY_DTRACE_CREATE_HOOK(STRING, len);
928
929 if (STR_EMBEDDABLE_P(len, termlen)) {
930 str = str_alloc_embed(klass, len + termlen);
931 if (len == 0) {
933 }
934 }
935 else {
936 str = str_alloc_heap(klass);
937 RSTRING(str)->as.heap.aux.capa = len;
938 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
939 * integer overflow. If we can STATIC_ASSERT that, the following
940 * mul_add_mul can be reverted to a simple ALLOC_N. */
941 RSTRING(str)->as.heap.ptr =
942 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
943 }
944 if (ptr) {
945 memcpy(RSTRING_PTR(str), ptr, len);
946 }
947 STR_SET_LEN(str, len);
948 TERM_FILL(RSTRING_PTR(str) + len, termlen);
949 return str;
950}
951
952static VALUE
953str_new(VALUE klass, const char *ptr, long len)
954{
955 return str_new0(klass, ptr, len, 1);
956}
957
958VALUE
959rb_str_new(const char *ptr, long len)
960{
961 return str_new(rb_cString, ptr, len);
962}
963
964VALUE
965rb_usascii_str_new(const char *ptr, long len)
966{
967 VALUE str = rb_str_new(ptr, len);
968 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
969 return str;
970}
971
972VALUE
973rb_utf8_str_new(const char *ptr, long len)
974{
975 VALUE str = str_new(rb_cString, ptr, len);
976 rb_enc_associate_index(str, rb_utf8_encindex());
977 return str;
978}
979
980VALUE
981rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
982{
983 VALUE str;
984
985 if (!enc) return rb_str_new(ptr, len);
986
987 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
988 rb_enc_associate(str, enc);
989 return str;
990}
991
992VALUE
994{
995 must_not_null(ptr);
996 /* rb_str_new_cstr() can take pointer from non-malloc-generated
997 * memory regions, and that cannot be detected by the MSAN. Just
998 * trust the programmer that the argument passed here is a sane C
999 * string. */
1000 __msan_unpoison_string(ptr);
1001 return rb_str_new(ptr, strlen(ptr));
1002}
1003
1004VALUE
1006{
1007 VALUE str = rb_str_new_cstr(ptr);
1008 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
1009 return str;
1010}
1011
1012VALUE
1014{
1015 VALUE str = rb_str_new_cstr(ptr);
1016 rb_enc_associate_index(str, rb_utf8_encindex());
1017 return str;
1018}
1019
1020VALUE
1021rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
1022{
1023 must_not_null(ptr);
1024 if (rb_enc_mbminlen(enc) != 1) {
1025 rb_raise(rb_eArgError, "wchar encoding given");
1026 }
1027 return rb_enc_str_new(ptr, strlen(ptr), enc);
1028}
1029
1030static VALUE
1031str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1032{
1033 VALUE str;
1034
1035 if (len < 0) {
1036 rb_raise(rb_eArgError, "negative string size (or size too big)");
1037 }
1038
1039 if (!ptr) {
1040 rb_encoding *enc = rb_enc_get_from_index(encindex);
1041 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1042 }
1043 else {
1044 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1045 str = str_alloc_heap(klass);
1046 RSTRING(str)->as.heap.len = len;
1047 RSTRING(str)->as.heap.ptr = (char *)ptr;
1048 RSTRING(str)->as.heap.aux.capa = len;
1049 RBASIC(str)->flags |= STR_NOFREE;
1050 }
1051 rb_enc_associate_index(str, encindex);
1052 return str;
1053}
1054
1055VALUE
1056rb_str_new_static(const char *ptr, long len)
1057{
1058 return str_new_static(rb_cString, ptr, len, 0);
1059}
1060
1061VALUE
1063{
1064 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1065}
1066
1067VALUE
1069{
1070 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1071}
1072
1073VALUE
1075{
1076 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1077}
1078
1079static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1080 rb_encoding *from, rb_encoding *to,
1081 int ecflags, VALUE ecopts);
1082
1083static inline bool
1084is_enc_ascii_string(VALUE str, rb_encoding *enc)
1085{
1086 int encidx = rb_enc_to_index(enc);
1087 if (rb_enc_get_index(str) == encidx)
1088 return is_ascii_string(str);
1089 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1090}
1091
1092VALUE
1093rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1094{
1095 long len;
1096 const char *ptr;
1097 VALUE newstr;
1098
1099 if (!to) return str;
1100 if (!from) from = rb_enc_get(str);
1101 if (from == to) return str;
1102 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1103 rb_is_ascii8bit_enc(to)) {
1104 if (STR_ENC_GET(str) != to) {
1105 str = rb_str_dup(str);
1106 rb_enc_associate(str, to);
1107 }
1108 return str;
1109 }
1110
1111 RSTRING_GETMEM(str, ptr, len);
1112 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1113 from, to, ecflags, ecopts);
1114 if (NIL_P(newstr)) {
1115 /* some error, return original */
1116 return str;
1117 }
1118 return newstr;
1119}
1120
1121VALUE
1122rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1123 rb_encoding *from, int ecflags, VALUE ecopts)
1124{
1125 long olen;
1126
1127 olen = RSTRING_LEN(newstr);
1128 if (ofs < -olen || olen < ofs)
1129 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1130 if (ofs < 0) ofs += olen;
1131 if (!from) {
1132 STR_SET_LEN(newstr, ofs);
1133 return rb_str_cat(newstr, ptr, len);
1134 }
1135
1136 rb_str_modify(newstr);
1137 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1138 rb_enc_get(newstr),
1139 ecflags, ecopts);
1140}
1141
1142VALUE
1143rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1144{
1145 STR_SET_LEN(str, 0);
1146 rb_enc_associate(str, enc);
1147 rb_str_cat(str, ptr, len);
1148 return str;
1149}
1150
1151static VALUE
1152str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1153 rb_encoding *from, rb_encoding *to,
1154 int ecflags, VALUE ecopts)
1155{
1156 rb_econv_t *ec;
1158 long olen;
1159 VALUE econv_wrapper;
1160 const unsigned char *start, *sp;
1161 unsigned char *dest, *dp;
1162 size_t converted_output = (size_t)ofs;
1163
1164 olen = rb_str_capacity(newstr);
1165
1166 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1167 RBASIC_CLEAR_CLASS(econv_wrapper);
1168 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1169 if (!ec) return Qnil;
1170 DATA_PTR(econv_wrapper) = ec;
1171
1172 sp = (unsigned char*)ptr;
1173 start = sp;
1174 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1175 (dp = dest + converted_output),
1176 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1178 /* destination buffer short */
1179 size_t converted_input = sp - start;
1180 size_t rest = len - converted_input;
1181 converted_output = dp - dest;
1182 rb_str_set_len(newstr, converted_output);
1183 if (converted_input && converted_output &&
1184 rest < (LONG_MAX / converted_output)) {
1185 rest = (rest * converted_output) / converted_input;
1186 }
1187 else {
1188 rest = olen;
1189 }
1190 olen += rest < 2 ? 2 : rest;
1191 rb_str_resize(newstr, olen);
1192 }
1193 DATA_PTR(econv_wrapper) = 0;
1194 rb_econv_close(ec);
1195 switch (ret) {
1196 case econv_finished:
1197 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1198 rb_str_set_len(newstr, len);
1199 rb_enc_associate(newstr, to);
1200 return newstr;
1201
1202 default:
1203 return Qnil;
1204 }
1205}
1206
1207VALUE
1209{
1210 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1211}
1212
1213VALUE
1215{
1216 rb_encoding *ienc;
1217 VALUE str;
1218 const int eidx = rb_enc_to_index(eenc);
1219
1220 if (!ptr) {
1221 return rb_enc_str_new(ptr, len, eenc);
1222 }
1223
1224 /* ASCII-8BIT case, no conversion */
1225 if ((eidx == rb_ascii8bit_encindex()) ||
1226 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1227 return rb_str_new(ptr, len);
1228 }
1229 /* no default_internal or same encoding, no conversion */
1230 ienc = rb_default_internal_encoding();
1231 if (!ienc || eenc == ienc) {
1232 return rb_enc_str_new(ptr, len, eenc);
1233 }
1234 /* ASCII compatible, and ASCII only string, no conversion in
1235 * default_internal */
1236 if ((eidx == rb_ascii8bit_encindex()) ||
1237 (eidx == rb_usascii_encindex()) ||
1238 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1239 return rb_enc_str_new(ptr, len, ienc);
1240 }
1241 /* convert from the given encoding to default_internal */
1242 str = rb_enc_str_new(NULL, 0, ienc);
1243 /* when the conversion failed for some reason, just ignore the
1244 * default_internal and result in the given encoding as-is. */
1245 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1246 rb_str_initialize(str, ptr, len, eenc);
1247 }
1248 return str;
1249}
1250
1251VALUE
1252rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1253{
1254 int eidx = rb_enc_to_index(eenc);
1255 if (eidx == rb_usascii_encindex() &&
1256 !is_ascii_string(str)) {
1257 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1258 return str;
1259 }
1260 rb_enc_associate_index(str, eidx);
1261 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1262}
1263
1264VALUE
1265rb_external_str_new(const char *ptr, long len)
1266{
1267 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1268}
1269
1270VALUE
1272{
1273 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1274}
1275
1276VALUE
1277rb_locale_str_new(const char *ptr, long len)
1278{
1279 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1280}
1281
1282VALUE
1284{
1285 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1286}
1287
1288VALUE
1290{
1291 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1292}
1293
1294VALUE
1296{
1297 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1298}
1299
1300VALUE
1302{
1303 return rb_str_export_to_enc(str, rb_default_external_encoding());
1304}
1305
1306VALUE
1308{
1309 return rb_str_export_to_enc(str, rb_locale_encoding());
1310}
1311
1312VALUE
1314{
1315 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1316}
1317
1318static VALUE
1319str_replace_shared_without_enc(VALUE str2, VALUE str)
1320{
1321 const int termlen = TERM_LEN(str);
1322 char *ptr;
1323 long len;
1324
1325 RSTRING_GETMEM(str, ptr, len);
1326 if (str_embed_capa(str2) >= len + termlen) {
1327 char *ptr2 = RSTRING(str2)->as.embed.ary;
1328 STR_SET_EMBED(str2);
1329 memcpy(ptr2, RSTRING_PTR(str), len);
1330 STR_SET_EMBED_LEN(str2, len);
1331 TERM_FILL(ptr2+len, termlen);
1332 }
1333 else {
1334 VALUE root;
1335 if (STR_SHARED_P(str)) {
1336 root = RSTRING(str)->as.heap.aux.shared;
1337 RSTRING_GETMEM(str, ptr, len);
1338 }
1339 else {
1340 root = rb_str_new_frozen(str);
1341 RSTRING_GETMEM(root, ptr, len);
1342 }
1343 assert(OBJ_FROZEN(root));
1344 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1345 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1346 rb_fatal("about to free a possible shared root");
1347 }
1348 char *ptr2 = STR_HEAP_PTR(str2);
1349 if (ptr2 != ptr) {
1350 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1351 }
1352 }
1353 FL_SET(str2, STR_NOEMBED);
1354 RSTRING(str2)->as.heap.len = len;
1355 RSTRING(str2)->as.heap.ptr = ptr;
1356 STR_SET_SHARED(str2, root);
1357 }
1358 return str2;
1359}
1360
1361static VALUE
1362str_replace_shared(VALUE str2, VALUE str)
1363{
1364 str_replace_shared_without_enc(str2, str);
1365 rb_enc_cr_str_exact_copy(str2, str);
1366 return str2;
1367}
1368
1369static VALUE
1370str_new_shared(VALUE klass, VALUE str)
1371{
1372 return str_replace_shared(str_alloc_heap(klass), str);
1373}
1374
1375VALUE
1377{
1378 return str_new_shared(rb_obj_class(str), str);
1379}
1380
1381VALUE
1383{
1384 if (OBJ_FROZEN(orig)) return orig;
1385 return str_new_frozen(rb_obj_class(orig), orig);
1386}
1387
1388static VALUE
1389rb_str_new_frozen_String(VALUE orig)
1390{
1391 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1392 return str_new_frozen(rb_cString, orig);
1393}
1394
1395VALUE
1396rb_str_tmp_frozen_acquire(VALUE orig)
1397{
1398 if (OBJ_FROZEN_RAW(orig)) return orig;
1399 return str_new_frozen_buffer(0, orig, FALSE);
1400}
1401
1402void
1403rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1404{
1405 if (RBASIC_CLASS(tmp) != 0)
1406 return;
1407
1408 if (STR_EMBED_P(tmp)) {
1409 assert(OBJ_FROZEN_RAW(tmp));
1410 }
1411 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1412 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1413 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1414
1415 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1416 assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1417 assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1418
1419 /* Unshare orig since the root (tmp) only has this one child. */
1420 FL_UNSET_RAW(orig, STR_SHARED);
1421 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1422 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1423 assert(OBJ_FROZEN_RAW(tmp));
1424
1425 /* Make tmp embedded and empty so it is safe for sweeping. */
1426 STR_SET_EMBED(tmp);
1427 STR_SET_EMBED_LEN(tmp, 0);
1428 }
1429 }
1430}
1431
1432static VALUE
1433str_new_frozen(VALUE klass, VALUE orig)
1434{
1435 return str_new_frozen_buffer(klass, orig, TRUE);
1436}
1437
1438static VALUE
1439heap_str_make_shared(VALUE klass, VALUE orig)
1440{
1441 assert(!STR_EMBED_P(orig));
1442 assert(!STR_SHARED_P(orig));
1443
1444 VALUE str = str_alloc_heap(klass);
1445 RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1446 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1447 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1448 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1449 RBASIC(orig)->flags &= ~STR_NOFREE;
1450 STR_SET_SHARED(orig, str);
1451 if (klass == 0)
1452 FL_UNSET_RAW(str, STR_BORROWED);
1453 return str;
1454}
1455
1456static VALUE
1457str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1458{
1459 VALUE str;
1460
1461 long len = RSTRING_LEN(orig);
1462 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1463
1464 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1465 str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
1466 assert(STR_EMBED_P(str));
1467 }
1468 else {
1469 if (FL_TEST_RAW(orig, STR_SHARED)) {
1470 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1471 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1472 long rest = RSTRING_LEN(shared) - ofs - RSTRING(orig)->as.heap.len;
1473 assert(ofs >= 0);
1474 assert(rest >= 0);
1475 assert(ofs + rest <= RSTRING_LEN(shared));
1476#if !USE_RVARGC
1477 assert(!STR_EMBED_P(shared));
1478#endif
1479 assert(OBJ_FROZEN(shared));
1480
1481 if ((ofs > 0) || (rest > 0) ||
1482 (klass != RBASIC(shared)->klass) ||
1483 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1484 str = str_new_shared(klass, shared);
1485 assert(!STR_EMBED_P(str));
1486 RSTRING(str)->as.heap.ptr += ofs;
1487 RSTRING(str)->as.heap.len -= ofs + rest;
1488 }
1489 else {
1490 if (RBASIC_CLASS(shared) == 0)
1491 FL_SET_RAW(shared, STR_BORROWED);
1492 return shared;
1493 }
1494 }
1495 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1496 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1497 STR_SET_EMBED(str);
1498 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1499 STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
1500 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1501 }
1502 else {
1503 str = heap_str_make_shared(klass, orig);
1504 }
1505 }
1506
1507 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1508 OBJ_FREEZE(str);
1509 return str;
1510}
1511
1512VALUE
1513rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1514{
1515 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1516}
1517
1518static VALUE
1519str_new_empty_String(VALUE str)
1520{
1521 VALUE v = rb_str_new(0, 0);
1522 rb_enc_copy(v, str);
1523 return v;
1524}
1525
1526#define STR_BUF_MIN_SIZE 63
1527#if !USE_RVARGC
1528STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX);
1529#endif
1530
1531VALUE
1533{
1534 if (STR_EMBEDDABLE_P(capa, 1)) {
1535 return str_alloc_embed(rb_cString, capa + 1);
1536 }
1537
1538 VALUE str = str_alloc_heap(rb_cString);
1539
1540#if !USE_RVARGC
1541 if (capa < STR_BUF_MIN_SIZE) {
1542 capa = STR_BUF_MIN_SIZE;
1543 }
1544#endif
1545 RSTRING(str)->as.heap.aux.capa = capa;
1546 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1547 RSTRING(str)->as.heap.ptr[0] = '\0';
1548
1549 return str;
1550}
1551
1552VALUE
1554{
1555 VALUE str;
1556 long len = strlen(ptr);
1557
1558 str = rb_str_buf_new(len);
1559 rb_str_buf_cat(str, ptr, len);
1560
1561 return str;
1562}
1563
1564VALUE
1566{
1567 return str_new(0, 0, len);
1568}
1569
1570void
1572{
1573 if (FL_TEST(str, RSTRING_FSTR)) {
1574 st_data_t fstr = (st_data_t)str;
1575
1576 RB_VM_LOCK_ENTER();
1577 {
1578 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1579 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1580 }
1581 RB_VM_LOCK_LEAVE();
1582 }
1583
1584 if (STR_EMBED_P(str)) {
1585 RB_DEBUG_COUNTER_INC(obj_str_embed);
1586 }
1587 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1588 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1589 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1590 }
1591 else {
1592 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1593 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1594 }
1595}
1596
1597RUBY_FUNC_EXPORTED size_t
1598rb_str_memsize(VALUE str)
1599{
1600 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1601 return STR_HEAP_SIZE(str);
1602 }
1603 else {
1604 return 0;
1605 }
1606}
1607
1608VALUE
1610{
1611 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1612}
1613
1614static inline void str_discard(VALUE str);
1615static void str_shared_replace(VALUE str, VALUE str2);
1616
1617void
1619{
1620 if (str != str2) str_shared_replace(str, str2);
1621}
1622
1623static void
1624str_shared_replace(VALUE str, VALUE str2)
1625{
1626 rb_encoding *enc;
1627 int cr;
1628 int termlen;
1629
1630 RUBY_ASSERT(str2 != str);
1631 enc = STR_ENC_GET(str2);
1632 cr = ENC_CODERANGE(str2);
1633 str_discard(str);
1634 termlen = rb_enc_mbminlen(enc);
1635
1636 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1637 STR_SET_EMBED(str);
1638 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1639 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
1640 rb_enc_associate(str, enc);
1641 ENC_CODERANGE_SET(str, cr);
1642 }
1643 else {
1644#if USE_RVARGC
1645 if (STR_EMBED_P(str2)) {
1646 assert(!FL_TEST(str2, STR_SHARED));
1647 long len = RSTRING(str2)->as.embed.len;
1648 assert(len + termlen <= str_embed_capa(str2));
1649
1650 char *new_ptr = ALLOC_N(char, len + termlen);
1651 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1652 RSTRING(str2)->as.heap.ptr = new_ptr;
1653 RSTRING(str2)->as.heap.len = len;
1654 RSTRING(str2)->as.heap.aux.capa = len;
1655 STR_SET_NOEMBED(str2);
1656 }
1657#endif
1658
1659 STR_SET_NOEMBED(str);
1660 FL_UNSET(str, STR_SHARED);
1661 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1662 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1663
1664 if (FL_TEST(str2, STR_SHARED)) {
1665 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1666 STR_SET_SHARED(str, shared);
1667 }
1668 else {
1669 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1670 }
1671
1672 /* abandon str2 */
1673 STR_SET_EMBED(str2);
1674 RSTRING_PTR(str2)[0] = 0;
1675 STR_SET_EMBED_LEN(str2, 0);
1676 rb_enc_associate(str, enc);
1677 ENC_CODERANGE_SET(str, cr);
1678 }
1679}
1680
1681VALUE
1683{
1684 VALUE str;
1685
1686 if (RB_TYPE_P(obj, T_STRING)) {
1687 return obj;
1688 }
1689 str = rb_funcall(obj, idTo_s, 0);
1690 return rb_obj_as_string_result(str, obj);
1691}
1692
1693MJIT_FUNC_EXPORTED VALUE
1694rb_obj_as_string_result(VALUE str, VALUE obj)
1695{
1696 if (!RB_TYPE_P(str, T_STRING))
1697 return rb_any_to_s(obj);
1698 return str;
1699}
1700
1701static VALUE
1702str_replace(VALUE str, VALUE str2)
1703{
1704 long len;
1705
1706 len = RSTRING_LEN(str2);
1707 if (STR_SHARED_P(str2)) {
1708 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1709 assert(OBJ_FROZEN(shared));
1710 STR_SET_NOEMBED(str);
1711 RSTRING(str)->as.heap.len = len;
1712 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1713 STR_SET_SHARED(str, shared);
1714 rb_enc_cr_str_exact_copy(str, str2);
1715 }
1716 else {
1717 str_replace_shared(str, str2);
1718 }
1719
1720 return str;
1721}
1722
1723static inline VALUE
1724ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1725{
1726 size_t size = rb_str_embed_size(capa);
1727 assert(size > 0);
1728 assert(rb_gc_size_allocatable_p(size));
1729#if !USE_RVARGC
1730 assert(size <= sizeof(struct RString));
1731#endif
1732
1733 RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1735
1736 return (VALUE)str;
1737}
1738
1739static inline VALUE
1740ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1741{
1742 RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1743 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString));
1744
1745 return (VALUE)str;
1746}
1747
1748static inline VALUE
1749str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1750{
1751 const VALUE flag_mask =
1752#if !USE_RVARGC
1753 RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
1754#endif
1756 FL_FREEZE
1757 ;
1758 VALUE flags = FL_TEST_RAW(str, flag_mask);
1759 int encidx = 0;
1760 if (STR_EMBED_P(str)) {
1761 long len = RSTRING_EMBED_LEN(str);
1762
1763 assert(STR_EMBED_P(dup));
1764 assert(str_embed_capa(dup) >= len + 1);
1765 STR_SET_EMBED_LEN(dup, len);
1766 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1767 }
1768 else {
1769 VALUE root = str;
1770 if (FL_TEST_RAW(str, STR_SHARED)) {
1771 root = RSTRING(str)->as.heap.aux.shared;
1772 }
1773 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1774 root = str = str_new_frozen(klass, str);
1775 flags = FL_TEST_RAW(str, flag_mask);
1776 }
1777 assert(!STR_SHARED_P(root));
1778 assert(RB_OBJ_FROZEN_RAW(root));
1779 if (0) {}
1780#if !USE_RVARGC
1781 else if (STR_EMBED_P(root)) {
1782 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(root)->as.embed.ary,
1783 char, RSTRING_EMBED_LEN_MAX + 1);
1784 FL_UNSET(dup, STR_NOEMBED);
1785 }
1786#endif
1787 else {
1788 RSTRING(dup)->as.heap.len = RSTRING_LEN(str);
1789 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1790 FL_SET(root, STR_SHARED_ROOT);
1791 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1792 flags |= RSTRING_NOEMBED | STR_SHARED;
1793 }
1794 }
1795
1796 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1797 encidx = rb_enc_get_index(str);
1798 flags &= ~ENCODING_MASK;
1799 }
1800 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1801 if (encidx) rb_enc_associate_index(dup, encidx);
1802 return dup;
1803}
1804
1805static inline VALUE
1806ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1807{
1808 VALUE dup;
1809 if (FL_TEST(str, STR_NOEMBED)) {
1810 dup = ec_str_alloc_heap(ec, klass);
1811 }
1812 else {
1813 dup = ec_str_alloc_embed(ec, klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1814 }
1815
1816 return str_duplicate_setup(klass, str, dup);
1817}
1818
1819static inline VALUE
1820str_duplicate(VALUE klass, VALUE str)
1821{
1822 VALUE dup;
1823 if (FL_TEST(str, STR_NOEMBED)) {
1824 dup = str_alloc_heap(klass);
1825 }
1826 else {
1827 dup = str_alloc_embed(klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1828 }
1829
1830 return str_duplicate_setup(klass, str, dup);
1831}
1832
1833VALUE
1835{
1836 return str_duplicate(rb_obj_class(str), str);
1837}
1838
1839VALUE
1841{
1842 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1843 return str_duplicate(rb_cString, str);
1844}
1845
1846VALUE
1847rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1848{
1849 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1850 return ec_str_duplicate(ec, rb_cString, str);
1851}
1852
1853/*
1854 *
1855 * call-seq:
1856 * String.new(string = '', **opts) -> new_string
1857 *
1858 * :include: doc/string/new.rdoc
1859 *
1860 */
1861
1862static VALUE
1863rb_str_init(int argc, VALUE *argv, VALUE str)
1864{
1865 static ID keyword_ids[2];
1866 VALUE orig, opt, venc, vcapa;
1867 VALUE kwargs[2];
1868 rb_encoding *enc = 0;
1869 int n;
1870
1871 if (!keyword_ids[0]) {
1872 keyword_ids[0] = rb_id_encoding();
1873 CONST_ID(keyword_ids[1], "capacity");
1874 }
1875
1876 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1877 if (!NIL_P(opt)) {
1878 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1879 venc = kwargs[0];
1880 vcapa = kwargs[1];
1881 if (!UNDEF_P(venc) && !NIL_P(venc)) {
1882 enc = rb_to_encoding(venc);
1883 }
1884 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
1885 long capa = NUM2LONG(vcapa);
1886 long len = 0;
1887 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1888
1889 if (capa < STR_BUF_MIN_SIZE) {
1890 capa = STR_BUF_MIN_SIZE;
1891 }
1892 if (n == 1) {
1893 StringValue(orig);
1894 len = RSTRING_LEN(orig);
1895 if (capa < len) {
1896 capa = len;
1897 }
1898 if (orig == str) n = 0;
1899 }
1900 str_modifiable(str);
1901 if (STR_EMBED_P(str)) { /* make noembed always */
1902 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1903#if USE_RVARGC
1904 assert(RSTRING(str)->as.embed.len + 1 <= str_embed_capa(str));
1905 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING(str)->as.embed.len + 1);
1906#else
1907 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING_EMBED_LEN_MAX + 1);
1908#endif
1909 RSTRING(str)->as.heap.ptr = new_ptr;
1910 }
1911 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1912 const size_t size = (size_t)capa + termlen;
1913 const char *const old_ptr = RSTRING_PTR(str);
1914 const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1915 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1916 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1917 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1918 RSTRING(str)->as.heap.ptr = new_ptr;
1919 }
1920 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1921 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1922 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1923 }
1924 RSTRING(str)->as.heap.len = len;
1925 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1926 if (n == 1) {
1927 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1928 rb_enc_cr_str_exact_copy(str, orig);
1929 }
1930 FL_SET(str, STR_NOEMBED);
1931 RSTRING(str)->as.heap.aux.capa = capa;
1932 }
1933 else if (n == 1) {
1934 rb_str_replace(str, orig);
1935 }
1936 if (enc) {
1937 rb_enc_associate(str, enc);
1939 }
1940 }
1941 else if (n == 1) {
1942 rb_str_replace(str, orig);
1943 }
1944 return str;
1945}
1946
#ifdef NONASCII_MASK
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Counts the UTF-8 leading bytes contained in the machine word at *s.
 *
 * A byte is a leading byte when it looks like 0xxxxxxx or 11xxxxxx,
 * i.e. when it is not a 10xxxxxx continuation byte.  Per byte this is
 *
 *     lead = bit6 | ~bit7
 *
 * The function evaluates that expression for every byte of the word at
 * once, leaving the answer in bit0 of each byte lane, and then sums the
 * lanes.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t lanes = *s;

    /* bit0 of every byte lane := (bit6 | ~bit7) of that byte */
    lanes = (lanes >> 6) | (~lanes >> 7);
    lanes &= NONASCII_MASK >> 7;

    /* Add up the per-lane flags. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(lanes);
#else
    lanes += lanes >> 8;
    lanes += lanes >> 16;
# if SIZEOF_VOIDP == 8
    lanes += lanes >> 32;
# endif
    return lanes & 0xF;
#endif
}
#endif
1986
1987static inline long
1988enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
1989{
1990 long c;
1991 const char *q;
1992
1993 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1994 long diff = (long)(e - p);
1995 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1996 }
1997#ifdef NONASCII_MASK
1998 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
1999 uintptr_t len = 0;
2000 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2001 const uintptr_t *s, *t;
2002 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2003 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2004 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2005 while (p < (const char *)s) {
2006 if (is_utf8_lead_byte(*p)) len++;
2007 p++;
2008 }
2009 while (s < t) {
2010 len += count_utf8_lead_bytes_with_word(s);
2011 s++;
2012 }
2013 p = (const char *)s;
2014 }
2015 while (p < e) {
2016 if (is_utf8_lead_byte(*p)) len++;
2017 p++;
2018 }
2019 return (long)len;
2020 }
2021#endif
2022 else if (rb_enc_asciicompat(enc)) {
2023 c = 0;
2024 if (ENC_CODERANGE_CLEAN_P(cr)) {
2025 while (p < e) {
2026 if (ISASCII(*p)) {
2027 q = search_nonascii(p, e);
2028 if (!q)
2029 return c + (e - p);
2030 c += q - p;
2031 p = q;
2032 }
2033 p += rb_enc_fast_mbclen(p, e, enc);
2034 c++;
2035 }
2036 }
2037 else {
2038 while (p < e) {
2039 if (ISASCII(*p)) {
2040 q = search_nonascii(p, e);
2041 if (!q)
2042 return c + (e - p);
2043 c += q - p;
2044 p = q;
2045 }
2046 p += rb_enc_mbclen(p, e, enc);
2047 c++;
2048 }
2049 }
2050 return c;
2051 }
2052
2053 for (c=0; p<e; c++) {
2054 p += rb_enc_mbclen(p, e, enc);
2055 }
2056 return c;
2057}
2058
2059long
2060rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2061{
2062 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2063}
2064
2065/* To get strlen with cr
2066 * Note that given cr is not used.
2067 */
2068long
2069rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2070{
2071 long c;
2072 const char *q;
2073 int ret;
2074
2075 *cr = 0;
2076 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2077 long diff = (long)(e - p);
2078 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2079 }
2080 else if (rb_enc_asciicompat(enc)) {
2081 c = 0;
2082 while (p < e) {
2083 if (ISASCII(*p)) {
2084 q = search_nonascii(p, e);
2085 if (!q) {
2086 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2087 return c + (e - p);
2088 }
2089 c += q - p;
2090 p = q;
2091 }
2092 ret = rb_enc_precise_mbclen(p, e, enc);
2093 if (MBCLEN_CHARFOUND_P(ret)) {
2094 *cr |= ENC_CODERANGE_VALID;
2095 p += MBCLEN_CHARFOUND_LEN(ret);
2096 }
2097 else {
2099 p++;
2100 }
2101 c++;
2102 }
2103 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2104 return c;
2105 }
2106
2107 for (c=0; p<e; c++) {
2108 ret = rb_enc_precise_mbclen(p, e, enc);
2109 if (MBCLEN_CHARFOUND_P(ret)) {
2110 *cr |= ENC_CODERANGE_VALID;
2111 p += MBCLEN_CHARFOUND_LEN(ret);
2112 }
2113 else {
2115 if (p + rb_enc_mbminlen(enc) <= e)
2116 p += rb_enc_mbminlen(enc);
2117 else
2118 p = e;
2119 }
2120 }
2121 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2122 return c;
2123}
2124
2125/* enc must be str's enc or rb_enc_check(str, str2) */
2126static long
2127str_strlen(VALUE str, rb_encoding *enc)
2128{
2129 const char *p, *e;
2130 int cr;
2131
2132 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2133 if (!enc) enc = STR_ENC_GET(str);
2134 p = RSTRING_PTR(str);
2135 e = RSTRING_END(str);
2136 cr = ENC_CODERANGE(str);
2137
2138 if (cr == ENC_CODERANGE_UNKNOWN) {
2139 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2140 if (cr) ENC_CODERANGE_SET(str, cr);
2141 return n;
2142 }
2143 else {
2144 return enc_strlen(p, e, enc, cr);
2145 }
2146}
2147
2148long
2150{
2151 return str_strlen(str, NULL);
2152}
2153
2154/*
2155 * call-seq:
2156 * length -> integer
2157 *
2158 * :include: doc/string/length.rdoc
2159 *
2160 */
2161
2162VALUE
2164{
2165 return LONG2NUM(str_strlen(str, NULL));
2166}
2167
2168/*
2169 * call-seq:
2170 * bytesize -> integer
2171 *
2172 * :include: doc/string/bytesize.rdoc
2173 *
2174 */
2175
2176static VALUE
2177rb_str_bytesize(VALUE str)
2178{
2179 return LONG2NUM(RSTRING_LEN(str));
2180}
2181
2182/*
2183 * call-seq:
2184 * empty? -> true or false
2185 *
2186 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2187 *
2188 * "hello".empty? # => false
2189 * " ".empty? # => false
2190 * "".empty? # => true
2191 *
2192 */
2193
2194static VALUE
2195rb_str_empty(VALUE str)
2196{
2197 return RBOOL(RSTRING_LEN(str) == 0);
2198}
2199
2200/*
2201 * call-seq:
2202 * string + other_string -> new_string
2203 *
2204 * Returns a new \String containing +other_string+ concatenated to +self+:
2205 *
2206 * "Hello from " + self.to_s # => "Hello from main"
2207 *
2208 */
2209
2210VALUE
2212{
2213 VALUE str3;
2214 rb_encoding *enc;
2215 char *ptr1, *ptr2, *ptr3;
2216 long len1, len2;
2217 int termlen;
2218
2219 StringValue(str2);
2220 enc = rb_enc_check_str(str1, str2);
2221 RSTRING_GETMEM(str1, ptr1, len1);
2222 RSTRING_GETMEM(str2, ptr2, len2);
2223 termlen = rb_enc_mbminlen(enc);
2224 if (len1 > LONG_MAX - len2) {
2225 rb_raise(rb_eArgError, "string size too big");
2226 }
2227 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2228 ptr3 = RSTRING_PTR(str3);
2229 memcpy(ptr3, ptr1, len1);
2230 memcpy(ptr3+len1, ptr2, len2);
2231 TERM_FILL(&ptr3[len1+len2], termlen);
2232
2233 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2235 RB_GC_GUARD(str1);
2236 RB_GC_GUARD(str2);
2237 return str3;
2238}
2239
2240/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2241MJIT_FUNC_EXPORTED VALUE
2242rb_str_opt_plus(VALUE str1, VALUE str2)
2243{
2244 assert(RBASIC_CLASS(str1) == rb_cString);
2245 assert(RBASIC_CLASS(str2) == rb_cString);
2246 long len1, len2;
2247 MAYBE_UNUSED(char) *ptr1, *ptr2;
2248 RSTRING_GETMEM(str1, ptr1, len1);
2249 RSTRING_GETMEM(str2, ptr2, len2);
2250 int enc1 = rb_enc_get_index(str1);
2251 int enc2 = rb_enc_get_index(str2);
2252
2253 if (enc1 < 0) {
2254 return Qundef;
2255 }
2256 else if (enc2 < 0) {
2257 return Qundef;
2258 }
2259 else if (enc1 != enc2) {
2260 return Qundef;
2261 }
2262 else if (len1 > LONG_MAX - len2) {
2263 return Qundef;
2264 }
2265 else {
2266 return rb_str_plus(str1, str2);
2267 }
2268
2269}
2270
2271/*
2272 * call-seq:
2273 * string * integer -> new_string
2274 *
2275 * Returns a new \String containing +integer+ copies of +self+:
2276 *
2277 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2278 * "Ho! " * 0 # => ""
2279 *
2280 */
2281
2282VALUE
2284{
2285 VALUE str2;
2286 long n, len;
2287 char *ptr2;
2288 int termlen;
2289
2290 if (times == INT2FIX(1)) {
2291 return str_duplicate(rb_cString, str);
2292 }
2293 if (times == INT2FIX(0)) {
2294 str2 = str_alloc_embed(rb_cString, 0);
2295 rb_enc_copy(str2, str);
2296 return str2;
2297 }
2298 len = NUM2LONG(times);
2299 if (len < 0) {
2300 rb_raise(rb_eArgError, "negative argument");
2301 }
2302 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2303 if (STR_EMBEDDABLE_P(len, 1)) {
2304 str2 = str_alloc_embed(rb_cString, len + 1);
2305 memset(RSTRING_PTR(str2), 0, len + 1);
2306 }
2307 else {
2308 str2 = str_alloc_heap(rb_cString);
2309 RSTRING(str2)->as.heap.aux.capa = len;
2310 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2311 }
2312 STR_SET_LEN(str2, len);
2313 rb_enc_copy(str2, str);
2314 return str2;
2315 }
2316 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2317 rb_raise(rb_eArgError, "argument too big");
2318 }
2319
2320 len *= RSTRING_LEN(str);
2321 termlen = TERM_LEN(str);
2322 str2 = str_new0(rb_cString, 0, len, termlen);
2323 ptr2 = RSTRING_PTR(str2);
2324 if (len) {
2325 n = RSTRING_LEN(str);
2326 memcpy(ptr2, RSTRING_PTR(str), n);
2327 while (n <= len/2) {
2328 memcpy(ptr2 + n, ptr2, n);
2329 n *= 2;
2330 }
2331 memcpy(ptr2 + n, ptr2, len-n);
2332 }
2333 STR_SET_LEN(str2, len);
2334 TERM_FILL(&ptr2[len], termlen);
2335 rb_enc_cr_str_copy_for_substr(str2, str);
2336
2337 return str2;
2338}
2339
2340/*
2341 * call-seq:
2342 * string % object -> new_string
2343 *
2344 * Returns the result of formatting +object+ into the format specification +self+
2345 * (see Kernel#sprintf for formatting details):
2346 *
2347 * "%05d" % 123 # => "00123"
2348 *
2349 * If +self+ contains multiple substitutions, +object+ must be
2350 * an \Array or \Hash containing the values to be substituted:
2351 *
2352 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2353 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2354 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2355 *
2356 */
2357
2358static VALUE
2359rb_str_format_m(VALUE str, VALUE arg)
2360{
2361 VALUE tmp = rb_check_array_type(arg);
2362
2363 if (!NIL_P(tmp)) {
2364 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2365 }
2366 return rb_str_format(1, &arg, str);
2367}
2368
2369static inline void
2370rb_check_lockedtmp(VALUE str)
2371{
2372 if (FL_TEST(str, STR_TMPLOCK)) {
2373 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2374 }
2375}
2376
2377static inline void
2378str_modifiable(VALUE str)
2379{
2380 rb_check_lockedtmp(str);
2381 rb_check_frozen(str);
2382}
2383
2384static inline int
2385str_dependent_p(VALUE str)
2386{
2387 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2388 return 0;
2389 }
2390 else {
2391 return 1;
2392 }
2393}
2394
2395static inline int
2396str_independent(VALUE str)
2397{
2398 str_modifiable(str);
2399 return !str_dependent_p(str);
2400}
2401
2402static void
2403str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2404{
2405 char *ptr;
2406 char *oldptr;
2407 long capa = len + expand;
2408
2409 if (len > capa) len = capa;
2410
2411 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2412 ptr = RSTRING(str)->as.heap.ptr;
2413 STR_SET_EMBED(str);
2414 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2415 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2416 STR_SET_EMBED_LEN(str, len);
2417 return;
2418 }
2419
2420 ptr = ALLOC_N(char, (size_t)capa + termlen);
2421 oldptr = RSTRING_PTR(str);
2422 if (oldptr) {
2423 memcpy(ptr, oldptr, len);
2424 }
2425 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2426 xfree(oldptr);
2427 }
2428 STR_SET_NOEMBED(str);
2429 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2430 TERM_FILL(ptr + len, termlen);
2431 RSTRING(str)->as.heap.ptr = ptr;
2432 RSTRING(str)->as.heap.len = len;
2433 RSTRING(str)->as.heap.aux.capa = capa;
2434}
2435
2436void
2438{
2439 if (!str_independent(str))
2440 str_make_independent(str);
2442}
2443
2444void
2446{
2447 int termlen = TERM_LEN(str);
2448 long len = RSTRING_LEN(str);
2449
2450 if (expand < 0) {
2451 rb_raise(rb_eArgError, "negative expanding string size");
2452 }
2453 if (expand >= LONG_MAX - len) {
2454 rb_raise(rb_eArgError, "string size too big");
2455 }
2456
2457 if (!str_independent(str)) {
2458 str_make_independent_expand(str, len, expand, termlen);
2459 }
2460 else if (expand > 0) {
2461 RESIZE_CAPA_TERM(str, len + expand, termlen);
2462 }
2463}
2464
2465/* As rb_str_modify(), but don't clear coderange */
2466static void
2467str_modify_keep_cr(VALUE str)
2468{
2469 if (!str_independent(str))
2470 str_make_independent(str);
2472 /* Force re-scan later */
2474}
2475
2476static inline void
2477str_discard(VALUE str)
2478{
2479 str_modifiable(str);
2480 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2481 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2482 RSTRING(str)->as.heap.ptr = 0;
2483 RSTRING(str)->as.heap.len = 0;
2484 }
2485}
2486
2487void
2489{
2490 rb_encoding *enc = rb_enc_get(str);
2491 if (!enc) {
2492 rb_raise(rb_eTypeError, "not encoding capable object");
2493 }
2494 if (!rb_enc_asciicompat(enc)) {
2495 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2496 }
2497}
2498
2499VALUE
2501{
2502 VALUE s = *ptr;
2503 if (!RB_TYPE_P(s, T_STRING)) {
2504 s = rb_str_to_str(s);
2505 *ptr = s;
2506 }
2507 return s;
2508}
2509
2510char *
2512{
2513 VALUE str = rb_string_value(ptr);
2514 return RSTRING_PTR(str);
2515}
2516
/*
 * Returns 1 when the first N bytes at S are all zero, 0 as soon as a
 * non-zero byte is found.  A non-positive N yields 1 without reading S.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != 0) return 0;
    }
    return 1;
}
2525
2526static const char *
2527str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2528{
2529 const char *e = s + len;
2530
2531 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2532 if (zero_filled(s, minlen)) return s;
2533 }
2534 return 0;
2535}
2536
2537static char *
2538str_fill_term(VALUE str, char *s, long len, int termlen)
2539{
2540 /* This function assumes that (capa + termlen) bytes of memory
2541 * is allocated, like many other functions in this file.
2542 */
2543 if (str_dependent_p(str)) {
2544 if (!zero_filled(s + len, termlen))
2545 str_make_independent_expand(str, len, 0L, termlen);
2546 }
2547 else {
2548 TERM_FILL(s + len, termlen);
2549 return s;
2550 }
2551 return RSTRING_PTR(str);
2552}
2553
2554void
2555rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2556{
2557 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2558 long len = RSTRING_LEN(str);
2559
2560 assert(capa >= len);
2561 if (capa - len < termlen) {
2562 rb_check_lockedtmp(str);
2563 str_make_independent_expand(str, len, 0L, termlen);
2564 }
2565 else if (str_dependent_p(str)) {
2566 if (termlen > oldtermlen)
2567 str_make_independent_expand(str, len, 0L, termlen);
2568 }
2569 else {
2570 if (!STR_EMBED_P(str)) {
2571 /* modify capa instead of realloc */
2572 assert(!FL_TEST((str), STR_SHARED));
2573 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2574 }
2575 if (termlen > oldtermlen) {
2576 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2577 }
2578 }
2579
2580 return;
2581}
2582
2583static char *
2584str_null_check(VALUE str, int *w)
2585{
2586 char *s = RSTRING_PTR(str);
2587 long len = RSTRING_LEN(str);
2588 rb_encoding *enc = rb_enc_get(str);
2589 const int minlen = rb_enc_mbminlen(enc);
2590
2591 if (minlen > 1) {
2592 *w = 1;
2593 if (str_null_char(s, len, minlen, enc)) {
2594 return NULL;
2595 }
2596 return str_fill_term(str, s, len, minlen);
2597 }
2598 *w = 0;
2599 if (!s || memchr(s, 0, len)) {
2600 return NULL;
2601 }
2602 if (s[len]) {
2603 s = str_fill_term(str, s, len, minlen);
2604 }
2605 return s;
2606}
2607
2608char *
2609rb_str_to_cstr(VALUE str)
2610{
2611 int w;
2612 return str_null_check(str, &w);
2613}
2614
/*
 * Converts *ptr with rb_string_value(), then runs str_null_check() on
 * the result.  Raises rb_eArgError when str_null_check() returns NULL:
 * "string contains null char" if it set `w`, "string contains null byte"
 * otherwise.  Returns the checked buffer pointer on success.
 *
 * NOTE(review): source line 2616 (function name and parameter list) is
 * missing from this listing; presumably rb_string_value_cstr() taking a
 * pointer to a VALUE -- confirm upstream.
 */
2615char *
2617{
2618 VALUE str = rb_string_value(ptr);
2619 int w;
2620 char *s = str_null_check(str, &w);
2621 if (!s) {
2622 if (w) {
2623 rb_raise(rb_eArgError, "string contains null char");
2624 }
2625 rb_raise(rb_eArgError, "string contains null byte");
2626 }
2627 return s;
2628}
2629
2630char *
2631rb_str_fill_terminator(VALUE str, const int newminlen)
2632{
2633 char *s = RSTRING_PTR(str);
2634 long len = RSTRING_LEN(str);
2635 return str_fill_term(str, s, len, newminlen);
2636}
2637
/*
 * Passes `str` through rb_check_convert_type_with_id() with T_STRING,
 * "String" and idTo_str, and returns whatever that call returns.
 *
 * NOTE(review): source line 2639 (function name and parameter list) is
 * missing from this listing.  rb_str_s_try_convert() calls
 * rb_check_string_type(str), so this is presumably that function --
 * confirm upstream.
 */
2638VALUE
2640{
2641 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2642 return str;
2643}
2644
2645/*
2646 * call-seq:
2647 * String.try_convert(object) -> object, new_string, or nil
2648 *
2649 * If +object+ is a \String object, returns +object+.
2650 *
2651 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2652 * calls <tt>object.to_str</tt> and returns the result.
2653 *
2654 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2655 *
2656 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2657 */
2658static VALUE
2659rb_str_s_try_convert(VALUE dummy, VALUE str)
2660{
2661 return rb_check_string_type(str);
2662}
2663
/*
 * Advances `p` by *nthp characters of encoding `enc`, never returning a
 * pointer past `e`, and returns the resulting position.
 *
 * The local counter is written back to *nthp on every return path:
 * unchanged in the two fixed-width branches, reduced by the number of
 * characters stepped over in the ASCII-compatible branch, and
 * post-decremented once per loop test in the generic branch (so it can
 * end at -1 there).
 */
2664static char*
2665str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2666{
2667 long nth = *nthp;
/* Every character is one byte: plain pointer arithmetic. */
2668 if (rb_enc_mbmaxlen(enc) == 1) {
2669 p += nth;
2670 }
/* Fixed-width encoding: scale by the character width. */
2671 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2672 p += nth * rb_enc_mbmaxlen(enc);
2673 }
2674 else if (rb_enc_asciicompat(enc)) {
2675 const char *p2, *e2;
2676 int n;
2677
2678 while (p < e && 0 < nth) {
/* e2 is where we would land if every remaining character were one byte. */
2679 e2 = p + nth;
2680 if (e < e2) {
2681 *nthp = nth;
2682 return (char *)e;
2683 }
2684 if (ISASCII(*p)) {
/* NOTE(review): search_nonascii() is defined elsewhere; judging by its
 * use here it returns the first non-ASCII byte in [p, e2) or NULL. */
2685 p2 = search_nonascii(p, e2);
2686 if (!p2) {
2687 nth -= e2 - p;
2688 *nthp = nth;
2689 return (char *)e2;
2690 }
2691 nth -= p2 - p;
2692 p = p2;
2693 }
/* Step over one character whose length the encoding reports. */
2694 n = rb_enc_mbclen(p, e, enc);
2695 p += n;
2696 nth--;
2697 }
2698 *nthp = nth;
2699 if (nth != 0) {
2700 return (char *)e;
2701 }
2702 return (char *)p;
2703 }
2704 else {
2705 while (p < e && nth--) {
2706 p += rb_enc_mbclen(p, e, enc);
2707 }
2708 }
/* Reached only from the fixed-width and generic branches: clamp to e. */
2709 if (p > e) p = e;
2710 *nthp = nth;
2711 return (char*)p;
2712}
2713
2714char*
2715rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2716{
2717 return str_nth_len(p, e, &nth, enc);
2718}
2719
2720static char*
2721str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2722{
2723 if (singlebyte)
2724 p += nth;
2725 else {
2726 p = str_nth_len(p, e, &nth, enc);
2727 }
2728 if (!p) return 0;
2729 if (p > e) p = e;
2730 return (char *)p;
2731}
2732
2733/* char offset to byte offset */
2734static long
2735str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2736{
2737 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2738 if (!pp) return e - p;
2739 return pp - p;
2740}
2741
2742long
2743rb_str_offset(VALUE str, long pos)
2744{
2745 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2746 STR_ENC_GET(str), single_byte_optimizable(str));
2747}
2748
2749#ifdef NONASCII_MASK
/*
 * Advances `p` towards `e`, decrementing the counter for bytes that
 * is_utf8_lead_byte() accepts, and stops at such a byte once the counter
 * is zero.  Returns the resulting position and writes the remaining
 * count to *nthp.
 *
 * When both the range and the count are larger than two pointer widths,
 * a word-at-a-time pass runs first to skip most of the distance.
 *
 * NOTE(review): is_utf8_lead_byte() and count_utf8_lead_bytes_with_word()
 * are defined elsewhere; their exact semantics are inferred from their
 * names and use here.
 */
2750static char *
2751str_utf8_nth(const char *p, const char *e, long *nthp)
2752{
2753 long nth = *nthp;
2754 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2755 const uintptr_t *s, *t;
2756 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
/* s: p rounded up to a pointer-size boundary; t: e rounded down to one. */
2757 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2758 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
/* Byte-wise until p reaches the aligned address s. */
2759 while (p < (const char *)s) {
2760 if (is_utf8_lead_byte(*p)) nth--;
2761 p++;
2762 }
/* Whole words while at least a word's worth of characters remains. */
2763 do {
2764 nth -= count_utf8_lead_bytes_with_word(s);
2765 s++;
2766 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2767 p = (char *)s;
2768 }
/* Byte-wise tail: stop on the lead byte reached when the count hits zero. */
2769 while (p < e) {
2770 if (is_utf8_lead_byte(*p)) {
2771 if (nth == 0) break;
2772 nth--;
2773 }
2774 p++;
2775 }
2776 *nthp = nth;
2777 return (char *)p;
2778}
2779
/*
 * Byte distance from +p+ to the position str_utf8_nth() reports for
 * +nth+ characters; the leftover count is dropped.
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long count = nth;
    const char *target = str_utf8_nth(p, e, &count);

    return target - p;
}
2786#endif
2787
2788/* byte offset to char offset */
2789long
2790rb_str_sublen(VALUE str, long pos)
2791{
2792 if (single_byte_optimizable(str) || pos < 0)
2793 return pos;
2794 else {
2795 char *p = RSTRING_PTR(str);
2796 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2797 }
2798}
2799
2800static VALUE
2801str_subseq(VALUE str, long beg, long len)
2802{
2803 VALUE str2;
2804
2805 const long rstring_embed_capa_max = ((sizeof(struct RString) - offsetof(struct RString, as.embed.ary)) / sizeof(char)) - 1;
2806
2807 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str)) ||
2808 len <= rstring_embed_capa_max) {
2809 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
2810 RB_GC_GUARD(str);
2811 }
2812 else {
2813 str2 = str_new_shared(rb_cString, str);
2814 ENC_CODERANGE_CLEAR(str2);
2815 RSTRING(str2)->as.heap.ptr += beg;
2816 if (RSTRING(str2)->as.heap.len > len) {
2817 RSTRING(str2)->as.heap.len = len;
2818 }
2819 }
2820
2821 return str2;
2822}
2823
2824VALUE
2825rb_str_subseq(VALUE str, long beg, long len)
2826{
2827 VALUE str2 = str_subseq(str, beg, len);
2828 rb_enc_cr_str_copy_for_substr(str2, str);
2829 return str2;
2830}
2831
/*
 * Translates a character-based (beg, *lenp) request into a byte range
 * of `str`.
 *
 * Returns a pointer to the first byte of the range and stores the
 * range's byte length in *lenp, or returns 0 (leaving *lenp untouched)
 * when the request cannot be satisfied -- e.g. a negative length or a
 * start outside the string.  A negative `beg` counts from the end.
 */
2832char *
2833rb_str_subpos(VALUE str, long beg, long *lenp)
2834{
2835 long len = *lenp;
2836 long slen = -1L;
2837 long blen = RSTRING_LEN(str);
2838 rb_encoding *enc = STR_ENC_GET(str);
2839 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2840
2841 if (len < 0) return 0;
2842 if (!blen) {
2843 len = 0;
2844 }
/* Single-byte case: character indexes are byte indexes. */
2845 if (single_byte_optimizable(str)) {
2846 if (beg > blen) return 0;
2847 if (beg < 0) {
2848 beg += blen;
2849 if (beg < 0) return 0;
2850 }
2851 if (len > blen - beg)
2852 len = blen - beg;
2853 if (len < 0) return 0;
2854 p = s + beg;
2855 goto end;
2856 }
2857 if (beg < 0) {
2858 if (len > -beg) len = -beg;
/* Start is close to the end (by a worst-case byte estimate):
 * walk backwards from the end one character at a time. */
2859 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2860 beg = -beg;
/* First move e back to the end of the requested range ... */
2861 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2862 p = e;
2863 if (!p) return 0;
/* ... then move p back over the `len` characters of the range. */
2864 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2865 if (!p) return 0;
2866 len = e - p;
2867 goto end;
2868 }
2869 else {
/* Otherwise turn beg into a non-negative character index. */
2870 slen = str_strlen(str, enc);
2871 beg += slen;
2872 if (beg < 0) return 0;
2873 p = s + beg;
2874 if (len == 0) goto end;
2875 }
2876 }
2877 else if (beg > 0 && beg > RSTRING_LEN(str)) {
2878 return 0;
2879 }
2880 if (len == 0) {
2881 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2882 p = s + beg;
2883 }
2884#ifdef NONASCII_MASK
2885 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2886 enc == rb_utf8_encoding()) {
/* str_utf8_nth() leaves the unconsumed count in beg. */
2887 p = str_utf8_nth(s, e, &beg);
2888 if (beg > 0) return 0;
2889 len = str_utf8_offset(p, e, len);
2890 }
2891#endif
/* Fixed-width encoding: scale indexes by the character size. */
2892 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2893 int char_sz = rb_enc_mbmaxlen(enc);
2894
2895 p = s + beg * char_sz;
2896 if (p > e) {
2897 return 0;
2898 }
2899 else if (len * char_sz > e - p)
2900 len = e - p;
2901 else
2902 len *= char_sz;
2903 }
/* Variable-width encoding: str_nth_len() leaves the unconsumed count in beg. */
2904 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2905 if (beg > 0) return 0;
2906 len = 0;
2907 }
2908 else {
2909 len = str_offset(p, e, len, enc, 0);
2910 }
2911 end:
2912 *lenp = len;
2913 RB_GC_GUARD(str);
2914 return p;
2915}
2916
2917static VALUE str_substr(VALUE str, long beg, long len, int empty);
2918
2919VALUE
2920rb_str_substr(VALUE str, long beg, long len)
2921{
2922 return str_substr(str, beg, len, TRUE);
2923}
2924
2925static VALUE
2926str_substr(VALUE str, long beg, long len, int empty)
2927{
2928 char *p = rb_str_subpos(str, beg, &len);
2929
2930 if (!p) return Qnil;
2931 if (!len && !empty) return Qnil;
2932
2933 beg = p - RSTRING_PTR(str);
2934
2935 VALUE str2 = str_subseq(str, beg, len);
2936 rb_enc_cr_str_copy_for_substr(str2, str);
2937 return str2;
2938}
2939
/*
 * Returns `str` unchanged when it is already frozen.  Otherwise calls
 * rb_str_resize() with the string's current length and returns the
 * result of rb_obj_freeze(str).
 *
 * NOTE(review): source line 2941 (function name and parameter list) is
 * missing from this listing; presumably rb_str_freeze(VALUE str) --
 * confirm upstream.
 */
2940VALUE
2942{
2943 if (OBJ_FROZEN(str)) return str;
2944 rb_str_resize(str, RSTRING_LEN(str));
2945 return rb_obj_freeze(str);
2946}
2947
2948
2949/*
2950 * call-seq:
2951 * +string -> new_string or self
2952 *
2953 * Returns +self+ if +self+ is not frozen.
2954 *
2955 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
2956 */
2957static VALUE
2958str_uplus(VALUE str)
2959{
2960 if (OBJ_FROZEN(str)) {
2961 return rb_str_dup(str);
2962 }
2963 else {
2964 return str;
2965 }
2966}
2967
2968/*
2969 * call-seq:
2970 * -string -> frozen_string
2971 *
2972 * Returns a frozen, possibly pre-existing copy of the string.
2973 *
2974 * The returned \String will be deduplicated as long as it does not have
2975 * any instance variables set on it and is not a String subclass.
2976 *
2977 * String#dedup is an alias for String#-@.
2978 */
2979static VALUE
2980str_uminus(VALUE str)
2981{
2982 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
2983 str = rb_str_dup(str);
2984 }
2985 return rb_fstring(str);
2986}
2987
/*
 * rb_str_dup_frozen is provided as another name for rb_str_new_frozen.
 * NOTE(review): RUBY_ALIAS_FUNCTION is defined elsewhere; presumably it
 * emits a definition of the given prototype that forwards to the target
 * with the given argument list -- confirm.  The #define below maps any
 * later use of the name in this file directly to rb_str_new_frozen.
 */
2988RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
2989#define rb_str_dup_frozen rb_str_new_frozen
2990
/*
 * Sets the STR_TMPLOCK flag on `str` and returns `str`.  Raises
 * rb_eRuntimeError when the flag is already set.
 *
 * NOTE(review): source line 2992 (function name and parameter list) is
 * missing from this listing.  rb_str_locktmp_ensure() calls
 * rb_str_locktmp(str), so this is presumably that function -- confirm.
 */
2991VALUE
2993{
2994 if (FL_TEST(str, STR_TMPLOCK)) {
2995 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
2996 }
2997 FL_SET(str, STR_TMPLOCK);
2998 return str;
2999}
3000
/*
 * Clears the STR_TMPLOCK flag on `str` and returns `str`.  Raises
 * rb_eRuntimeError when the flag is not set.
 *
 * NOTE(review): source line 3002 (function name and parameter list) is
 * missing from this listing.  rb_str_locktmp_ensure() passes
 * rb_str_unlocktmp to rb_ensure(), so this is presumably that function
 * -- confirm.
 */
3001VALUE
3003{
3004 if (!FL_TEST(str, STR_TMPLOCK)) {
3005 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3006 }
3007 FL_UNSET(str, STR_TMPLOCK);
3008 return str;
3009}
3010
3011RUBY_FUNC_EXPORTED VALUE
3012rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3013{
3014 rb_str_locktmp(str);
3015 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3016}
3017
/*
 * Sets the length of `str` to `len` without reallocating and writes the
 * terminator after the new end.
 *
 * Calls str_modifiable() first, raises rb_eRuntimeError for a shared
 * string, and reports rb_bug() when `len` is negative or exceeds
 * str_capacity().
 *
 * NOTE(review): source line 3019 (function name and parameter list) is
 * missing from this listing; the body reads `str` and `len`, presumably
 * rb_str_set_len(VALUE str, long len) -- confirm upstream.
 */
3018void
3020{
3021 long capa;
3022 const int termlen = TERM_LEN(str);
3023
3024 str_modifiable(str);
3025 if (STR_SHARED_P(str)) {
3026 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3027 }
/* capa is assigned inside the condition so that rb_bug() can report it. */
3028 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3029 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3030 }
3031 STR_SET_LEN(str, len);
3032 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3033}
3034
/*
 * Changes the length of `str` to `len`, moving between embedded and
 * heap storage and reallocating as needed, and returns `str`.  Raises
 * rb_eArgError for a negative `len`.
 *
 * NOTE(review): source lines 3036 (function name and parameter list) and
 * 3046 (the body of the `if` at 3045) are missing from this listing.
 * Sibling code calls rb_str_resize(str, len), so this is presumably that
 * function; the dropped statement is not recoverable from here.
 */
3035VALUE
3037{
3038 if (len < 0) {
3039 rb_raise(rb_eArgError, "negative string size (or size too big)");
3040 }
3041
3042 int independent = str_independent(str);
3043 long slen = RSTRING_LEN(str);
3044
/* Shrinking a string whose coderange is not 7bit (body missing, see above). */
3045 if (slen > len && ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
3047 }
3048
3049 {
3050 long capa;
3051 const int termlen = TERM_LEN(str);
/* Case 1: currently embedded. */
3052 if (STR_EMBED_P(str)) {
3053 if (len == slen) return str;
3054 if (str_embed_capa(str) >= len + termlen) {
3055 STR_SET_EMBED_LEN(str, len);
3056 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3057 return str;
3058 }
3059 str_make_independent_expand(str, slen, len - slen, termlen);
3060 }
/* Case 2: on the heap, but the new size fits the embedded area. */
3061 else if (str_embed_capa(str) >= len + termlen) {
3062 char *ptr = STR_HEAP_PTR(str);
3063 STR_SET_EMBED(str);
3064 if (slen > len) slen = len;
3065 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3066 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3067 STR_SET_EMBED_LEN(str, len);
/* The old heap buffer is freed only when `independent` was true on entry. */
3068 if (independent) ruby_xfree(ptr);
3069 return str;
3070 }
/* Case 3: on the heap and not independent. */
3071 else if (!independent) {
3072 if (len == slen) return str;
3073 str_make_independent_expand(str, slen, len - slen, termlen);
3074 }
/* Case 4: independent heap buffer -- reallocate when it is too small, or
 * when the slack would exceed min(len, 1024) bytes. */
3075 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3076 (capa - len) > (len < 1024 ? len : 1024)) {
3077 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3078 (size_t)len + termlen, STR_HEAP_SIZE(str));
3079 RSTRING(str)->as.heap.aux.capa = len;
3080 }
3081 else if (len == slen) return str;
3082 RSTRING(str)->as.heap.len = len;
3083 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3084 }
3085 return str;
3086}
3087
3088static VALUE
3089str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3090{
3091 if (keep_cr) {
3092 str_modify_keep_cr(str);
3093 }
3094 else {
3095 rb_str_modify(str);
3096 }
3097 if (len == 0) return 0;
3098
3099 long capa, total, olen, off = -1;
3100 char *sptr;
3101 const int termlen = TERM_LEN(str);
3102#if !USE_RVARGC
3103 assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
3104#endif
3105
3106 RSTRING_GETMEM(str, sptr, olen);
3107 if (ptr >= sptr && ptr <= sptr + olen) {
3108 off = ptr - sptr;
3109 }
3110
3111 if (STR_EMBED_P(str)) {
3112 capa = str_embed_capa(str) - termlen;
3113 sptr = RSTRING(str)->as.embed.ary;
3114 olen = RSTRING_EMBED_LEN(str);
3115 }
3116 else {
3117 capa = RSTRING(str)->as.heap.aux.capa;
3118 sptr = RSTRING(str)->as.heap.ptr;
3119 olen = RSTRING(str)->as.heap.len;
3120 }
3121 if (olen > LONG_MAX - len) {
3122 rb_raise(rb_eArgError, "string sizes too big");
3123 }
3124 total = olen + len;
3125 if (capa < total) {
3126 if (total >= LONG_MAX / 2) {
3127 capa = total;
3128 }
3129 while (total > capa) {
3130 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3131 }
3132 RESIZE_CAPA_TERM(str, capa, termlen);
3133 sptr = RSTRING_PTR(str);
3134 }
3135 if (off != -1) {
3136 ptr = sptr + off;
3137 }
3138 memcpy(sptr + olen, ptr, len);
3139 STR_SET_LEN(str, total);
3140 TERM_FILL(sptr + total, termlen); /* sentinel */
3141
3142 return str;
3143}
3144
/*
 * Shorthands for str_buf_cat4() with its last argument fixed to false:
 * str_buf_cat takes an explicit length, str_buf_cat2 obtains the length
 * from rb_strlen_lit(ptr).
 */
3145#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3146#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3147
3148VALUE
3149rb_str_cat(VALUE str, const char *ptr, long len)
3150{
3151 if (len == 0) return str;
3152 if (len < 0) {
3153 rb_raise(rb_eArgError, "negative string size (or size too big)");
3154 }
3155 return str_buf_cat(str, ptr, len);
3156}
3157
3158VALUE
3159rb_str_cat_cstr(VALUE str, const char *ptr)
3160{
3161 must_not_null(ptr);
3162 return rb_str_buf_cat(str, ptr, strlen(ptr));
3163}
3164
/*
 * Additional names for the two functions above: rb_str_buf_cat for
 * rb_str_cat, and rb_str_buf_cat2 / rb_str_cat2 for rb_str_cat_cstr.
 * NOTE(review): RUBY_ALIAS_FUNCTION is defined elsewhere; presumably it
 * emits a definition of the given prototype that forwards to the target
 * with the given argument list -- confirm.
 */
3165RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3166RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3167RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3168
/*
 * Appends `len` bytes at `ptr`, whose encoding index is `ptr_encindex`
 * and whose coderange is `ptr_cr`, to `str`, then sets the encoding
 * index and coderange of `str` for the combined content.
 *
 * When `ptr_cr_ret` is non-NULL it receives the coderange of the
 * appended bytes as known at that point (scanned if it was unknown and
 * needed).  Raises rb_eEncCompatError when the two encodings cannot be
 * combined, and rb_eArgError for a negative `len`.  Returns `str`.
 *
 * NOTE(review): source line 3256, after the final rb_raise(), is missing
 * from this listing.
 */
3169static VALUE
3170rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3171 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3172{
3173 int str_encindex = ENCODING_GET(str);
3174 int res_encindex;
3175 int str_cr, res_cr;
3176 rb_encoding *str_enc, *ptr_enc;
3177
/* An empty receiver is treated as 7bit whatever its recorded coderange. */
3178 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3179
3180 if (str_encindex == ptr_encindex) {
/* Same encoding: scan the new bytes only when the receiver's range is known. */
3181 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3182 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3183 }
3184 }
3185 else {
3186 str_enc = rb_enc_from_index(str_encindex);
3187 ptr_enc = rb_enc_from_index(ptr_encindex);
/* Different encodings and at least one is not ASCII-compatible: only an
 * empty side can be accepted. */
3188 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3189 if (len == 0)
3190 return str;
3191 if (RSTRING_LEN(str) == 0) {
/* Empty receiver: take the bytes and adopt their encoding and terminator. */
3192 rb_str_buf_cat(str, ptr, len);
3193 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3194 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3195 return str;
3196 }
3197 goto incompatible;
3198 }
3199 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3200 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3201 }
3202 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3203 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3204 str_cr = rb_enc_str_coderange(str);
3205 }
3206 }
3207 }
3208 if (ptr_cr_ret)
3209 *ptr_cr_ret = ptr_cr;
3210
/* Different encodings can be combined only when at least one side is 7bit. */
3211 if (str_encindex != ptr_encindex &&
3212 str_cr != ENC_CODERANGE_7BIT &&
3213 ptr_cr != ENC_CODERANGE_7BIT) {
3214 str_enc = rb_enc_from_index(str_encindex);
3215 ptr_enc = rb_enc_from_index(ptr_encindex);
3216 goto incompatible;
3217 }
3218
/* Choose the encoding index and coderange of the result. */
3219 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3220 res_encindex = str_encindex;
3221 res_cr = ENC_CODERANGE_UNKNOWN;
3222 }
3223 else if (str_cr == ENC_CODERANGE_7BIT) {
3224 if (ptr_cr == ENC_CODERANGE_7BIT) {
3225 res_encindex = str_encindex;
3226 res_cr = ENC_CODERANGE_7BIT;
3227 }
3228 else {
3229 res_encindex = ptr_encindex;
3230 res_cr = ptr_cr;
3231 }
3232 }
3233 else if (str_cr == ENC_CODERANGE_VALID) {
3234 res_encindex = str_encindex;
3235 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3236 res_cr = str_cr;
3237 else
3238 res_cr = ptr_cr;
3239 }
3240 else { /* str_cr == ENC_CODERANGE_BROKEN */
3241 res_encindex = str_encindex;
3242 res_cr = str_cr;
/* Appending bytes to a broken string resets the range to unknown. */
3243 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3244 }
3245
3246 if (len < 0) {
3247 rb_raise(rb_eArgError, "negative string size (or size too big)");
3248 }
3249 str_buf_cat(str, ptr, len);
3250 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3251 return str;
3252
3253 incompatible:
3254 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3255 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3257}
3258
3259VALUE
3260rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3261{
3262 return rb_enc_cr_str_buf_cat(str, ptr, len,
3263 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3264}
3265
/*
 * Appends the NUL-terminated ASCII string `ptr` to `str`.
 *
 * When the encoding of `str` is ASCII-compatible the bytes are appended
 * in one call as 7bit data.  Otherwise each byte is converted to a
 * character of the string's encoding with rb_enc_mbcput() and appended
 * individually.  Returns `str`.
 *
 * NOTE(review): source line 3267 (function name and parameter list) is
 * missing from this listing; the body reads `str` and `ptr`, presumably
 * rb_str_buf_cat_ascii(VALUE str, const char *ptr) -- confirm upstream.
 */
3266VALUE
3268{
3269 /* ptr must reference NUL terminated ASCII string. */
3270 int encindex = ENCODING_GET(str);
3271 rb_encoding *enc = rb_enc_from_index(encindex);
3272 if (rb_enc_asciicompat(enc)) {
3273 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3274 encindex, ENC_CODERANGE_7BIT, 0);
3275 }
3276 else {
/* Scratch space sized for the longest character of the encoding. */
3277 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3278 while (*ptr) {
3279 unsigned int c = (unsigned char)*ptr;
3280 int len = rb_enc_codelen(c, enc);
3281 rb_enc_mbcput(c, buf, enc);
3282 rb_enc_cr_str_buf_cat(str, buf, len,
3283 encindex, ENC_CODERANGE_VALID, 0);
3284 ptr++;
3285 }
3286 return str;
3287 }
3288}
3289
/*
 * Appends the content of `str2` to `str` and returns `str`.
 *
 * When str_enc_fastpath(str) holds, a 7bit `str2` -- or one in the same
 * encoding, per the second branch -- is appended directly with
 * str_buf_cat4().  Everything else goes through rb_enc_cr_str_buf_cat(),
 * after which the coderange it reports is stored on `str2`.
 *
 * NOTE(review): source lines 3291 (function name and parameter list) and
 * 3301 are missing from this listing.  Line 3301 sits where the second
 * `case` label of the switch would be; going by the comment that follows
 * it is presumably the ENC_CODERANGE_VALID case -- confirm upstream.
 * Sibling code calls rb_str_buf_append(str, str2).
 */
3290VALUE
3292{
3293 int str2_cr = rb_enc_str_coderange(str2);
3294
3295 if (str_enc_fastpath(str)) {
3296 switch (str2_cr) {
3297 case ENC_CODERANGE_7BIT:
3298 // If RHS is 7bit we can do simple concatenation
3299 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3300 return str;
3302 // If RHS is valid, we can do simple concatenation if encodings are the same
3303 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3304 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3305 int str_cr = ENC_CODERANGE(str);
/* Receiver was not already VALID: combine the two coderanges. */
3306 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3307 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3308 }
3309 return str;
3310 }
3311 }
3312 }
3313
3314 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3315 ENCODING_GET(str2), str2_cr, &str2_cr);
3316
3317 ENC_CODERANGE_SET(str2, str2_cr);
3318
3319 return str;
3320}
3321
/*
 * Applies StringValue() to `str2`, then appends it to `str` with
 * rb_str_buf_append() and returns that call's result.
 *
 * NOTE(review): source line 3323 (function name and parameter list) is
 * missing from this listing.  Sibling code calls
 * rb_str_append(str1, str2), so this is presumably that function --
 * confirm upstream.
 */
3322VALUE
3324{
3325 StringValue(str2);
3326 return rb_str_buf_append(str, str2);
3327}
3328
3329#define MIN_PRE_ALLOC_SIZE 48
3330
3331MJIT_FUNC_EXPORTED VALUE
3332rb_str_concat_literals(size_t num, const VALUE *strary)
3333{
3334 VALUE str;
3335 size_t i, s;
3336 long len = 1;
3337
3338 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3339 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3340
3341 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3342 if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3343 str = rb_str_resurrect(strary[0]);
3344 s = 1;
3345 }
3346 else {
3347 str = rb_str_buf_new(len);
3348 rb_enc_copy(str, strary[0]);
3349 s = 0;
3350 }
3351
3352 for (i = s; i < num; ++i) {
3353 const VALUE v = strary[i];
3354 int encidx = ENCODING_GET(v);
3355
3356 rb_str_buf_append(str, v);
3357 if (encidx != ENCINDEX_US_ASCII) {
3358 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3359 rb_enc_set_index(str, encidx);
3360 }
3361 }
3362 return str;
3363}
3364
3365/*
3366 * call-seq:
3367 * concat(*objects) -> string
3368 *
3369 * Concatenates each object in +objects+ to +self+ and returns +self+:
3370 *
3371 * s = 'foo'
3372 * s.concat('bar', 'baz') # => "foobarbaz"
3373 * s # => "foobarbaz"
3374 *
3375 * For each given object +object+ that is an \Integer,
3376 * the value is considered a codepoint and converted to a character before concatenation:
3377 *
3378 * s = 'foo'
3379 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3380 *
3381 * Related: String#<<, which takes a single argument.
3382 */
3383static VALUE
3384rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3385{
3386 str_modifiable(str);
3387
3388 if (argc == 1) {
3389 return rb_str_concat(str, argv[0]);
3390 }
3391 else if (argc > 1) {
3392 int i;
3393 VALUE arg_str = rb_str_tmp_new(0);
3394 rb_enc_copy(arg_str, str);
3395 for (i = 0; i < argc; i++) {
3396 rb_str_concat(arg_str, argv[i]);
3397 }
3398 rb_str_buf_append(str, arg_str);
3399 }
3400
3401 return str;
3402}
3403
3404/*
3405 * call-seq:
3406 * string << object -> string
3407 *
3408 * Concatenates +object+ to +self+ and returns +self+:
3409 *
3410 * s = 'foo'
3411 * s << 'bar' # => "foobar"
3412 * s # => "foobar"
3413 *
3414 * If +object+ is an \Integer,
3415 * the value is considered a codepoint and converted to a character before concatenation:
3416 *
3417 * s = 'foo'
3418 * s << 33 # => "foo!"
3419 *
3420 * Related: String#concat, which takes multiple arguments.
3421 */
3422VALUE
3424{
/*
 * NOTE(review): this listing has lost source lines 3423 (function name
 * and parameter list), 3450 and 3476.  The body reads `str1` and `str2`
 * and sibling code calls rb_str_concat(str, obj), so this is presumably
 * rb_str_concat(VALUE str1, VALUE str2) -- confirm upstream.
 *
 * A non-Integer `str2` is appended with rb_str_append().  An Integer is
 * treated as a codepoint: it is appended as a single byte when
 * rb_ascii8bit_appendable_encoding_index() accepts it, otherwise as a
 * character encoded in the receiver's encoding.  Out-of-range or invalid
 * codepoints raise rb_eRangeError.  Returns `str1`.
 */
3425 unsigned int code;
3426 rb_encoding *enc = STR_ENC_GET(str1);
3427 int encidx;
3428
3429 if (RB_INTEGER_TYPE_P(str2)) {
/* Empty branch: a zero return means `code` now holds the value. */
3430 if (rb_num_to_uint(str2, &code) == 0) {
3431 }
3432 else if (FIXNUM_P(str2)) {
3433 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3434 }
3435 else {
3436 rb_raise(rb_eRangeError, "bignum out of char range");
3437 }
3438 }
3439 else {
3440 return rb_str_append(str1, str2);
3441 }
3442
3443 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3444 if (encidx >= 0) {
3445 char buf[1];
3446 buf[0] = (char)code;
3447 rb_str_cat(str1, buf, 1);
3448 if (encidx != rb_enc_to_index(enc)) {
3449 rb_enc_associate_index(str1, encidx);
/* NOTE(review): source line 3450 is missing here. */
3451 }
3452 }
3453 else {
3454 long pos = RSTRING_LEN(str1);
3455 int cr = ENC_CODERANGE(str1);
3456 int len;
3457 char *buf;
3458
3459 switch (len = rb_enc_codelen(code, enc)) {
3460 case ONIGERR_INVALID_CODE_POINT_VALUE:
3461 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3462 break;
3463 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3464 case 0:
3465 rb_raise(rb_eRangeError, "%u out of char range", code);
3466 break;
3467 }
3468 buf = ALLOCA_N(char, len + 1);
3469 rb_enc_mbcput(code, buf, enc);
/* Re-check the encoded bytes: they must form exactly one character of `len` bytes. */
3470 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3471 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3472 }
3473 rb_str_resize(str1, pos+len);
3474 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3475 if (cr == ENC_CODERANGE_7BIT && code > 127)
/* NOTE(review): source line 3476 -- the statement governed by the `if`
 * above -- is missing here, so the ENC_CODERANGE_SET() call below is
 * presumably unconditional in the original.  Confirm upstream. */
3477 ENC_CODERANGE_SET(str1, cr);
3478 }
3479 return str1;
3480}
3481
3482int
3483rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3484{
3485 int encidx = rb_enc_to_index(enc);
3486
3487 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3488 /* US-ASCII automatically extended to ASCII-8BIT */
3489 if (code > 0xFF) {
3490 rb_raise(rb_eRangeError, "%u out of char range", code);
3491 }
3492 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3493 return ENCINDEX_ASCII_8BIT;
3494 }
3495 return encidx;
3496 }
3497 else {
3498 return -1;
3499 }
3500}
3501
3502/*
3503 * call-seq:
3504 * prepend(*other_strings) -> string
3505 *
3506 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3507 *
3508 * s = 'foo'
3509 * s.prepend('bar', 'baz') # => "barbazfoo"
3510 * s # => "barbazfoo"
3511 *
3512 * Related: String#concat.
3513 */
3514
3515static VALUE
3516rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3517{
3518 str_modifiable(str);
3519
3520 if (argc == 1) {
3521 rb_str_update(str, 0L, 0L, argv[0]);
3522 }
3523 else if (argc > 1) {
3524 int i;
3525 VALUE arg_str = rb_str_tmp_new(0);
3526 rb_enc_copy(arg_str, str);
3527 for (i = 0; i < argc; i++) {
3528 rb_str_append(arg_str, argv[i]);
3529 }
3530 rb_str_update(str, 0L, 0L, arg_str);
3531 }
3532
3533 return str;
3534}
3535
/*
 * Hashes the bytes of `str` with rb_memhash() and XORs the result with
 * the string's encoding index.  The index is replaced by 0 when
 * is_ascii_string(str) holds.
 *
 * NOTE(review): source line 3537 (function name and parameter list) is
 * missing from this listing.  rb_str_hash_m() calls rb_str_hash(str), so
 * this is presumably that function -- confirm upstream.
 */
3536st_index_t
3538{
3539 int e = ENCODING_GET(str);
3540 if (e && is_ascii_string(str)) {
3541 e = 0;
3542 }
3543 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3544}
3545
/*
 * Returns zero when `str1` and `str2` have the same length, are
 * comparable per rb_str_comparable(), and hold identical bytes;
 * non-zero otherwise.
 *
 * NOTE(review): source line 3547 (function name and parameter list) is
 * missing from this listing and the name is not referenced elsewhere in
 * the visible code; presumably rb_str_hash_cmp(VALUE str1, VALUE str2)
 * -- confirm upstream.
 */
3546int
3548{
3549 long len1, len2;
3550 const char *ptr1, *ptr2;
3551 RSTRING_GETMEM(str1, ptr1, len1);
3552 RSTRING_GETMEM(str2, ptr2, len2);
3553 return (len1 != len2 ||
3554 !rb_str_comparable(str1, str2) ||
3555 memcmp(ptr1, ptr2, len1) != 0);
3556}
3557
3558/*
3559 * call-seq:
3560 * hash -> integer
3561 *
3562 * Returns the integer hash value for +self+.
3563 * The value is based on the length, content and encoding of +self+.
3564 *
3565 * Related: Object#hash.
3566 */
3567
3568static VALUE
3569rb_str_hash_m(VALUE str)
3570{
3571 st_index_t hval = rb_str_hash(str);
3572 return ST2FIX(hval);
3573}
3574
/* Smaller of two values; each argument is evaluated twice. */
#define lesser(a,b) (((a) <= (b)) ? (a) : (b))
3576
/*
 * Returns TRUE when `str1` and `str2` can be compared byte-wise, FALSE
 * otherwise.  That is the case when either string is empty, when both
 * have the same encoding index, when both are 7bit, or when one is 7bit
 * and the other's encoding is ASCII-compatible.
 *
 * NOTE(review): source line 3578 (function name and parameter list) is
 * missing from this listing.  Sibling code calls
 * rb_str_comparable(str1, str2), so this is presumably that function --
 * confirm upstream.
 */
3577int
3579{
3580 int idx1, idx2;
3581 int rc1, rc2;
3582
3583 if (RSTRING_LEN(str1) == 0) return TRUE;
3584 if (RSTRING_LEN(str2) == 0) return TRUE;
3585 idx1 = ENCODING_GET(str1);
3586 idx2 = ENCODING_GET(str2);
3587 if (idx1 == idx2) return TRUE;
/* Encodings differ: the coderanges decide. */
3588 rc1 = rb_enc_str_coderange(str1);
3589 rc2 = rb_enc_str_coderange(str2);
3590 if (rc1 == ENC_CODERANGE_7BIT) {
3591 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3592 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3593 return TRUE;
3594 }
3595 if (rc2 == ENC_CODERANGE_7BIT) {
3596 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3597 return TRUE;
3598 }
3599 return FALSE;
3600}
3601
/*
 * Three-way comparison of `str1` and `str2`; returns -1, 0 or 1.
 *
 * The common prefix is compared with memcmp().  When it is equal the
 * longer string is greater.  Equal-length strings with equal bytes
 * compare as 0 only if rb_str_comparable() accepts them; otherwise they
 * are ordered by encoding index.
 *
 * NOTE(review): source line 3603 (function name and parameter list) is
 * missing from this listing.  rb_str_cmp_m() calls rb_str_cmp(str1, s),
 * so this is presumably that function -- confirm upstream.
 */
3602int
3604{
3605 long len1, len2;
3606 const char *ptr1, *ptr2;
3607 int retval;
3608
3609 if (str1 == str2) return 0;
3610 RSTRING_GETMEM(str1, ptr1, len1);
3611 RSTRING_GETMEM(str2, ptr2, len2);
/* retval is assigned only when the pointers differ; it is read only on
 * the path where this condition was false, i.e. after that assignment. */
3612 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3613 if (len1 == len2) {
3614 if (!rb_str_comparable(str1, str2)) {
3615 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3616 return 1;
3617 return -1;
3618 }
3619 return 0;
3620 }
3621 if (len1 > len2) return 1;
3622 return -1;
3623 }
3624 if (retval > 0) return 1;
3625 return -1;
3626}
3627
3628/*
3629 * call-seq:
3630 * string == object -> true or false
3631 * string === object -> true or false
3632 *
3633 * Returns +true+ if +object+ has the same length and content
3634 * as +self+; +false+ otherwise:
3635 *
3636 * s = 'foo'
3637 * s == 'foo' # => true
3638 * s == 'food' # => false
3639 * s == 'FOO' # => false
3640 *
3641 * Returns +false+ if the two strings' encodings are not compatible:
3642 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3643 *
3644 * If +object+ is not an instance of \String but responds to +to_str+, then the
3645 * two strings are compared using <code>object.==</code>.
3646 */
3647
3648VALUE
3650{
/*
 * NOTE(review): source line 3649 (function name and parameter list) is
 * missing from this listing and the name is not referenced elsewhere in
 * the visible code; presumably rb_str_equal(VALUE str1, VALUE str2) --
 * confirm upstream.
 *
 * Identical objects are equal.  A non-String `str2` is equal only if it
 * responds to to_str and rb_equal(str2, str1) says so.  Two Strings are
 * compared with rb_str_eql_internal().
 */
3651 if (str1 == str2) return Qtrue;
3652 if (!RB_TYPE_P(str2, T_STRING)) {
3653 if (!rb_respond_to(str2, idTo_str)) {
3654 return Qfalse;
3655 }
/* Operands are swapped: the other object's == decides. */
3656 return rb_equal(str2, str1);
3657 }
3658 return rb_str_eql_internal(str1, str2);
3659}
3660
3661/*
3662 * call-seq:
3663 * eql?(object) -> true or false
3664 *
3665 * Returns +true+ if +object+ has the same length and content
3666 * as +self+; +false+ otherwise:
3667 *
3668 * s = 'foo'
3669 * s.eql?('foo') # => true
3670 * s.eql?('food') # => false
3671 * s.eql?('FOO') # => false
3672 *
3673 * Returns +false+ if the two strings' encodings are not compatible:
3674 *
3675 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3676 *
3677 */
3678
3679MJIT_FUNC_EXPORTED VALUE
3680rb_str_eql(VALUE str1, VALUE str2)
3681{
3682 if (str1 == str2) return Qtrue;
3683 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3684 return rb_str_eql_internal(str1, str2);
3685}
3686
3687/*
3688 * call-seq:
3689 * string <=> other_string -> -1, 0, 1, or nil
3690 *
3691 * Compares +self+ and +other_string+, returning:
3692 *
3693 * - -1 if +other_string+ is larger.
3694 * - 0 if the two are equal.
3695 * - 1 if +other_string+ is smaller.
3696 * - +nil+ if the two are incomparable.
3697 *
3698 * Examples:
3699 *
3700 * 'foo' <=> 'foo' # => 0
3701 * 'foo' <=> 'food' # => -1
3702 * 'food' <=> 'foo' # => 1
3703 * 'FOO' <=> 'foo' # => -1
3704 * 'foo' <=> 'FOO' # => 1
3705 * 'foo' <=> 1 # => nil
3706 *
3707 */
3708
3709static VALUE
3710rb_str_cmp_m(VALUE str1, VALUE str2)
3711{
3712 int result;
3713 VALUE s = rb_check_string_type(str2);
3714 if (NIL_P(s)) {
3715 return rb_invcmp(str1, str2);
3716 }
3717 result = rb_str_cmp(str1, s);
3718 return INT2FIX(result);
3719}
3720
3721static VALUE str_casecmp(VALUE str1, VALUE str2);
3722static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3723
3724/*
3725 * call-seq:
3726 * casecmp(other_string) -> -1, 0, 1, or nil
3727 *
3728 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3729 *
3730 * - -1 if <tt>other_string.downcase</tt> is larger.
3731 * - 0 if the two are equal.
3732 * - 1 if <tt>other_string.downcase</tt> is smaller.
3733 * - +nil+ if the two are incomparable.
3734 *
3735 * Examples:
3736 *
3737 * 'foo'.casecmp('foo') # => 0
3738 * 'foo'.casecmp('food') # => -1
3739 * 'food'.casecmp('foo') # => 1
3740 * 'FOO'.casecmp('foo') # => 0
3741 * 'foo'.casecmp('FOO') # => 0
3742 * 'foo'.casecmp(1) # => nil
3743 *
3744 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3745 *
3746 * Related: String#casecmp?.
3747 *
3748 */
3749
3750static VALUE
3751rb_str_casecmp(VALUE str1, VALUE str2)
3752{
3753 VALUE s = rb_check_string_type(str2);
3754 if (NIL_P(s)) {
3755 return Qnil;
3756 }
3757 return str_casecmp(str1, s);
3758}
3759
/*
 * Case-insensitive three-way comparison of two strings.
 *
 * Returns Qnil when rb_enc_compatible() finds no common encoding,
 * otherwise INT2FIX(-1), INT2FIX(0) or INT2FIX(1).  Only characters
 * that TOLOWER() maps are folded; other characters are compared by
 * their bytes.  When one string is a prefix of the other under this
 * comparison, the longer one is greater.
 */
3760static VALUE
3761str_casecmp(VALUE str1, VALUE str2)
3762{
3763 long len;
3764 rb_encoding *enc;
3765 const char *p1, *p1end, *p2, *p2end;
3766
3767 enc = rb_enc_compatible(str1, str2);
3768 if (!enc) {
3769 return Qnil;
3770 }
3771
3772 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3773 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
/* Both strings single-byte-optimizable: compare byte by byte. */
3774 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3775 while (p1 < p1end && p2 < p2end) {
3776 if (*p1 != *p2) {
3777 unsigned int c1 = TOLOWER(*p1 & 0xff);
3778 unsigned int c2 = TOLOWER(*p2 & 0xff);
3779 if (c1 != c2)
3780 return INT2FIX(c1 < c2 ? -1 : 1);
3781 }
3782 p1++;
3783 p2++;
3784 }
3785 }
3786 else {
3787 while (p1 < p1end && p2 < p2end) {
/* rb_enc_ascget() also stores a length in l1/l2; a negative result sends
 * this iteration to the byte-wise branch below. */
3788 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3789 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3790
3791 if (0 <= c1 && 0 <= c2) {
3792 c1 = TOLOWER(c1);
3793 c2 = TOLOWER(c2);
3794 if (c1 != c2)
3795 return INT2FIX(c1 < c2 ? -1 : 1);
3796 }
3797 else {
3798 int r;
/* l1/l2 are recomputed here with rb_enc_mbclen(). */
3799 l1 = rb_enc_mbclen(p1, p1end, enc);
3800 l2 = rb_enc_mbclen(p2, p2end, enc);
3801 len = l1 < l2 ? l1 : l2;
3802 r = memcmp(p1, p2, len);
3803 if (r != 0)
3804 return INT2FIX(r < 0 ? -1 : 1);
3805 if (l1 != l2)
3806 return INT2FIX(l1 < l2 ? -1 : 1);
3807 }
3808 p1 += l1;
3809 p2 += l2;
3810 }
3811 }
/* No difference found in the part both strings cover: decide by byte length. */
3812 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3813 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3814 return INT2FIX(-1);
3815}
3816
3817/*
3818 * call-seq:
3819 * casecmp?(other_string) -> true, false, or nil
3820 *
3821 * Returns +true+ if +self+ and +other_string+ are equal after
3822 * Unicode case folding, otherwise +false+:
3823 *
3824 * 'foo'.casecmp?('foo') # => true
3825 * 'foo'.casecmp?('food') # => false
3826 * 'food'.casecmp?('foo') # => false
3827 * 'FOO'.casecmp?('foo') # => true
3828 * 'foo'.casecmp?('FOO') # => true
3829 *
3830 * Returns +nil+ if the two values are incomparable:
3831 *
3832 * 'foo'.casecmp?(1) # => nil
3833 *
3834 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3835 *
3836 * Related: String#casecmp.
3837 *
3838 */
3839
3840static VALUE
3841rb_str_casecmp_p(VALUE str1, VALUE str2)
3842{
3843 VALUE s = rb_check_string_type(str2);
3844 if (NIL_P(s)) {
3845 return Qnil;
3846 }
3847 return str_casecmp_p(str1, s);
3848}
3849
3850static VALUE
3851str_casecmp_p(VALUE str1, VALUE str2)
3852{
3853 rb_encoding *enc;
3854 VALUE folded_str1, folded_str2;
3855 VALUE fold_opt = sym_fold;
3856
3857 enc = rb_enc_compatible(str1, str2);
3858 if (!enc) {
3859 return Qnil;
3860 }
3861
3862 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3863 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3864
3865 return rb_str_eql(folded_str1, folded_str2);
3866}
3867
3868static long
3869strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3870 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3871{
3872 const char *search_start = str_ptr;
3873 long pos, search_len = str_len - offset;
3874
3875 for (;;) {
3876 const char *t;
3877 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3878 if (pos < 0) return pos;
3879 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3880 if (t == search_start + pos) break;
3881 search_len -= t - search_start;
3882 if (search_len <= 0) return -1;
3883 offset += t - search_start;
3884 search_start = t;
3885 }
3886 return pos + offset;
3887}
3888
3889#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3890
3891static long
3892rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3893{
3894 const char *str_ptr, *str_ptr_end, *sub_ptr;
3895 long str_len, sub_len;
3896 rb_encoding *enc;
3897
3898 enc = rb_enc_check(str, sub);
3899 if (is_broken_string(sub)) return -1;
3900
3901 str_ptr = RSTRING_PTR(str);
3902 str_ptr_end = RSTRING_END(str);
3903 str_len = RSTRING_LEN(str);
3904 sub_ptr = RSTRING_PTR(sub);
3905 sub_len = RSTRING_LEN(sub);
3906
3907 if (str_len < sub_len) return -1;
3908
3909 if (offset != 0) {
3910 long str_len_char, sub_len_char;
3911 int single_byte = single_byte_optimizable(str);
3912 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3913 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3914 if (offset < 0) {
3915 offset += str_len_char;
3916 if (offset < 0) return -1;
3917 }
3918 if (str_len_char - offset < sub_len_char) return -1;
3919 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3920 str_ptr += offset;
3921 }
3922 if (sub_len == 0) return offset;
3923
3924 /* need proceed one character at a time */
3925 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3926}
3927
3928
3929/*
3930 * call-seq:
3931 * index(substring, offset = 0) -> integer or nil
3932 * index(regexp, offset = 0) -> integer or nil
3933 *
3934 * :include: doc/string/index.rdoc
3935 *
3936 */
3937
3938static VALUE
3939rb_str_index_m(int argc, VALUE *argv, VALUE str)
3940{
3941 VALUE sub;
3942 VALUE initpos;
3943 long pos;
3944
3945 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3946 pos = NUM2LONG(initpos);
3947 }
3948 else {
3949 pos = 0;
3950 }
3951 if (pos < 0) {
3952 pos += str_strlen(str, NULL);
3953 if (pos < 0) {
3954 if (RB_TYPE_P(sub, T_REGEXP)) {
3956 }
3957 return Qnil;
3958 }
3959 }
3960
3961 if (RB_TYPE_P(sub, T_REGEXP)) {
3962 if (pos > str_strlen(str, NULL))
3963 return Qnil;
3964 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3965 rb_enc_check(str, sub), single_byte_optimizable(str));
3966
3967 if (rb_reg_search(sub, str, pos, 0) < 0) {
3968 return Qnil;
3969 }
3970 else {
3971 VALUE match = rb_backref_get();
3972 struct re_registers *regs = RMATCH_REGS(match);
3973 pos = rb_str_sublen(str, BEG(0));
3974 return LONG2NUM(pos);
3975 }
3976 }
3977 else {
3978 StringValue(sub);
3979 pos = rb_str_index(str, sub, pos);
3980 pos = rb_str_sublen(str, pos);
3981 }
3982
3983 if (pos == -1) return Qnil;
3984 return LONG2NUM(pos);
3985}
3986
3987/* whether given pos is valid character boundary or not
3988 * Note that in this function, "character" means a code point
3989 * (Unicode scalar value), not a grapheme cluster.
3990 */
3991static bool
3992str_check_byte_pos(VALUE str, long pos)
3993{
3994 const char *s = RSTRING_PTR(str);
3995 const char *e = RSTRING_END(str);
3996 const char *p = s + pos;
3997 const char *pp = rb_enc_left_char_head(s, p, e, rb_enc_get(str));
3998 return p == pp;
3999}
4000
4001/*
4002 * call-seq:
4003 * byteindex(substring, offset = 0) -> integer or nil
4004 * byteindex(regexp, offset = 0) -> integer or nil
4005 *
4006 * Returns the \Integer byte-based index of the first occurrence of the given +substring+,
4007 * or +nil+ if none found:
4008 *
4009 * 'foo'.byteindex('f') # => 0
4010 * 'foo'.byteindex('o') # => 1
4011 * 'foo'.byteindex('oo') # => 1
4012 * 'foo'.byteindex('ooo') # => nil
4013 *
4014 * Returns the \Integer byte-based index of the first match for the given \Regexp +regexp+,
4015 * or +nil+ if none found:
4016 *
4017 * 'foo'.byteindex(/f/) # => 0
4018 * 'foo'.byteindex(/o/) # => 1
4019 * 'foo'.byteindex(/oo/) # => 1
4020 * 'foo'.byteindex(/ooo/) # => nil
4021 *
4022 * \Integer argument +offset+, if given, specifies the byte-based position in the
4023 * string to begin the search:
4024 *
4025 * 'foo'.byteindex('o', 1) # => 1
4026 * 'foo'.byteindex('o', 2) # => 2
4027 * 'foo'.byteindex('o', 3) # => nil
4028 *
4029 * If +offset+ is negative, counts backward from the end of +self+:
4030 *
4031 * 'foo'.byteindex('o', -1) # => 2
4032 * 'foo'.byteindex('o', -2) # => 1
4033 * 'foo'.byteindex('o', -3) # => 1
4034 * 'foo'.byteindex('o', -4) # => nil
4035 *
 *  If +offset+ does not land on a character (codepoint) boundary, +IndexError+ is
 *  raised.
4038 *
4039 * Related: String#index, String#byterindex.
4040 */
4041
4042static VALUE
4043rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4044{
4045 VALUE sub;
4046 VALUE initpos;
4047 long pos;
4048
4049 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4050 pos = NUM2LONG(initpos);
4051 }
4052 else {
4053 pos = 0;
4054 }
4055 if (pos < 0) {
4056 pos += RSTRING_LEN(str);
4057 if (pos < 0) {
4058 if (RB_TYPE_P(sub, T_REGEXP)) {
4060 }
4061 return Qnil;
4062 }
4063 }
4064
4065 if (!str_check_byte_pos(str, pos)) {
4067 "offset %ld does not land on character boundary", pos);
4068 }
4069
4070 if (RB_TYPE_P(sub, T_REGEXP)) {
4071 if (pos > RSTRING_LEN(str))
4072 return Qnil;
4073 if (rb_reg_search(sub, str, pos, 0) < 0) {
4074 return Qnil;
4075 }
4076 else {
4077 VALUE match = rb_backref_get();
4078 struct re_registers *regs = RMATCH_REGS(match);
4079 pos = BEG(0);
4080 return LONG2NUM(pos);
4081 }
4082 }
4083 else {
4084 StringValue(sub);
4085 pos = rb_strseq_index(str, sub, pos, 1);
4086 }
4087
4088 if (pos == -1) return Qnil;
4089 return LONG2NUM(pos);
4090}
4091
4092#ifdef HAVE_MEMRCHR
4093static long
4094str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4095{
4096 char *hit, *adjusted;
4097 int c;
4098 long slen, searchlen;
4099 char *sbeg, *e, *t;
4100
4101 sbeg = RSTRING_PTR(str);
4102 slen = RSTRING_LEN(sub);
4103 if (slen == 0) return s - sbeg;
4104 e = RSTRING_END(str);
4105 t = RSTRING_PTR(sub);
4106 c = *t & 0xff;
4107 searchlen = s - sbeg + 1;
4108
4109 do {
4110 hit = memrchr(sbeg, c, searchlen);
4111 if (!hit) break;
4112 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4113 if (hit != adjusted) {
4114 searchlen = adjusted - sbeg;
4115 continue;
4116 }
4117 if (memcmp(hit, t, slen) == 0)
4118 return hit - sbeg;
4119 searchlen = adjusted - sbeg;
4120 } while (searchlen > 0);
4121
4122 return -1;
4123}
4124#else
4125static long
4126str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4127{
4128 long slen;
4129 char *sbeg, *e, *t;
4130
4131 sbeg = RSTRING_PTR(str);
4132 e = RSTRING_END(str);
4133 t = RSTRING_PTR(sub);
4134 slen = RSTRING_LEN(sub);
4135
4136 while (s) {
4137 if (memcmp(s, t, slen) == 0) {
4138 return s - sbeg;
4139 }
4140 if (s <= sbeg) break;
4141 s = rb_enc_prev_char(sbeg, s, e, enc);
4142 }
4143
4144 return -1;
4145}
4146#endif
4147
4148static long
4149rb_str_rindex(VALUE str, VALUE sub, long pos)
4150{
4151 long len, slen;
4152 char *sbeg, *s;
4153 rb_encoding *enc;
4154 int singlebyte;
4155
4156 enc = rb_enc_check(str, sub);
4157 if (is_broken_string(sub)) return -1;
4158 singlebyte = single_byte_optimizable(str);
4159 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4160 slen = str_strlen(sub, enc); /* rb_enc_check */
4161
4162 /* substring longer than string */
4163 if (len < slen) return -1;
4164 if (len - pos < slen) pos = len - slen;
4165 if (len == 0) return pos;
4166
4167 sbeg = RSTRING_PTR(str);
4168
4169 if (pos == 0) {
4170 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4171 return 0;
4172 else
4173 return -1;
4174 }
4175
4176 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4177 return rb_str_sublen(str, str_rindex(str, sub, s, enc));
4178}
4179
4180/*
4181 * call-seq:
4182 * rindex(substring, offset = self.length) -> integer or nil
4183 * rindex(regexp, offset = self.length) -> integer or nil
4184 *
4185 * Returns the \Integer index of the _last_ occurrence of the given +substring+,
4186 * or +nil+ if none found:
4187 *
4188 * 'foo'.rindex('f') # => 0
4189 * 'foo'.rindex('o') # => 2
4190 * 'foo'.rindex('oo') # => 1
4191 * 'foo'.rindex('ooo') # => nil
4192 *
4193 * Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4194 * or +nil+ if none found:
4195 *
4196 * 'foo'.rindex(/f/) # => 0
4197 * 'foo'.rindex(/o/) # => 2
4198 * 'foo'.rindex(/oo/) # => 1
4199 * 'foo'.rindex(/ooo/) # => nil
4200 *
4201 * The _last_ match means starting at the possible last position, not
4202 * the last of longest matches.
4203 *
4204 * 'foo'.rindex(/o+/) # => 2
4205 * $~ #=> #<MatchData "o">
4206 *
 *  To get the last longest match, combine the pattern with a negative
 *  lookbehind.
4209 *
4210 * 'foo'.rindex(/(?<!o)o+/) # => 1
4211 * $~ #=> #<MatchData "oo">
4212 *
 *  Or use String#index with a negative lookahead.
4214 *
4215 * 'foo'.index(/o+(?!.*o)/) # => 1
4216 * $~ #=> #<MatchData "oo">
4217 *
4218 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4219 * string to _end_ the search:
4220 *
4221 * 'foo'.rindex('o', 0) # => nil
4222 * 'foo'.rindex('o', 1) # => 1
4223 * 'foo'.rindex('o', 2) # => 2
4224 * 'foo'.rindex('o', 3) # => 2
4225 *
4226 * If +offset+ is a negative \Integer, the maximum starting position in the
4227 * string to _end_ the search is the sum of the string's length and +offset+:
4228 *
4229 * 'foo'.rindex('o', -1) # => 2
4230 * 'foo'.rindex('o', -2) # => 1
4231 * 'foo'.rindex('o', -3) # => nil
4232 * 'foo'.rindex('o', -4) # => nil
4233 *
4234 * Related: String#index.
4235 */
4236
4237static VALUE
4238rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4239{
4240 VALUE sub;
4241 VALUE vpos;
4242 rb_encoding *enc = STR_ENC_GET(str);
4243 long pos, len = str_strlen(str, enc); /* str's enc */
4244
4245 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4246 pos = NUM2LONG(vpos);
4247 if (pos < 0) {
4248 pos += len;
4249 if (pos < 0) {
4250 if (RB_TYPE_P(sub, T_REGEXP)) {
4252 }
4253 return Qnil;
4254 }
4255 }
4256 if (pos > len) pos = len;
4257 }
4258 else {
4259 pos = len;
4260 }
4261
4262 if (RB_TYPE_P(sub, T_REGEXP)) {
4263 /* enc = rb_get_check(str, sub); */
4264 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4265 enc, single_byte_optimizable(str));
4266
4267 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4268 VALUE match = rb_backref_get();
4269 struct re_registers *regs = RMATCH_REGS(match);
4270 pos = rb_str_sublen(str, BEG(0));
4271 return LONG2NUM(pos);
4272 }
4273 }
4274 else {
4275 StringValue(sub);
4276 pos = rb_str_rindex(str, sub, pos);
4277 if (pos >= 0) return LONG2NUM(pos);
4278 }
4279 return Qnil;
4280}
4281
4282static long
4283rb_str_byterindex(VALUE str, VALUE sub, long pos)
4284{
4285 long len, slen;
4286 char *sbeg, *s;
4287 rb_encoding *enc;
4288
4289 enc = rb_enc_check(str, sub);
4290 if (is_broken_string(sub)) return -1;
4291 len = RSTRING_LEN(str);
4292 slen = RSTRING_LEN(sub);
4293
4294 /* substring longer than string */
4295 if (len < slen) return -1;
4296 if (len - pos < slen) pos = len - slen;
4297 if (len == 0) return pos;
4298
4299 sbeg = RSTRING_PTR(str);
4300
4301 if (pos == 0) {
4302 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4303 return 0;
4304 else
4305 return -1;
4306 }
4307
4308 s = sbeg + pos;
4309 return str_rindex(str, sub, s, enc);
4310}
4311
4312
4313/*
4314 * call-seq:
4315 * byterindex(substring, offset = self.bytesize) -> integer or nil
4316 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4317 *
4318 * Returns the \Integer byte-based index of the _last_ occurrence of the given +substring+,
4319 * or +nil+ if none found:
4320 *
4321 * 'foo'.byterindex('f') # => 0
4322 * 'foo'.byterindex('o') # => 2
4323 * 'foo'.byterindex('oo') # => 1
4324 * 'foo'.byterindex('ooo') # => nil
4325 *
4326 * Returns the \Integer byte-based index of the _last_ match for the given \Regexp +regexp+,
4327 * or +nil+ if none found:
4328 *
4329 * 'foo'.byterindex(/f/) # => 0
4330 * 'foo'.byterindex(/o/) # => 2
4331 * 'foo'.byterindex(/oo/) # => 1
4332 * 'foo'.byterindex(/ooo/) # => nil
4333 *
4334 * The _last_ match means starting at the possible last position, not
4335 * the last of longest matches.
4336 *
4337 * 'foo'.byterindex(/o+/) # => 2
4338 * $~ #=> #<MatchData "o">
4339 *
 *  To get the last longest match, combine the pattern with a negative
 *  lookbehind.
4342 *
4343 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4344 * $~ #=> #<MatchData "oo">
4345 *
 *  Or use String#byteindex with a negative lookahead.
4347 *
4348 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4349 * $~ #=> #<MatchData "oo">
4350 *
4351 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4352 * string to _end_ the search:
4353 *
4354 * 'foo'.byterindex('o', 0) # => nil
4355 * 'foo'.byterindex('o', 1) # => 1
4356 * 'foo'.byterindex('o', 2) # => 2
4357 * 'foo'.byterindex('o', 3) # => 2
4358 *
4359 * If +offset+ is a negative \Integer, the maximum starting position in the
4360 * string to _end_ the search is the sum of the string's length and +offset+:
4361 *
4362 * 'foo'.byterindex('o', -1) # => 2
4363 * 'foo'.byterindex('o', -2) # => 1
4364 * 'foo'.byterindex('o', -3) # => nil
4365 * 'foo'.byterindex('o', -4) # => nil
4366 *
 *  If +offset+ does not land on a character (codepoint) boundary, +IndexError+ is
 *  raised.
4369 *
4370 * Related: String#byteindex.
4371 */
4372
4373static VALUE
4374rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4375{
4376 VALUE sub;
4377 VALUE vpos;
4378 long pos, len = RSTRING_LEN(str);
4379
4380 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4381 pos = NUM2LONG(vpos);
4382 if (pos < 0) {
4383 pos += len;
4384 if (pos < 0) {
4385 if (RB_TYPE_P(sub, T_REGEXP)) {
4387 }
4388 return Qnil;
4389 }
4390 }
4391 if (pos > len) pos = len;
4392 }
4393 else {
4394 pos = len;
4395 }
4396
4397 if (!str_check_byte_pos(str, pos)) {
4399 "offset %ld does not land on character boundary", pos);
4400 }
4401
4402 if (RB_TYPE_P(sub, T_REGEXP)) {
4403 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4404 VALUE match = rb_backref_get();
4405 struct re_registers *regs = RMATCH_REGS(match);
4406 pos = BEG(0);
4407 return LONG2NUM(pos);
4408 }
4409 }
4410 else {
4411 StringValue(sub);
4412 pos = rb_str_byterindex(str, sub, pos);
4413 if (pos >= 0) return LONG2NUM(pos);
4414 }
4415 return Qnil;
4416}
4417
4418/*
4419 * call-seq:
4420 * string =~ regexp -> integer or nil
4421 * string =~ object -> integer or nil
4422 *
4423 * Returns the \Integer index of the first substring that matches
4424 * the given +regexp+, or +nil+ if no match found:
4425 *
4426 * 'foo' =~ /f/ # => 0
4427 * 'foo' =~ /o/ # => 1
4428 * 'foo' =~ /x/ # => nil
4429 *
4430 * Note: also updates Regexp@Special+global+variables.
4431 *
4432 * If the given +object+ is not a \Regexp, returns the value
4433 * returned by <tt>object =~ self</tt>.
4434 *
4435 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4436 * (see Regexp#=~):
4437 *
 *    number = nil
4439 * "no. 9" =~ /(?<number>\d+)/
4440 * number # => nil (not assigned)
4441 * /(?<number>\d+)/ =~ "no. 9"
4442 * number #=> "9"
4443 *
4444 */
4445
4446static VALUE
4447rb_str_match(VALUE x, VALUE y)
4448{
4449 switch (OBJ_BUILTIN_TYPE(y)) {
4450 case T_STRING:
4451 rb_raise(rb_eTypeError, "type mismatch: String given");
4452
4453 case T_REGEXP:
4454 return rb_reg_match(y, x);
4455
4456 default:
4457 return rb_funcall(y, idEqTilde, 1, x);
4458 }
4459}
4460
4461
4462static VALUE get_pat(VALUE);
4463
4464
4465/*
4466 * call-seq:
4467 * match(pattern, offset = 0) -> matchdata or nil
4468 * match(pattern, offset = 0) {|matchdata| ... } -> object
4469 *
4470 * Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4471 *
4472 * Note: also updates Regexp@Special+global+variables.
4473 *
4474 * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4475 * regexp = Regexp.new(pattern)
4476 * - Computes +matchdata+, which will be either a \MatchData object or +nil+
4477 * (see Regexp#match):
 *      matchdata = regexp.match(self)
4479 *
4480 * With no block given, returns the computed +matchdata+:
4481 *
4482 * 'foo'.match('f') # => #<MatchData "f">
4483 * 'foo'.match('o') # => #<MatchData "o">
4484 * 'foo'.match('x') # => nil
4485 *
4486 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4487 *
4488 * 'foo'.match('f', 1) # => nil
4489 * 'foo'.match('o', 1) # => #<MatchData "o">
4490 *
4491 * With a block given, calls the block with the computed +matchdata+
4492 * and returns the block's return value:
4493 *
4494 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4495 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4496 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4497 *
4498 */
4499
4500static VALUE
4501rb_str_match_m(int argc, VALUE *argv, VALUE str)
4502{
4503 VALUE re, result;
4504 if (argc < 1)
4505 rb_check_arity(argc, 1, 2);
4506 re = argv[0];
4507 argv[0] = str;
4508 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4509 if (!NIL_P(result) && rb_block_given_p()) {
4510 return rb_yield(result);
4511 }
4512 return result;
4513}
4514
4515/*
4516 * call-seq:
4517 * match?(pattern, offset = 0) -> true or false
4518 *
4519 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4520 *
4521 * Note: does not update Regexp@Special+global+variables.
4522 *
4523 * Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4524 * regexp = Regexp.new(pattern)
4525 *
 *  Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4527 * +false+ otherwise:
4528 *
4529 * 'foo'.match?(/o/) # => true
4530 * 'foo'.match?('o') # => true
4531 * 'foo'.match?(/x/) # => false
4532 *
4533 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4534 * 'foo'.match?('f', 1) # => false
4535 * 'foo'.match?('o', 1) # => true
4536 *
4537 */
4538
4539static VALUE
4540rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4541{
4542 VALUE re;
4543 rb_check_arity(argc, 1, 2);
4544 re = get_pat(argv[0]);
4545 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4546}
4547
/* Outcome of stepping a single character to its successor/predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no valid neighbouring character */
    NEIGHBOR_FOUND    = 1,  /* neighbour written in place, same byte length */
    NEIGHBOR_WRAPPED  = 2   /* stepped past the end of the range */
};
4553
4554static enum neighbor_char
4555enc_succ_char(char *p, long len, rb_encoding *enc)
4556{
4557 long i;
4558 int l;
4559
4560 if (rb_enc_mbminlen(enc) > 1) {
4561 /* wchar, trivial case */
4562 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4563 if (!MBCLEN_CHARFOUND_P(r)) {
4564 return NEIGHBOR_NOT_CHAR;
4565 }
4566 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4567 l = rb_enc_code_to_mbclen(c, enc);
4568 if (!l) return NEIGHBOR_NOT_CHAR;
4569 if (l != len) return NEIGHBOR_WRAPPED;
4570 rb_enc_mbcput(c, p, enc);
4571 r = rb_enc_precise_mbclen(p, p + len, enc);
4572 if (!MBCLEN_CHARFOUND_P(r)) {
4573 return NEIGHBOR_NOT_CHAR;
4574 }
4575 return NEIGHBOR_FOUND;
4576 }
4577 while (1) {
4578 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4579 p[i] = '\0';
4580 if (i < 0)
4581 return NEIGHBOR_WRAPPED;
4582 ++((unsigned char*)p)[i];
4583 l = rb_enc_precise_mbclen(p, p+len, enc);
4584 if (MBCLEN_CHARFOUND_P(l)) {
4585 l = MBCLEN_CHARFOUND_LEN(l);
4586 if (l == len) {
4587 return NEIGHBOR_FOUND;
4588 }
4589 else {
4590 memset(p+l, 0xff, len-l);
4591 }
4592 }
4593 if (MBCLEN_INVALID_P(l) && i < len-1) {
4594 long len2;
4595 int l2;
4596 for (len2 = len-1; 0 < len2; len2--) {
4597 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4598 if (!MBCLEN_INVALID_P(l2))
4599 break;
4600 }
4601 memset(p+len2+1, 0xff, len-(len2+1));
4602 }
4603 }
4604}
4605
4606static enum neighbor_char
4607enc_pred_char(char *p, long len, rb_encoding *enc)
4608{
4609 long i;
4610 int l;
4611 if (rb_enc_mbminlen(enc) > 1) {
4612 /* wchar, trivial case */
4613 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4614 if (!MBCLEN_CHARFOUND_P(r)) {
4615 return NEIGHBOR_NOT_CHAR;
4616 }
4617 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
4618 if (!c) return NEIGHBOR_NOT_CHAR;
4619 --c;
4620 l = rb_enc_code_to_mbclen(c, enc);
4621 if (!l) return NEIGHBOR_NOT_CHAR;
4622 if (l != len) return NEIGHBOR_WRAPPED;
4623 rb_enc_mbcput(c, p, enc);
4624 r = rb_enc_precise_mbclen(p, p + len, enc);
4625 if (!MBCLEN_CHARFOUND_P(r)) {
4626 return NEIGHBOR_NOT_CHAR;
4627 }
4628 return NEIGHBOR_FOUND;
4629 }
4630 while (1) {
4631 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4632 p[i] = '\xff';
4633 if (i < 0)
4634 return NEIGHBOR_WRAPPED;
4635 --((unsigned char*)p)[i];
4636 l = rb_enc_precise_mbclen(p, p+len, enc);
4637 if (MBCLEN_CHARFOUND_P(l)) {
4638 l = MBCLEN_CHARFOUND_LEN(l);
4639 if (l == len) {
4640 return NEIGHBOR_FOUND;
4641 }
4642 else {
4643 memset(p+l, 0, len-l);
4644 }
4645 }
4646 if (MBCLEN_INVALID_P(l) && i < len-1) {
4647 long len2;
4648 int l2;
4649 for (len2 = len-1; 0 < len2; len2--) {
4650 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4651 if (!MBCLEN_INVALID_P(l2))
4652 break;
4653 }
4654 memset(p+len2+1, 0, len-(len2+1));
4655 }
4656 }
4657}
4658
/*
  Overwrites +p+ with the succeeding letter in +enc+ and returns
  NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
  When NEIGHBOR_WRAPPED is returned, the carried-out letter is stored in
  +carry+.
  Assumes that the characters of each range are consecutive and that
  mbclen never changes within a range.
  NEIGHBOR_NOT_CHAR is returned for an invalid character, or when the range
  contains only one character.
 */
4668static enum neighbor_char
4669enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4670{
4671 enum neighbor_char ret;
4672 unsigned int c;
4673 int ctype;
4674 int range;
4675 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4676
4677 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4678 int try;
4679 const int max_gaps = 1;
4680
4681 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4682 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4683 ctype = ONIGENC_CTYPE_DIGIT;
4684 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4685 ctype = ONIGENC_CTYPE_ALPHA;
4686 else
4687 return NEIGHBOR_NOT_CHAR;
4688
4689 MEMCPY(save, p, char, len);
4690 for (try = 0; try <= max_gaps; ++try) {
4691 ret = enc_succ_char(p, len, enc);
4692 if (ret == NEIGHBOR_FOUND) {
4693 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4694 if (rb_enc_isctype(c, ctype, enc))
4695 return NEIGHBOR_FOUND;
4696 }
4697 }
4698 MEMCPY(p, save, char, len);
4699 range = 1;
4700 while (1) {
4701 MEMCPY(save, p, char, len);
4702 ret = enc_pred_char(p, len, enc);
4703 if (ret == NEIGHBOR_FOUND) {
4704 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4705 if (!rb_enc_isctype(c, ctype, enc)) {
4706 MEMCPY(p, save, char, len);
4707 break;
4708 }
4709 }
4710 else {
4711 MEMCPY(p, save, char, len);
4712 break;
4713 }
4714 range++;
4715 }
4716 if (range == 1) {
4717 return NEIGHBOR_NOT_CHAR;
4718 }
4719
4720 if (ctype != ONIGENC_CTYPE_DIGIT) {
4721 MEMCPY(carry, p, char, len);
4722 return NEIGHBOR_WRAPPED;
4723 }
4724
4725 MEMCPY(carry, p, char, len);
4726 enc_succ_char(carry, len, enc);
4727 return NEIGHBOR_WRAPPED;
4728}
4729
4730
4731static VALUE str_succ(VALUE str);
4732
4733/*
4734 * call-seq:
4735 * succ -> new_str
4736 *
4737 * Returns the successor to +self+. The successor is calculated by
4738 * incrementing characters.
4739 *
 *  The first character to be incremented is the rightmost alphanumeric,
 *  or, if there are no alphanumerics, the rightmost character:
4742 *
4743 * 'THX1138'.succ # => "THX1139"
4744 * '<<koala>>'.succ # => "<<koalb>>"
4745 * '***'.succ # => '**+'
4746 *
4747 * The successor to a digit is another digit, "carrying" to the next-left
4748 * character for a "rollover" from 9 to 0, and prepending another digit
4749 * if necessary:
4750 *
4751 * '00'.succ # => "01"
4752 * '09'.succ # => "10"
4753 * '99'.succ # => "100"
4754 *
4755 * The successor to a letter is another letter of the same case,
4756 * carrying to the next-left character for a rollover,
4757 * and prepending another same-case letter if necessary:
4758 *
4759 * 'aa'.succ # => "ab"
4760 * 'az'.succ # => "ba"
4761 * 'zz'.succ # => "aaa"
4762 * 'AA'.succ # => "AB"
4763 * 'AZ'.succ # => "BA"
4764 * 'ZZ'.succ # => "AAA"
4765 *
4766 * The successor to a non-alphanumeric character is the next character
4767 * in the underlying character set's collating sequence,
4768 * carrying to the next-left character for a rollover,
4769 * and prepending another character if necessary:
4770 *
4771 * s = 0.chr * 3
4772 * s # => "\x00\x00\x00"
4773 * s.succ # => "\x00\x00\x01"
4774 * s = 255.chr * 3
4775 * s # => "\xFF\xFF\xFF"
4776 * s.succ # => "\x01\x00\x00\x00"
4777 *
4778 * Carrying can occur between and among mixtures of alphanumeric characters:
4779 *
4780 * s = 'zz99zz99'
4781 * s.succ # => "aaa00aa00"
4782 * s = '99zz99zz'
4783 * s.succ # => "100aa00aa"
4784 *
4785 * The successor to an empty \String is a new empty \String:
4786 *
4787 * ''.succ # => ""
4788 *
4789 * String#next is an alias for String#succ.
4790 */
4791
4792VALUE
4794{
4795 VALUE str;
4796 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4797 rb_enc_cr_str_copy_for_substr(str, orig);
4798 return str_succ(str);
4799}
4800
4801static VALUE
4802str_succ(VALUE str)
4803{
4804 rb_encoding *enc;
4805 char *sbeg, *s, *e, *last_alnum = 0;
4806 int found_alnum = 0;
4807 long l, slen;
4808 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4809 long carry_pos = 0, carry_len = 1;
4810 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4811
4812 slen = RSTRING_LEN(str);
4813 if (slen == 0) return str;
4814
4815 enc = STR_ENC_GET(str);
4816 sbeg = RSTRING_PTR(str);
4817 s = e = sbeg + slen;
4818
4819 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4820 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4821 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4822 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4823 break;
4824 }
4825 }
4826 l = rb_enc_precise_mbclen(s, e, enc);
4827 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4828 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4829 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4830 switch (neighbor) {
4831 case NEIGHBOR_NOT_CHAR:
4832 continue;
4833 case NEIGHBOR_FOUND:
4834 return str;
4835 case NEIGHBOR_WRAPPED:
4836 last_alnum = s;
4837 break;
4838 }
4839 found_alnum = 1;
4840 carry_pos = s - sbeg;
4841 carry_len = l;
4842 }
4843 if (!found_alnum) { /* str contains no alnum */
4844 s = e;
4845 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4846 enum neighbor_char neighbor;
4847 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4848 l = rb_enc_precise_mbclen(s, e, enc);
4849 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4850 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4851 MEMCPY(tmp, s, char, l);
4852 neighbor = enc_succ_char(tmp, l, enc);
4853 switch (neighbor) {
4854 case NEIGHBOR_FOUND:
4855 MEMCPY(s, tmp, char, l);
4856 return str;
4857 break;
4858 case NEIGHBOR_WRAPPED:
4859 MEMCPY(s, tmp, char, l);
4860 break;
4861 case NEIGHBOR_NOT_CHAR:
4862 break;
4863 }
4864 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4865 /* wrapped to \0...\0. search next valid char. */
4866 enc_succ_char(s, l, enc);
4867 }
4868 if (!rb_enc_asciicompat(enc)) {
4869 MEMCPY(carry, s, char, l);
4870 carry_len = l;
4871 }
4872 carry_pos = s - sbeg;
4873 }
4875 }
4876 RESIZE_CAPA(str, slen + carry_len);
4877 sbeg = RSTRING_PTR(str);
4878 s = sbeg + carry_pos;
4879 memmove(s + carry_len, s, slen - carry_pos);
4880 memmove(s, carry, carry_len);
4881 slen += carry_len;
4882 STR_SET_LEN(str, slen);
4883 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4885 return str;
4886}
4887
4888
4889/*
4890 * call-seq:
4891 * succ! -> self
4892 *
4893 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4894 *
4895 * String#next! is an alias for String#succ!.
4896 */
4897
4898static VALUE
4899rb_str_succ_bang(VALUE str)
4900{
4901 rb_str_modify(str);
4902 str_succ(str);
4903 return str;
4904}
4905
/* Returns 1 if each of the first +len+ bytes of +s+ satisfies ISDIGIT
 * (trivially true when len <= 0), otherwise 0. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
4915
4916static int
4917str_upto_i(VALUE str, VALUE arg)
4918{
4919 rb_yield(str);
4920 return 0;
4921}
4922
4923/*
4924 * call-seq:
4925 * upto(other_string, exclusive = false) {|string| ... } -> self
4926 * upto(other_string, exclusive = false) -> new_enumerator
4927 *
4928 * With a block given, calls the block with each \String value
4929 * returned by successive calls to String#succ;
4930 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4931 * the sequence terminates when value +other_string+ is reached;
4932 * returns +self+:
4933 *
4934 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4935 * Output:
4936 *
4937 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4938 *
4939 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
4940 *
4941 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4942 *
4943 * Output:
4944 *
4945 * a8 a9 b0 b1 b2 b3 b4 b5
4946 *
4947 * If +other_string+ would not be reached, does not call the block:
4948 *
4949 * '25'.upto('5') {|s| fail s }
4950 * 'aa'.upto('a') {|s| fail s }
4951 *
4952 * With no block given, returns a new \Enumerator:
4953 *
4954 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
4955 *
4956 */
4957
4958static VALUE
4959rb_str_upto(int argc, VALUE *argv, VALUE beg)
4960{
4961 VALUE end, exclusive;
4962
4963 rb_scan_args(argc, argv, "11", &end, &exclusive);
4964 RETURN_ENUMERATOR(beg, argc, argv);
4965 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4966}
4967
4968VALUE
4969rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4970{
4971 VALUE current, after_end;
4972 ID succ;
4973 int n, ascii;
4974 rb_encoding *enc;
4975
4976 CONST_ID(succ, "succ");
4977 StringValue(end);
4978 enc = rb_enc_check(beg, end);
4979 ascii = (is_ascii_string(beg) && is_ascii_string(end));
4980 /* single character */
4981 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4982 char c = RSTRING_PTR(beg)[0];
4983 char e = RSTRING_PTR(end)[0];
4984
4985 if (c > e || (excl && c == e)) return beg;
4986 for (;;) {
4987 if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4988 if (!excl && c == e) break;
4989 c++;
4990 if (excl && c == e) break;
4991 }
4992 return beg;
4993 }
4994 /* both edges are all digits */
4995 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4996 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4997 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4998 VALUE b, e;
4999 int width;
5000
5001 width = RSTRING_LENINT(beg);
5002 b = rb_str_to_inum(beg, 10, FALSE);
5003 e = rb_str_to_inum(end, 10, FALSE);
5004 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5005 long bi = FIX2LONG(b);
5006 long ei = FIX2LONG(e);
5007 rb_encoding *usascii = rb_usascii_encoding();
5008
5009 while (bi <= ei) {
5010 if (excl && bi == ei) break;
5011 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5012 bi++;
5013 }
5014 }
5015 else {
5016 ID op = excl ? '<' : idLE;
5017 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5018
5019 args[0] = INT2FIX(width);
5020 while (rb_funcall(b, op, 1, e)) {
5021 args[1] = b;
5022 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5023 b = rb_funcallv(b, succ, 0, 0);
5024 }
5025 }
5026 return beg;
5027 }
5028 /* normal case */
5029 n = rb_str_cmp(beg, end);
5030 if (n > 0 || (excl && n == 0)) return beg;
5031
5032 after_end = rb_funcallv(end, succ, 0, 0);
5033 current = str_duplicate(rb_cString, beg);
5034 while (!rb_str_equal(current, after_end)) {
5035 VALUE next = Qnil;
5036 if (excl || !rb_str_equal(current, end))
5037 next = rb_funcallv(current, succ, 0, 0);
5038 if ((*each)(current, arg)) break;
5039 if (NIL_P(next)) break;
5040 current = next;
5041 StringValue(current);
5042 if (excl && rb_str_equal(current, end)) break;
5043 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5044 break;
5045 }
5046
5047 return beg;
5048}
5049
5050VALUE
5051rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5052{
5053 VALUE current;
5054 ID succ;
5055
5056 CONST_ID(succ, "succ");
5057 /* both edges are all digits */
5058 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5059 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5060 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5061 int width = RSTRING_LENINT(beg);
5062 b = rb_str_to_inum(beg, 10, FALSE);
5063 if (FIXNUM_P(b)) {
5064 long bi = FIX2LONG(b);
5065 rb_encoding *usascii = rb_usascii_encoding();
5066
5067 while (FIXABLE(bi)) {
5068 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5069 bi++;
5070 }
5071 b = LONG2NUM(bi);
5072 }
5073 args[0] = INT2FIX(width);
5074 while (1) {
5075 args[1] = b;
5076 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5077 b = rb_funcallv(b, succ, 0, 0);
5078 }
5079 }
5080 /* normal case */
5081 current = str_duplicate(rb_cString, beg);
5082 while (1) {
5083 VALUE next = rb_funcallv(current, succ, 0, 0);
5084 if ((*each)(current, arg)) break;
5085 current = next;
5086 StringValue(current);
5087 if (RSTRING_LEN(current) == 0)
5088 break;
5089 }
5090
5091 return beg;
5092}
5093
5094static int
5095include_range_i(VALUE str, VALUE arg)
5096{
5097 VALUE *argp = (VALUE *)arg;
5098 if (!rb_equal(str, *argp)) return 0;
5099 *argp = Qnil;
5100 return 1;
5101}
5102
5103VALUE
5104rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5105{
5106 beg = rb_str_new_frozen(beg);
5107 StringValue(end);
5108 end = rb_str_new_frozen(end);
5109 if (NIL_P(val)) return Qfalse;
5110 val = rb_check_string_type(val);
5111 if (NIL_P(val)) return Qfalse;
5112 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5113 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5114 rb_enc_asciicompat(STR_ENC_GET(val))) {
5115 const char *bp = RSTRING_PTR(beg);
5116 const char *ep = RSTRING_PTR(end);
5117 const char *vp = RSTRING_PTR(val);
5118 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5119 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5120 return Qfalse;
5121 else {
5122 char b = *bp;
5123 char e = *ep;
5124 char v = *vp;
5125
5126 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5127 if (b <= v && v < e) return Qtrue;
5128 return RBOOL(!RTEST(exclusive) && v == e);
5129 }
5130 }
5131 }
5132#if 0
5133 /* both edges are all digits */
5134 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5135 all_digits_p(bp, RSTRING_LEN(beg)) &&
5136 all_digits_p(ep, RSTRING_LEN(end))) {
5137 /* TODO */
5138 }
5139#endif
5140 }
5141 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5142
5143 return RBOOL(NIL_P(val));
5144}
5145
5146static VALUE
5147rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5148{
5149 if (rb_reg_search(re, str, 0, 0) >= 0) {
5150 VALUE match = rb_backref_get();
5151 int nth = rb_reg_backref_number(match, backref);
5152 return rb_reg_nth_match(nth, match);
5153 }
5154 return Qnil;
5155}
5156
5157static VALUE
5158rb_str_aref(VALUE str, VALUE indx)
5159{
5160 long idx;
5161
5162 if (FIXNUM_P(indx)) {
5163 idx = FIX2LONG(indx);
5164 }
5165 else if (RB_TYPE_P(indx, T_REGEXP)) {
5166 return rb_str_subpat(str, indx, INT2FIX(0));
5167 }
5168 else if (RB_TYPE_P(indx, T_STRING)) {
5169 if (rb_str_index(str, indx, 0) != -1)
5170 return str_duplicate(rb_cString, indx);
5171 return Qnil;
5172 }
5173 else {
5174 /* check if indx is Range */
5175 long beg, len = str_strlen(str, NULL);
5176 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5177 case Qfalse:
5178 break;
5179 case Qnil:
5180 return Qnil;
5181 default:
5182 return rb_str_substr(str, beg, len);
5183 }
5184 idx = NUM2LONG(indx);
5185 }
5186
5187 return str_substr(str, idx, 1, FALSE);
5188}
5189
5190
5191/*
5192 * call-seq:
5193 * string[index] -> new_string or nil
5194 * string[start, length] -> new_string or nil
5195 * string[range] -> new_string or nil
5196 * string[regexp, capture = 0] -> new_string or nil
5197 * string[substring] -> new_string or nil
5198 *
5199 * Returns the substring of +self+ specified by the arguments.
5200 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5201 *
5202 *
5203 */
5204
5205static VALUE
5206rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5207{
5208 if (argc == 2) {
5209 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5210 return rb_str_subpat(str, argv[0], argv[1]);
5211 }
5212 else {
5213 long beg = NUM2LONG(argv[0]);
5214 long len = NUM2LONG(argv[1]);
5215 return rb_str_substr(str, beg, len);
5216 }
5217 }
5218 rb_check_arity(argc, 1, 2);
5219 return rb_str_aref(str, argv[0]);
5220}
5221
5222VALUE
5224{
5225 char *ptr = RSTRING_PTR(str);
5226 long olen = RSTRING_LEN(str), nlen;
5227
5228 str_modifiable(str);
5229 if (len > olen) len = olen;
5230 nlen = olen - len;
5231 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5232 char *oldptr = ptr;
5233 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5234 STR_SET_EMBED(str);
5235 STR_SET_EMBED_LEN(str, nlen);
5236 ptr = RSTRING(str)->as.embed.ary;
5237 memmove(ptr, oldptr + len, nlen);
5238 if (fl == STR_NOEMBED) xfree(oldptr);
5239 }
5240 else {
5241 if (!STR_SHARED_P(str)) {
5242 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5243 rb_enc_cr_str_exact_copy(shared, str);
5244 OBJ_FREEZE(shared);
5245 }
5246 ptr = RSTRING(str)->as.heap.ptr += len;
5247 RSTRING(str)->as.heap.len = nlen;
5248 }
5249 ptr[nlen] = 0;
5251 return str;
5252}
5253
5254static void
5255rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
5256{
5257 char *sptr;
5258 long slen, vlen = RSTRING_LEN(val);
5259 int cr;
5260
5261 if (beg == 0 && vlen == 0) {
5262 rb_str_drop_bytes(str, len);
5263 return;
5264 }
5265
5266 str_modify_keep_cr(str);
5267 RSTRING_GETMEM(str, sptr, slen);
5268 if (len < vlen) {
5269 /* expand string */
5270 RESIZE_CAPA(str, slen + vlen - len);
5271 sptr = RSTRING_PTR(str);
5272 }
5273
5275 cr = rb_enc_str_coderange(val);
5276 else
5278
5279 if (vlen != len) {
5280 memmove(sptr + beg + vlen,
5281 sptr + beg + len,
5282 slen - (beg + len));
5283 }
5284 if (vlen < beg && len < 0) {
5285 MEMZERO(sptr + slen, char, -len);
5286 }
5287 if (vlen > 0) {
5288 memmove(sptr + beg, RSTRING_PTR(val), vlen);
5289 }
5290 slen += vlen - len;
5291 STR_SET_LEN(str, slen);
5292 TERM_FILL(&sptr[slen], TERM_LEN(str));
5293 ENC_CODERANGE_SET(str, cr);
5294}
5295
5296void
5297rb_str_update(VALUE str, long beg, long len, VALUE val)
5298{
5299 long slen;
5300 char *p, *e;
5301 rb_encoding *enc;
5302 int singlebyte = single_byte_optimizable(str);
5303 int cr;
5304
5305 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5306
5307 StringValue(val);
5308 enc = rb_enc_check(str, val);
5309 slen = str_strlen(str, enc); /* rb_enc_check */
5310
5311 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5312 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5313 }
5314 if (beg < 0) {
5315 beg += slen;
5316 }
5317 assert(beg >= 0);
5318 assert(beg <= slen);
5319 if (len > slen - beg) {
5320 len = slen - beg;
5321 }
5322 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5323 if (!p) p = RSTRING_END(str);
5324 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5325 if (!e) e = RSTRING_END(str);
5326 /* error check */
5327 beg = p - RSTRING_PTR(str); /* physical position */
5328 len = e - p; /* physical length */
5329 rb_str_splice_0(str, beg, len, val);
5330 rb_enc_associate(str, enc);
5332 if (cr != ENC_CODERANGE_BROKEN)
5333 ENC_CODERANGE_SET(str, cr);
5334}
5335
5336#define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
5337
5338static void
5339rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5340{
5341 int nth;
5342 VALUE match;
5343 long start, end, len;
5344 rb_encoding *enc;
5345 struct re_registers *regs;
5346
5347 if (rb_reg_search(re, str, 0, 0) < 0) {
5348 rb_raise(rb_eIndexError, "regexp not matched");
5349 }
5350 match = rb_backref_get();
5351 nth = rb_reg_backref_number(match, backref);
5352 regs = RMATCH_REGS(match);
5353 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5354 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5355 }
5356 if (nth < 0) {
5357 nth += regs->num_regs;
5358 }
5359
5360 start = BEG(nth);
5361 if (start == -1) {
5362 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5363 }
5364 end = END(nth);
5365 len = end - start;
5366 StringValue(val);
5367 enc = rb_enc_check_str(str, val);
5368 rb_str_splice_0(str, start, len, val);
5369 rb_enc_associate(str, enc);
5370}
5371
5372static VALUE
5373rb_str_aset(VALUE str, VALUE indx, VALUE val)
5374{
5375 long idx, beg;
5376
5377 switch (TYPE(indx)) {
5378 case T_REGEXP:
5379 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5380 return val;
5381
5382 case T_STRING:
5383 beg = rb_str_index(str, indx, 0);
5384 if (beg < 0) {
5385 rb_raise(rb_eIndexError, "string not matched");
5386 }
5387 beg = rb_str_sublen(str, beg);
5388 rb_str_splice(str, beg, str_strlen(indx, NULL), val);
5389 return val;
5390
5391 default:
5392 /* check if indx is Range */
5393 {
5394 long beg, len;
5395 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5396 rb_str_splice(str, beg, len, val);
5397 return val;
5398 }
5399 }
5400 /* FALLTHROUGH */
5401
5402 case T_FIXNUM:
5403 idx = NUM2LONG(indx);
5404 rb_str_splice(str, idx, 1, val);
5405 return val;
5406 }
5407}
5408
5409/*
5410 * call-seq:
5411 * string[index] = new_string
5412 * string[start, length] = new_string
5413 * string[range] = new_string
5414 * string[regexp, capture = 0] = new_string
5415 * string[substring] = new_string
5416 *
5417 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5418 * See {String Slices}[rdoc-ref:String@String+Slices].
5419 *
5420 * A few examples:
5421 *
5422 * s = 'foo'
5423 * s[2] = 'rtune' # => "rtune"
5424 * s # => "fortune"
5425 * s[1, 5] = 'init' # => "init"
5426 * s # => "finite"
5427 * s[3..4] = 'al' # => "al"
5428 * s # => "finale"
5429 * s[/e$/] = 'ly' # => "ly"
5430 * s # => "finally"
5431 * s['lly'] = 'ncial' # => "ncial"
5432 * s # => "financial"
5433 *
5434 * String#slice is an alias for String#[].
5435 *
5436 */
5437
5438static VALUE
5439rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5440{
5441 if (argc == 3) {
5442 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5443 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5444 }
5445 else {
5446 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5447 }
5448 return argv[2];
5449 }
5450 rb_check_arity(argc, 2, 3);
5451 return rb_str_aset(str, argv[0], argv[1]);
5452}
5453
5454/*
5455 * call-seq:
5456 * insert(index, other_string) -> self
5457 *
5458 * Inserts the given +other_string+ into +self+; returns +self+.
5459 *
5460 * If the \Integer +index+ is positive, inserts +other_string+ at offset +index+:
5461 *
5462 * 'foo'.insert(1, 'bar') # => "fbaroo"
5463 *
5464 * If the \Integer +index+ is negative, counts backward from the end of +self+
5465 * and inserts +other_string+ at offset <tt>index+1</tt>
5466 * (that is, _after_ <tt>self[index]</tt>):
5467 *
5468 * 'foo'.insert(-2, 'bar') # => "fobaro"
5469 *
5470 */
5471
5472static VALUE
5473rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5474{
5475 long pos = NUM2LONG(idx);
5476
5477 if (pos == -1) {
5478 return rb_str_append(str, str2);
5479 }
5480 else if (pos < 0) {
5481 pos++;
5482 }
5483 rb_str_splice(str, pos, 0, str2);
5484 return str;
5485}
5486
5487
5488/*
5489 * call-seq:
5490 * slice!(index) -> new_string or nil
5491 * slice!(start, length) -> new_string or nil
5492 * slice!(range) -> new_string or nil
5493 * slice!(regexp, capture = 0) -> new_string or nil
5494 * slice!(substring) -> new_string or nil
5495 *
5496 * Removes and returns the substring of +self+ specified by the arguments.
5497 * See {String Slices}[rdoc-ref:String@String+Slices].
5498 *
5499 * A few examples:
5500 *
5501 * string = "This is a string"
5502 * string.slice!(2) #=> "i"
5503 * string.slice!(3..6) #=> " is "
5504 * string.slice!(/s.*t/) #=> "sa st"
5505 * string.slice!("r") #=> "r"
5506 * string #=> "Thing"
5507 *
5508 */
5509
5510static VALUE
5511rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5512{
5513 VALUE result = Qnil;
5514 VALUE indx;
5515 long beg, len = 1;
5516 char *p;
5517
5518 rb_check_arity(argc, 1, 2);
5519 str_modify_keep_cr(str);
5520 indx = argv[0];
5521 if (RB_TYPE_P(indx, T_REGEXP)) {
5522 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5523 VALUE match = rb_backref_get();
5524 struct re_registers *regs = RMATCH_REGS(match);
5525 int nth = 0;
5526 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5527 if ((nth += regs->num_regs) <= 0) return Qnil;
5528 }
5529 else if (nth >= regs->num_regs) return Qnil;
5530 beg = BEG(nth);
5531 len = END(nth) - beg;
5532 goto subseq;
5533 }
5534 else if (argc == 2) {
5535 beg = NUM2LONG(indx);
5536 len = NUM2LONG(argv[1]);
5537 goto num_index;
5538 }
5539 else if (FIXNUM_P(indx)) {
5540 beg = FIX2LONG(indx);
5541 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5542 if (!len) return Qnil;
5543 beg = p - RSTRING_PTR(str);
5544 goto subseq;
5545 }
5546 else if (RB_TYPE_P(indx, T_STRING)) {
5547 beg = rb_str_index(str, indx, 0);
5548 if (beg == -1) return Qnil;
5549 len = RSTRING_LEN(indx);
5550 result = str_duplicate(rb_cString, indx);
5551 goto squash;
5552 }
5553 else {
5554 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5555 case Qnil:
5556 return Qnil;
5557 case Qfalse:
5558 beg = NUM2LONG(indx);
5559 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5560 if (!len) return Qnil;
5561 beg = p - RSTRING_PTR(str);
5562 goto subseq;
5563 default:
5564 goto num_index;
5565 }
5566 }
5567
5568 num_index:
5569 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5570 beg = p - RSTRING_PTR(str);
5571
5572 subseq:
5573 result = rb_str_new(RSTRING_PTR(str)+beg, len);
5574 rb_enc_cr_str_copy_for_substr(result, str);
5575
5576 squash:
5577 if (len > 0) {
5578 if (beg == 0) {
5579 rb_str_drop_bytes(str, len);
5580 }
5581 else {
5582 char *sptr = RSTRING_PTR(str);
5583 long slen = RSTRING_LEN(str);
5584 if (beg + len > slen) /* pathological check */
5585 len = slen - beg;
5586 memmove(sptr + beg,
5587 sptr + beg + len,
5588 slen - (beg + len));
5589 slen -= len;
5590 STR_SET_LEN(str, slen);
5591 TERM_FILL(&sptr[slen], TERM_LEN(str));
5592 }
5593 }
5594 return result;
5595}
5596
5597static VALUE
5598get_pat(VALUE pat)
5599{
5600 VALUE val;
5601
5602 switch (OBJ_BUILTIN_TYPE(pat)) {
5603 case T_REGEXP:
5604 return pat;
5605
5606 case T_STRING:
5607 break;
5608
5609 default:
5610 val = rb_check_string_type(pat);
5611 if (NIL_P(val)) {
5612 Check_Type(pat, T_REGEXP);
5613 }
5614 pat = val;
5615 }
5616
5617 return rb_reg_regcomp(pat);
5618}
5619
5620static VALUE
5621get_pat_quoted(VALUE pat, int check)
5622{
5623 VALUE val;
5624
5625 switch (OBJ_BUILTIN_TYPE(pat)) {
5626 case T_REGEXP:
5627 return pat;
5628
5629 case T_STRING:
5630 break;
5631
5632 default:
5633 val = rb_check_string_type(pat);
5634 if (NIL_P(val)) {
5635 Check_Type(pat, T_REGEXP);
5636 }
5637 pat = val;
5638 }
5639 if (check && is_broken_string(pat)) {
5640 rb_exc_raise(rb_reg_check_preprocess(pat));
5641 }
5642 return pat;
5643}
5644
5645static long
5646rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5647{
5648 if (BUILTIN_TYPE(pat) == T_STRING) {
5649 pos = rb_strseq_index(str, pat, pos, 1);
5650 if (set_backref_str) {
5651 if (pos >= 0) {
5652 str = rb_str_new_frozen_String(str);
5653 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5654 }
5655 else {
5657 }
5658 }
5659 return pos;
5660 }
5661 else {
5662 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5663 }
5664}
5665
5666
5667/*
5668 * call-seq:
5669 * sub!(pattern, replacement) -> self or nil
5670 * sub!(pattern) {|match| ... } -> self or nil
5671 *
5672 * Returns +self+ with only the first occurrence
5673 * (not all occurrences) of the given +pattern+ replaced.
5674 *
5675 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5676 *
5677 * Related: String#sub, String#gsub, String#gsub!.
5678 *
5679 */
5680
5681static VALUE
5682rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5683{
5684 VALUE pat, repl, hash = Qnil;
5685 int iter = 0;
5686 long plen;
5687 int min_arity = rb_block_given_p() ? 1 : 2;
5688 long beg;
5689
5690 rb_check_arity(argc, min_arity, 2);
5691 if (argc == 1) {
5692 iter = 1;
5693 }
5694 else {
5695 repl = argv[1];
5696 hash = rb_check_hash_type(argv[1]);
5697 if (NIL_P(hash)) {
5698 StringValue(repl);
5699 }
5700 }
5701
5702 pat = get_pat_quoted(argv[0], 1);
5703
5704 str_modifiable(str);
5705 beg = rb_pat_search(pat, str, 0, 1);
5706 if (beg >= 0) {
5707 rb_encoding *enc;
5708 int cr = ENC_CODERANGE(str);
5709 long beg0, end0;
5710 VALUE match, match0 = Qnil;
5711 struct re_registers *regs;
5712 char *p, *rp;
5713 long len, rlen;
5714
5715 match = rb_backref_get();
5716 regs = RMATCH_REGS(match);
5717 if (RB_TYPE_P(pat, T_STRING)) {
5718 beg0 = beg;
5719 end0 = beg0 + RSTRING_LEN(pat);
5720 match0 = pat;
5721 }
5722 else {
5723 beg0 = BEG(0);
5724 end0 = END(0);
5725 if (iter) match0 = rb_reg_nth_match(0, match);
5726 }
5727
5728 if (iter || !NIL_P(hash)) {
5729 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5730
5731 if (iter) {
5732 repl = rb_obj_as_string(rb_yield(match0));
5733 }
5734 else {
5735 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5736 repl = rb_obj_as_string(repl);
5737 }
5738 str_mod_check(str, p, len);
5739 rb_check_frozen(str);
5740 }
5741 else {
5742 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5743 }
5744
5745 enc = rb_enc_compatible(str, repl);
5746 if (!enc) {
5747 rb_encoding *str_enc = STR_ENC_GET(str);
5748 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5749 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5750 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5751 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5752 rb_enc_name(str_enc),
5753 rb_enc_name(STR_ENC_GET(repl)));
5754 }
5755 enc = STR_ENC_GET(repl);
5756 }
5757 rb_str_modify(str);
5758 rb_enc_associate(str, enc);
5760 int cr2 = ENC_CODERANGE(repl);
5761 if (cr2 == ENC_CODERANGE_BROKEN ||
5762 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5764 else
5765 cr = cr2;
5766 }
5767 plen = end0 - beg0;
5768 rlen = RSTRING_LEN(repl);
5769 len = RSTRING_LEN(str);
5770 if (rlen > plen) {
5771 RESIZE_CAPA(str, len + rlen - plen);
5772 }
5773 p = RSTRING_PTR(str);
5774 if (rlen != plen) {
5775 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5776 }
5777 rp = RSTRING_PTR(repl);
5778 memmove(p + beg0, rp, rlen);
5779 len += rlen - plen;
5780 STR_SET_LEN(str, len);
5781 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5782 ENC_CODERANGE_SET(str, cr);
5783
5784 return str;
5785 }
5786 return Qnil;
5787}
5788
5789
5790/*
5791 * call-seq:
5792 * sub(pattern, replacement) -> new_string
5793 * sub(pattern) {|match| ... } -> new_string
5794 *
5795 * Returns a copy of +self+ with only the first occurrence
5796 * (not all occurrences) of the given +pattern+ replaced.
5797 *
5798 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5799 *
5800 * Related: String#sub!, String#gsub, String#gsub!.
5801 *
5802 */
5803
5804static VALUE
5805rb_str_sub(int argc, VALUE *argv, VALUE str)
5806{
5807 str = str_duplicate(rb_cString, str);
5808 rb_str_sub_bang(argc, argv, str);
5809 return str;
5810}
5811
5812static VALUE
5813str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5814{
5815 VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5816 struct re_registers *regs;
5817 long beg, beg0, end0;
5818 long offset, blen, slen, len, last;
5819 enum {STR, ITER, MAP} mode = STR;
5820 char *sp, *cp;
5821 int need_backref = -1;
5822 rb_encoding *str_enc;
5823
5824 switch (argc) {
5825 case 1:
5826 RETURN_ENUMERATOR(str, argc, argv);
5827 mode = ITER;
5828 break;
5829 case 2:
5830 repl = argv[1];
5831 hash = rb_check_hash_type(argv[1]);
5832 if (NIL_P(hash)) {
5833 StringValue(repl);
5834 }
5835 else {
5836 mode = MAP;
5837 }
5838 break;
5839 default:
5840 rb_error_arity(argc, 1, 2);
5841 }
5842
5843 pat = get_pat_quoted(argv[0], 1);
5844 beg = rb_pat_search(pat, str, 0, need_backref);
5845 if (beg < 0) {
5846 if (bang) return Qnil; /* no match, no substitution */
5847 return str_duplicate(rb_cString, str);
5848 }
5849
5850 offset = 0;
5851 blen = RSTRING_LEN(str) + 30; /* len + margin */
5852 dest = rb_str_buf_new(blen);
5853 sp = RSTRING_PTR(str);
5854 slen = RSTRING_LEN(str);
5855 cp = sp;
5856 str_enc = STR_ENC_GET(str);
5857 rb_enc_associate(dest, str_enc);
5858 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
5859
5860 do {
5861 match = rb_backref_get();
5862 regs = RMATCH_REGS(match);
5863 if (RB_TYPE_P(pat, T_STRING)) {
5864 beg0 = beg;
5865 end0 = beg0 + RSTRING_LEN(pat);
5866 match0 = pat;
5867 }
5868 else {
5869 beg0 = BEG(0);
5870 end0 = END(0);
5871 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5872 }
5873
5874 if (mode) {
5875 if (mode == ITER) {
5876 val = rb_obj_as_string(rb_yield(match0));
5877 }
5878 else {
5879 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5880 val = rb_obj_as_string(val);
5881 }
5882 str_mod_check(str, sp, slen);
5883 if (val == dest) { /* paranoid check [ruby-dev:24827] */
5884 rb_raise(rb_eRuntimeError, "block should not cheat");
5885 }
5886 }
5887 else if (need_backref) {
5888 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5889 if (need_backref < 0) {
5890 need_backref = val != repl;
5891 }
5892 }
5893 else {
5894 val = repl;
5895 }
5896
5897 len = beg0 - offset; /* copy pre-match substr */
5898 if (len) {
5899 rb_enc_str_buf_cat(dest, cp, len, str_enc);
5900 }
5901
5902 rb_str_buf_append(dest, val);
5903
5904 last = offset;
5905 offset = end0;
5906 if (beg0 == end0) {
5907 /*
5908 * Always consume at least one character of the input string
5909 * in order to prevent infinite loops.
5910 */
5911 if (RSTRING_LEN(str) <= end0) break;
5912 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5913 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5914 offset = end0 + len;
5915 }
5916 cp = RSTRING_PTR(str) + offset;
5917 if (offset > RSTRING_LEN(str)) break;
5918 beg = rb_pat_search(pat, str, offset, need_backref);
5919 } while (beg >= 0);
5920 if (RSTRING_LEN(str) > offset) {
5921 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5922 }
5923 rb_pat_search(pat, str, last, 1);
5924 if (bang) {
5925 str_shared_replace(str, dest);
5926 }
5927 else {
5928 str = dest;
5929 }
5930
5931 return str;
5932}
5933
5934
5935/*
5936 * call-seq:
5937 * gsub!(pattern, replacement) -> self or nil
5938 * gsub!(pattern) {|match| ... } -> self or nil
5939 * gsub!(pattern) -> an_enumerator
5940 *
5941 * Performs the specified substring replacement(s) on +self+;
5942 * returns +self+ if any replacement occurred, +nil+ otherwise.
5943 *
5944 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5945 *
5946 * Returns an Enumerator if no +replacement+ and no block given.
5947 *
5948 * Related: String#sub, String#gsub, String#sub!.
5949 *
5950 */
5951
5952static VALUE
5953rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5954{
5955 str_modify_keep_cr(str);
5956 return str_gsub(argc, argv, str, 1);
5957}
5958
5959
5960/*
5961 * call-seq:
5962 * gsub(pattern, replacement) -> new_string
5963 * gsub(pattern) {|match| ... } -> new_string
5964 * gsub(pattern) -> enumerator
5965 *
5966 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
5967 *
5968 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5969 *
5970 * Returns an Enumerator if no +replacement+ and no block given.
5971 *
5972 * Related: String#sub, String#sub!, String#gsub!.
5973 *
5974 */
5975
5976static VALUE
5977rb_str_gsub(int argc, VALUE *argv, VALUE str)
5978{
5979 return str_gsub(argc, argv, str, 0);
5980}
5981
5982
5983/*
5984 * call-seq:
5985 * replace(other_string) -> self
5986 *
5987 * Replaces the contents of +self+ with the contents of +other_string+:
5988 *
5989 * s = 'foo' # => "foo"
5990 * s.replace('bar') # => "bar"
5991 *
5992 */
5993
5994VALUE
5996{
5997 str_modifiable(str);
5998 if (str == str2) return str;
5999
6000 StringValue(str2);
6001 str_discard(str);
6002 return str_replace(str, str2);
6003}
6004
6005/*
6006 * call-seq:
6007 * clear -> self
6008 *
6009 * Removes the contents of +self+:
6010 *
6011 * s = 'foo' # => "foo"
6012 * s.clear # => ""
6013 *
6014 */
6015
6016static VALUE
6017rb_str_clear(VALUE str)
6018{
6019 str_discard(str);
6020 STR_SET_EMBED(str);
6021 STR_SET_EMBED_LEN(str, 0);
6022 RSTRING_PTR(str)[0] = 0;
6023 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6025 else
6027 return str;
6028}
6029
6030/*
6031 * call-seq:
6032 * chr -> string
6033 *
6034 * Returns a string containing the first character of +self+:
6035 *
6036 * s = 'foo' # => "foo"
6037 * s.chr # => "f"
6038 *
6039 */
6040
6041static VALUE
6042rb_str_chr(VALUE str)
6043{
6044 return rb_str_substr(str, 0, 1);
6045}
6046
6047/*
6048 * call-seq:
6049 * getbyte(index) -> integer or nil
6050 *
6051 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6052 *
6053 * s = 'abcde' # => "abcde"
6054 * s.getbyte(0) # => 97
6055 * s.getbyte(-1) # => 101
6056 * s.getbyte(5) # => nil
6057 *
6058 * Related: String#setbyte.
6059 */
6060static VALUE
6061rb_str_getbyte(VALUE str, VALUE index)
6062{
6063 long pos = NUM2LONG(index);
6064
6065 if (pos < 0)
6066 pos += RSTRING_LEN(str);
6067 if (pos < 0 || RSTRING_LEN(str) <= pos)
6068 return Qnil;
6069
6070 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6071}
6072
6073/*
6074 * call-seq:
6075 * setbyte(index, integer) -> integer
6076 *
6077 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6078 *
6079 * s = 'abcde' # => "abcde"
6080 * s.setbyte(0, 98) # => 98
6081 * s # => "bbcde"
6082 *
6083 * Related: String#getbyte.
6084 */
6085static VALUE
6086rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6087{
6088 long pos = NUM2LONG(index);
6089 long len = RSTRING_LEN(str);
6090 char *ptr, *head, *left = 0;
6091 rb_encoding *enc;
6092 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6093
6094 if (pos < -len || len <= pos)
6095 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6096 if (pos < 0)
6097 pos += len;
6098
6099 VALUE v = rb_to_int(value);
6100 VALUE w = rb_int_and(v, INT2FIX(0xff));
6101 char byte = (char)(NUM2INT(w) & 0xFF);
6102
6103 if (!str_independent(str))
6104 str_make_independent(str);
6105 enc = STR_ENC_GET(str);
6106 head = RSTRING_PTR(str);
6107 ptr = &head[pos];
6108 if (!STR_EMBED_P(str)) {
6109 cr = ENC_CODERANGE(str);
6110 switch (cr) {
6111 case ENC_CODERANGE_7BIT:
6112 left = ptr;
6113 *ptr = byte;
6114 if (ISASCII(byte)) goto end;
6115 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6116 if (!MBCLEN_CHARFOUND_P(nlen))
6118 else
6120 goto end;
6122 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6123 width = rb_enc_precise_mbclen(left, head+len, enc);
6124 *ptr = byte;
6125 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6126 if (!MBCLEN_CHARFOUND_P(nlen))
6128 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6130 goto end;
6131 }
6132 }
6134 *ptr = byte;
6135
6136 end:
6137 return value;
6138}
6139
6140static VALUE
6141str_byte_substr(VALUE str, long beg, long len, int empty)
6142{
6143 long n = RSTRING_LEN(str);
6144
6145 if (beg > n || len < 0) return Qnil;
6146 if (beg < 0) {
6147 beg += n;
6148 if (beg < 0) return Qnil;
6149 }
6150 if (len > n - beg)
6151 len = n - beg;
6152 if (len <= 0) {
6153 if (!empty) return Qnil;
6154 len = 0;
6155 }
6156
6157 VALUE str2 = str_subseq(str, beg, len);
6158
6159 str_enc_copy(str2, str);
6160
6161 if (RSTRING_LEN(str2) == 0) {
6162 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6164 else
6166 }
6167 else {
6168 switch (ENC_CODERANGE(str)) {
6169 case ENC_CODERANGE_7BIT:
6171 break;
6172 default:
6174 break;
6175 }
6176 }
6177
6178 return str2;
6179}
6180
6181static VALUE
6182str_byte_aref(VALUE str, VALUE indx)
6183{
6184 long idx;
6185 if (FIXNUM_P(indx)) {
6186 idx = FIX2LONG(indx);
6187 }
6188 else {
6189 /* check if indx is Range */
6190 long beg, len = RSTRING_LEN(str);
6191
6192 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6193 case Qfalse:
6194 break;
6195 case Qnil:
6196 return Qnil;
6197 default:
6198 return str_byte_substr(str, beg, len, TRUE);
6199 }
6200
6201 idx = NUM2LONG(indx);
6202 }
6203 return str_byte_substr(str, idx, 1, FALSE);
6204}
6205
6206/*
6207 * call-seq:
6208 * byteslice(index, length = 1) -> string or nil
6209 * byteslice(range) -> string or nil
6210 *
6211 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6212 *
6213 * With integer arguments +index+ and +length+ given,
6214 * returns the substring beginning at the given +index+
6215 * of the given +length+ (if possible),
6216 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6217 *
6218 * s = '0123456789' # => "0123456789"
6219 * s.byteslice(2) # => "2"
6220 * s.byteslice(200) # => nil
6221 * s.byteslice(4, 3) # => "456"
6222 * s.byteslice(4, 30) # => "456789"
6223 * s.byteslice(4, -1) # => nil
6224 * s.byteslice(40, 2) # => nil
6225 *
6226 * In either case above, counts backwards from the end of +self+
6227 * if +index+ is negative:
6228 *
6229 * s = '0123456789' # => "0123456789"
6230 * s.byteslice(-4) # => "6"
6231 * s.byteslice(-4, 3) # => "678"
6232 *
6233 * With Range argument +range+ given, returns
6234 * <tt>byteslice(range.begin, range.size)</tt>:
6235 *
6236 * s = '0123456789' # => "0123456789"
6237 * s.byteslice(4..6) # => "456"
6238 * s.byteslice(-6..-4) # => "456"
6239 * s.byteslice(5..2) # => "" # range.size is zero.
6240 * s.byteslice(40..42) # => nil
6241 *
6242 * In all cases, a returned string has the same encoding as +self+:
6243 *
6244 * s.encoding # => #<Encoding:UTF-8>
6245 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6246 *
6247 */
6248
6249static VALUE
6250rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6251{
6252 if (argc == 2) {
6253 long beg = NUM2LONG(argv[0]);
6254 long len = NUM2LONG(argv[1]);
6255 return str_byte_substr(str, beg, len, TRUE);
6256 }
6257 rb_check_arity(argc, 1, 2);
6258 return str_byte_aref(str, argv[0]);
6259}
6260
6261/*
6262 * call-seq:
6263 * bytesplice(index, length, str) -> string
6264 * bytesplice(range, str) -> string
6265 *
6266 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6267 * The portion of the string affected is determined using
6268 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6269 * If the replacement string is not the same length as the text it is replacing,
6270 * the string will be adjusted accordingly.
 *  The form that takes an Integer will raise an IndexError if the value is out
6272 * of range; the Range form will raise a RangeError.
6273 * If the beginning or ending offset does not land on character (codepoint)
6274 * boundary, an IndexError will be raised.
6275 */
6276
6277static VALUE
6278rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6279{
6280 long beg, end, len, slen;
6281 VALUE val;
6282 rb_encoding *enc;
6283 int cr;
6284
6285 rb_check_arity(argc, 2, 3);
6286 if (argc == 2) {
6287 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6288 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6289 rb_builtin_class_name(argv[0]));
6290 }
6291 val = argv[1];
6292 }
6293 else {
6294 beg = NUM2LONG(argv[0]);
6295 len = NUM2LONG(argv[1]);
6296 val = argv[2];
6297 }
6298 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
6299 slen = RSTRING_LEN(str);
6300 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
6301 rb_raise(rb_eIndexError, "index %ld out of string", beg);
6302 }
6303 if (beg < 0) {
6304 beg += slen;
6305 }
6306 assert(beg >= 0);
6307 assert(beg <= slen);
6308 if (len > slen - beg) {
6309 len = slen - beg;
6310 }
6311 end = beg + len;
6312 if (!str_check_byte_pos(str, beg)) {
6314 "offset %ld does not land on character boundary", beg);
6315 }
6316 if (!str_check_byte_pos(str, end)) {
6318 "offset %ld does not land on character boundary", end);
6319 }
6320 StringValue(val);
6321 enc = rb_enc_check(str, val);
6322 str_modify_keep_cr(str);
6323 rb_str_splice_0(str, beg, len, val);
6324 rb_enc_associate(str, enc);
6326 if (cr != ENC_CODERANGE_BROKEN)
6327 ENC_CODERANGE_SET(str, cr);
6328 return str;
6329}
6330
6331/*
6332 * call-seq:
6333 * reverse -> string
6334 *
6335 * Returns a new string with the characters from +self+ in reverse order.
6336 *
6337 * 'stressed'.reverse # => "desserts"
6338 *
6339 */
6340
6341static VALUE
6342rb_str_reverse(VALUE str)
6343{
6344 rb_encoding *enc;
6345 VALUE rev;
6346 char *s, *e, *p;
6347 int cr;
6348
6349 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6350 enc = STR_ENC_GET(str);
6351 rev = rb_str_new(0, RSTRING_LEN(str));
6352 s = RSTRING_PTR(str); e = RSTRING_END(str);
6353 p = RSTRING_END(rev);
6354 cr = ENC_CODERANGE(str);
6355
6356 if (RSTRING_LEN(str) > 1) {
6357 if (single_byte_optimizable(str)) {
6358 while (s < e) {
6359 *--p = *s++;
6360 }
6361 }
6362 else if (cr == ENC_CODERANGE_VALID) {
6363 while (s < e) {
6364 int clen = rb_enc_fast_mbclen(s, e, enc);
6365
6366 p -= clen;
6367 memcpy(p, s, clen);
6368 s += clen;
6369 }
6370 }
6371 else {
6372 cr = rb_enc_asciicompat(enc) ?
6374 while (s < e) {
6375 int clen = rb_enc_mbclen(s, e, enc);
6376
6377 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6378 p -= clen;
6379 memcpy(p, s, clen);
6380 s += clen;
6381 }
6382 }
6383 }
6384 STR_SET_LEN(rev, RSTRING_LEN(str));
6385 str_enc_copy(rev, str);
6386 ENC_CODERANGE_SET(rev, cr);
6387
6388 return rev;
6389}
6390
6391
6392/*
6393 * call-seq:
6394 * reverse! -> self
6395 *
6396 * Returns +self+ with its characters reversed:
6397 *
6398 * s = 'stressed'
6399 * s.reverse! # => "desserts"
6400 * s # => "desserts"
6401 *
6402 */
6403
6404static VALUE
6405rb_str_reverse_bang(VALUE str)
6406{
6407 if (RSTRING_LEN(str) > 1) {
6408 if (single_byte_optimizable(str)) {
6409 char *s, *e, c;
6410
6411 str_modify_keep_cr(str);
6412 s = RSTRING_PTR(str);
6413 e = RSTRING_END(str) - 1;
6414 while (s < e) {
6415 c = *s;
6416 *s++ = *e;
6417 *e-- = c;
6418 }
6419 }
6420 else {
6421 str_shared_replace(str, rb_str_reverse(str));
6422 }
6423 }
6424 else {
6425 str_modify_keep_cr(str);
6426 }
6427 return str;
6428}
6429
6430
6431/*
6432 * call-seq:
6433 * include? other_string -> true or false
6434 *
6435 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6436 *
6437 * s = 'foo'
6438 * s.include?('f') # => true
6439 * s.include?('fo') # => true
6440 * s.include?('food') # => false
6441 *
6442 */
6443
6444VALUE
6445rb_str_include(VALUE str, VALUE arg)
6446{
6447 long i;
6448
6449 StringValue(arg);
6450 i = rb_str_index(str, arg, 0);
6451
6452 return RBOOL(i != -1);
6453}
6454
6455
6456/*
6457 * call-seq:
6458 * to_i(base = 10) -> integer
6459 *
6460 * Returns the result of interpreting leading characters in +self+
6461 * as an integer in the given +base+ (which must be in (0, 2..36)):
6462 *
6463 * '123456'.to_i # => 123456
6464 * '123def'.to_i(16) # => 1195503
6465 *
6466 * With +base+ zero, string +object+ may contain leading characters
6467 * to specify the actual base:
6468 *
6469 * '123def'.to_i(0) # => 123
6470 * '0123def'.to_i(0) # => 83
6471 * '0b123def'.to_i(0) # => 1
6472 * '0o123def'.to_i(0) # => 83
6473 * '0d123def'.to_i(0) # => 123
6474 * '0x123def'.to_i(0) # => 1195503
6475 *
6476 * Characters past a leading valid number (in the given +base+) are ignored:
6477 *
6478 * '12.345'.to_i # => 12
6479 * '12345'.to_i(2) # => 1
6480 *
6481 * Returns zero if there is no leading valid number:
6482 *
6483 * 'abcdef'.to_i # => 0
6484 * '2'.to_i(2) # => 0
6485 *
6486 */
6487
6488static VALUE
6489rb_str_to_i(int argc, VALUE *argv, VALUE str)
6490{
6491 int base = 10;
6492
6493 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6494 rb_raise(rb_eArgError, "invalid radix %d", base);
6495 }
6496 return rb_str_to_inum(str, base, FALSE);
6497}
6498
6499
6500/*
6501 * call-seq:
6502 * to_f -> float
6503 *
6504 * Returns the result of interpreting leading characters in +self+ as a Float:
6505 *
6506 * '3.14159'.to_f # => 3.14159
 *    '1.234e-2'.to_f # => 0.01234
6508 *
 *  Characters past a leading valid number are ignored:
6510 *
6511 * '3.14 (pi to two places)'.to_f # => 3.14
6512 *
6513 * Returns zero if there is no leading valid number:
6514 *
6515 * 'abcdef'.to_f # => 0.0
6516 *
6517 */
6518
6519static VALUE
6520rb_str_to_f(VALUE str)
6521{
6522 return DBL2NUM(rb_str_to_dbl(str, FALSE));
6523}
6524
6525
6526/*
6527 * call-seq:
6528 * to_s -> self or string
6529 *
6530 * Returns +self+ if +self+ is a \String,
6531 * or +self+ converted to a \String if +self+ is a subclass of \String.
6532 *
6533 * String#to_str is an alias for String#to_s.
6534 *
6535 */
6536
6537static VALUE
6538rb_str_to_s(VALUE str)
6539{
6540 if (rb_obj_class(str) != rb_cString) {
6541 return str_duplicate(rb_cString, str);
6542 }
6543 return str;
6544}
6545
#if 0
/* Disabled: encodes code point +c+ in +enc+ and appends it to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
6557
6558#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6559
6560int
6561rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6562{
6563 char buf[CHAR_ESC_LEN + 1];
6564 int l;
6565
6566#if SIZEOF_INT > 4
6567 c &= 0xffffffff;
6568#endif
6569 if (unicode_p) {
6570 if (c < 0x7F && ISPRINT(c)) {
6571 snprintf(buf, CHAR_ESC_LEN, "%c", c);
6572 }
6573 else if (c < 0x10000) {
6574 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6575 }
6576 else {
6577 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6578 }
6579 }
6580 else {
6581 if (c < 0x100) {
6582 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6583 }
6584 else {
6585 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6586 }
6587 }
6588 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6589 rb_str_buf_cat(result, buf, l);
6590 return l;
6591}
6592
/*
 * Returns the backslash notation for control character +c+ as a static
 * string, or NULL when +c+ has no dedicated notation.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *notation;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) {
            return escapes[i].notation;
        }
    }
    return NULL;
}
6610
6611VALUE
6612rb_str_escape(VALUE str)
6613{
6614 int encidx = ENCODING_GET(str);
6615 rb_encoding *enc = rb_enc_from_index(encidx);
6616 const char *p = RSTRING_PTR(str);
6617 const char *pend = RSTRING_END(str);
6618 const char *prev = p;
6619 char buf[CHAR_ESC_LEN + 1];
6620 VALUE result = rb_str_buf_new(0);
6621 int unicode_p = rb_enc_unicode_p(enc);
6622 int asciicompat = rb_enc_asciicompat(enc);
6623
6624 while (p < pend) {
6625 unsigned int c;
6626 const char *cc;
6627 int n = rb_enc_precise_mbclen(p, pend, enc);
6628 if (!MBCLEN_CHARFOUND_P(n)) {
6629 if (p > prev) str_buf_cat(result, prev, p - prev);
6630 n = rb_enc_mbminlen(enc);
6631 if (pend < p + n)
6632 n = (int)(pend - p);
6633 while (n--) {
6634 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6635 str_buf_cat(result, buf, strlen(buf));
6636 prev = ++p;
6637 }
6638 continue;
6639 }
6640 n = MBCLEN_CHARFOUND_LEN(n);
6641 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6642 p += n;
6643 cc = ruby_escaped_char(c);
6644 if (cc) {
6645 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6646 str_buf_cat(result, cc, strlen(cc));
6647 prev = p;
6648 }
6649 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6650 }
6651 else {
6652 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6653 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6654 prev = p;
6655 }
6656 }
6657 if (p > prev) str_buf_cat(result, prev, p - prev);
6658 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6659
6660 return result;
6661}
6662
6663/*
6664 * call-seq:
6665 * inspect -> string
6666 *
6667 * Returns a printable version of +self+, enclosed in double-quotes,
6668 * and with special characters escaped:
6669 *
6670 * s = "foo\tbar\tbaz\n"
6671 * s.inspect
6672 * # => "\"foo\\tbar\\tbaz\\n\""
6673 *
6674 */
6675
6676VALUE
6678{
6679 int encidx = ENCODING_GET(str);
6680 rb_encoding *enc = rb_enc_from_index(encidx);
6681 const char *p, *pend, *prev;
6682 char buf[CHAR_ESC_LEN + 1];
6683 VALUE result = rb_str_buf_new(0);
6684 rb_encoding *resenc = rb_default_internal_encoding();
6685 int unicode_p = rb_enc_unicode_p(enc);
6686 int asciicompat = rb_enc_asciicompat(enc);
6687
6688 if (resenc == NULL) resenc = rb_default_external_encoding();
6689 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6690 rb_enc_associate(result, resenc);
6691 str_buf_cat2(result, "\"");
6692
6693 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6694 prev = p;
6695 while (p < pend) {
6696 unsigned int c, cc;
6697 int n;
6698
6699 n = rb_enc_precise_mbclen(p, pend, enc);
6700 if (!MBCLEN_CHARFOUND_P(n)) {
6701 if (p > prev) str_buf_cat(result, prev, p - prev);
6702 n = rb_enc_mbminlen(enc);
6703 if (pend < p + n)
6704 n = (int)(pend - p);
6705 while (n--) {
6706 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6707 str_buf_cat(result, buf, strlen(buf));
6708 prev = ++p;
6709 }
6710 continue;
6711 }
6712 n = MBCLEN_CHARFOUND_LEN(n);
6713 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6714 p += n;
6715 if ((asciicompat || unicode_p) &&
6716 (c == '"'|| c == '\\' ||
6717 (c == '#' &&
6718 p < pend &&
6719 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
6720 (cc = rb_enc_codepoint(p,pend,enc),
6721 (cc == '$' || cc == '@' || cc == '{'))))) {
6722 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6723 str_buf_cat2(result, "\\");
6724 if (asciicompat || enc == resenc) {
6725 prev = p - n;
6726 continue;
6727 }
6728 }
6729 switch (c) {
6730 case '\n': cc = 'n'; break;
6731 case '\r': cc = 'r'; break;
6732 case '\t': cc = 't'; break;
6733 case '\f': cc = 'f'; break;
6734 case '\013': cc = 'v'; break;
6735 case '\010': cc = 'b'; break;
6736 case '\007': cc = 'a'; break;
6737 case 033: cc = 'e'; break;
6738 default: cc = 0; break;
6739 }
6740 if (cc) {
6741 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6742 buf[0] = '\\';
6743 buf[1] = (char)cc;
6744 str_buf_cat(result, buf, 2);
6745 prev = p;
6746 continue;
6747 }
6748 /* The special casing of 0x85 (NEXT_LINE) here is because
6749 * Oniguruma historically treats it as printable, but it
6750 * doesn't match the print POSIX bracket class or character
6751 * property in regexps.
6752 *
6753 * See Ruby Bug #16842 for details:
6754 * https://bugs.ruby-lang.org/issues/16842
6755 */
6756 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
6757 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6758 continue;
6759 }
6760 else {
6761 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6762 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6763 prev = p;
6764 continue;
6765 }
6766 }
6767 if (p > prev) str_buf_cat(result, prev, p - prev);
6768 str_buf_cat2(result, "\"");
6769
6770 return result;
6771}
6772
6773#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6774
6775/*
6776 * call-seq:
6777 * dump -> string
6778 *
6779 * Returns a printable version of +self+, enclosed in double-quotes,
6780 * with special characters escaped, and with non-printing characters
6781 * replaced by hexadecimal notation:
6782 *
6783 * "hello \n ''".dump # => "\"hello \\n ''\""
6784 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6785 *
6786 * Related: String#undump (inverse of String#dump).
6787 *
6788 */
6789
6790VALUE
6792{
6793 int encidx = rb_enc_get_index(str);
6794 rb_encoding *enc = rb_enc_from_index(encidx);
6795 long len;
6796 const char *p, *pend;
6797 char *q, *qend;
6798 VALUE result;
6799 int u8 = (encidx == rb_utf8_encindex());
6800 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6801
6802 len = 2; /* "" */
6803 if (!rb_enc_asciicompat(enc)) {
6804 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6805 len += strlen(enc->name);
6806 }
6807
6808 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6809 while (p < pend) {
6810 int clen;
6811 unsigned char c = *p++;
6812
6813 switch (c) {
6814 case '"': case '\\':
6815 case '\n': case '\r':
6816 case '\t': case '\f':
6817 case '\013': case '\010': case '\007': case '\033':
6818 clen = 2;
6819 break;
6820
6821 case '#':
6822 clen = IS_EVSTR(p, pend) ? 2 : 1;
6823 break;
6824
6825 default:
6826 if (ISPRINT(c)) {
6827 clen = 1;
6828 }
6829 else {
6830 if (u8 && c > 0x7F) { /* \u notation */
6831 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6832 if (MBCLEN_CHARFOUND_P(n)) {
6833 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6834 if (cc <= 0xFFFF)
6835 clen = 6; /* \uXXXX */
6836 else if (cc <= 0xFFFFF)
6837 clen = 9; /* \u{XXXXX} */
6838 else
6839 clen = 10; /* \u{XXXXXX} */
6840 p += MBCLEN_CHARFOUND_LEN(n)-1;
6841 break;
6842 }
6843 }
6844 clen = 4; /* \xNN */
6845 }
6846 break;
6847 }
6848
6849 if (clen > LONG_MAX - len) {
6850 rb_raise(rb_eRuntimeError, "string size too big");
6851 }
6852 len += clen;
6853 }
6854
6855 result = rb_str_new(0, len);
6856 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6857 q = RSTRING_PTR(result); qend = q + len + 1;
6858
6859 *q++ = '"';
6860 while (p < pend) {
6861 unsigned char c = *p++;
6862
6863 if (c == '"' || c == '\\') {
6864 *q++ = '\\';
6865 *q++ = c;
6866 }
6867 else if (c == '#') {
6868 if (IS_EVSTR(p, pend)) *q++ = '\\';
6869 *q++ = '#';
6870 }
6871 else if (c == '\n') {
6872 *q++ = '\\';
6873 *q++ = 'n';
6874 }
6875 else if (c == '\r') {
6876 *q++ = '\\';
6877 *q++ = 'r';
6878 }
6879 else if (c == '\t') {
6880 *q++ = '\\';
6881 *q++ = 't';
6882 }
6883 else if (c == '\f') {
6884 *q++ = '\\';
6885 *q++ = 'f';
6886 }
6887 else if (c == '\013') {
6888 *q++ = '\\';
6889 *q++ = 'v';
6890 }
6891 else if (c == '\010') {
6892 *q++ = '\\';
6893 *q++ = 'b';
6894 }
6895 else if (c == '\007') {
6896 *q++ = '\\';
6897 *q++ = 'a';
6898 }
6899 else if (c == '\033') {
6900 *q++ = '\\';
6901 *q++ = 'e';
6902 }
6903 else if (ISPRINT(c)) {
6904 *q++ = c;
6905 }
6906 else {
6907 *q++ = '\\';
6908 if (u8) {
6909 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6910 if (MBCLEN_CHARFOUND_P(n)) {
6911 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6912 p += n;
6913 if (cc <= 0xFFFF)
6914 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6915 else
6916 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6917 q += strlen(q);
6918 continue;
6919 }
6920 }
6921 snprintf(q, qend-q, "x%02X", c);
6922 q += 3;
6923 }
6924 }
6925 *q++ = '"';
6926 *q = '\0';
6927 if (!rb_enc_asciicompat(enc)) {
6928 snprintf(q, qend-q, nonascii_suffix, enc->name);
6929 encidx = rb_ascii8bit_encindex();
6930 }
6931 /* result from dump is ASCII */
6932 rb_enc_associate_index(result, encidx);
6934 return result;
6935}
6936
/*
 * Maps the letter that follows a backslash in a simple escape sequence
 * (n, r, t, f, v, b, a, e) to the control character it denotes.
 *
 * Returns -1 for any other input, so that the function never falls off
 * its end without a value.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        break;
    }
    /* not a recognised escape letter */
    return -1;
}
6960
6961static void
6962undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6963{
6964 const char *s = *ss;
6965 unsigned int c;
6966 int codelen;
6967 size_t hexlen;
6968 unsigned char buf[6];
6969 static rb_encoding *enc_utf8 = NULL;
6970
6971 switch (*s) {
6972 case '\\':
6973 case '"':
6974 case '#':
6975 rb_str_cat(undumped, s, 1); /* cat itself */
6976 s++;
6977 break;
6978 case 'n':
6979 case 'r':
6980 case 't':
6981 case 'f':
6982 case 'v':
6983 case 'b':
6984 case 'a':
6985 case 'e':
6986 *buf = unescape_ascii(*s);
6987 rb_str_cat(undumped, (char *)buf, 1);
6988 s++;
6989 break;
6990 case 'u':
6991 if (*binary) {
6992 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6993 }
6994 *utf8 = true;
6995 if (++s >= s_end) {
6996 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6997 }
6998 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
6999 if (*penc != enc_utf8) {
7000 *penc = enc_utf8;
7001 rb_enc_associate(undumped, enc_utf8);
7002 }
7003 if (*s == '{') { /* handle \u{...} form */
7004 s++;
7005 for (;;) {
7006 if (s >= s_end) {
7007 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7008 }
7009 if (*s == '}') {
7010 s++;
7011 break;
7012 }
7013 if (ISSPACE(*s)) {
7014 s++;
7015 continue;
7016 }
7017 c = scan_hex(s, s_end-s, &hexlen);
7018 if (hexlen == 0 || hexlen > 6) {
7019 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7020 }
7021 if (c > 0x10ffff) {
7022 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7023 }
7024 if (0xd800 <= c && c <= 0xdfff) {
7025 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7026 }
7027 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7028 rb_str_cat(undumped, (char *)buf, codelen);
7029 s += hexlen;
7030 }
7031 }
7032 else { /* handle \uXXXX form */
7033 c = scan_hex(s, 4, &hexlen);
7034 if (hexlen != 4) {
7035 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7036 }
7037 if (0xd800 <= c && c <= 0xdfff) {
7038 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7039 }
7040 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7041 rb_str_cat(undumped, (char *)buf, codelen);
7042 s += hexlen;
7043 }
7044 break;
7045 case 'x':
7046 if (*utf8) {
7047 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7048 }
7049 *binary = true;
7050 if (++s >= s_end) {
7051 rb_raise(rb_eRuntimeError, "invalid hex escape");
7052 }
7053 *buf = scan_hex(s, 2, &hexlen);
7054 if (hexlen != 2) {
7055 rb_raise(rb_eRuntimeError, "invalid hex escape");
7056 }
7057 rb_str_cat(undumped, (char *)buf, 1);
7058 s += hexlen;
7059 break;
7060 default:
7061 rb_str_cat(undumped, s-1, 2);
7062 s++;
7063 }
7064
7065 *ss = s;
7066}
7067
7068static VALUE rb_str_is_ascii_only_p(VALUE str);
7069
7070/*
7071 * call-seq:
7072 * undump -> string
7073 *
7074 * Returns an unescaped version of +self+:
7075 *
7076 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7077 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7078 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7079 * s_undumped == s_orig # => true
7080 *
7081 * Related: String#dump (inverse of String#undump).
7082 *
7083 */
7084
7085static VALUE
7086str_undump(VALUE str)
7087{
7088 const char *s = RSTRING_PTR(str);
7089 const char *s_end = RSTRING_END(str);
7090 rb_encoding *enc = rb_enc_get(str);
7091 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7092 bool utf8 = false;
7093 bool binary = false;
7094 int w;
7095
7097 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7098 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7099 }
7100 if (!str_null_check(str, &w)) {
7101 rb_raise(rb_eRuntimeError, "string contains null byte");
7102 }
7103 if (RSTRING_LEN(str) < 2) goto invalid_format;
7104 if (*s != '"') goto invalid_format;
7105
7106 /* strip '"' at the start */
7107 s++;
7108
7109 for (;;) {
7110 if (s >= s_end) {
7111 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7112 }
7113
7114 if (*s == '"') {
7115 /* epilogue */
7116 s++;
7117 if (s == s_end) {
7118 /* ascii compatible dumped string */
7119 break;
7120 }
7121 else {
7122 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7123 static const char dup_suffix[] = ".dup";
7124 const char *encname;
7125 int encidx;
7126 ptrdiff_t size;
7127
7128 /* check separately for strings dumped by older versions */
7129 size = sizeof(dup_suffix) - 1;
7130 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7131
7132 size = sizeof(force_encoding_suffix) - 1;
7133 if (s_end - s <= size) goto invalid_format;
7134 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7135 s += size;
7136
7137 if (utf8) {
7138 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7139 }
7140
7141 encname = s;
7142 s = memchr(s, '"', s_end-s);
7143 size = s - encname;
7144 if (!s) goto invalid_format;
7145 if (s_end - s != 2) goto invalid_format;
7146 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7147
7148 encidx = rb_enc_find_index2(encname, (long)size);
7149 if (encidx < 0) {
7150 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7151 }
7152 rb_enc_associate_index(undumped, encidx);
7153 }
7154 break;
7155 }
7156
7157 if (*s == '\\') {
7158 s++;
7159 if (s >= s_end) {
7160 rb_raise(rb_eRuntimeError, "invalid escape");
7161 }
7162 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7163 }
7164 else {
7165 rb_str_cat(undumped, s++, 1);
7166 }
7167 }
7168
7169 return undumped;
7170invalid_format:
7171 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7172}
7173
7174static void
7175rb_str_check_dummy_enc(rb_encoding *enc)
7176{
7177 if (rb_enc_dummy_p(enc)) {
7178 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7179 rb_enc_name(enc));
7180 }
7181}
7182
7183static rb_encoding *
7184str_true_enc(VALUE str)
7185{
7186 rb_encoding *enc = STR_ENC_GET(str);
7187 rb_str_check_dummy_enc(enc);
7188 return enc;
7189}
7190
7191static OnigCaseFoldType
7192check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7193{
7194 if (argc==0)
7195 return flags;
7196 if (argc>2)
7197 rb_raise(rb_eArgError, "too many options");
7198 if (argv[0]==sym_turkic) {
7199 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7200 if (argc==2) {
7201 if (argv[1]==sym_lithuanian)
7202 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7203 else
7204 rb_raise(rb_eArgError, "invalid second option");
7205 }
7206 }
7207 else if (argv[0]==sym_lithuanian) {
7208 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7209 if (argc==2) {
7210 if (argv[1]==sym_turkic)
7211 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7212 else
7213 rb_raise(rb_eArgError, "invalid second option");
7214 }
7215 }
7216 else if (argc>1)
7217 rb_raise(rb_eArgError, "too many options");
7218 else if (argv[0]==sym_ascii)
7219 flags |= ONIGENC_CASE_ASCII_ONLY;
7220 else if (argv[0]==sym_fold) {
7221 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7222 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7223 else
7224 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7225 }
7226 else
7227 rb_raise(rb_eArgError, "invalid option");
7228 return flags;
7229}
7230
7231static inline bool
7232case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7233{
7234 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7235 return true;
7236 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7237}
7238
7239/* 16 should be long enough to absorb any kind of single character length increase */
7240#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7241#ifndef CASEMAP_DEBUG
7242# define CASEMAP_DEBUG 0
7243#endif
7244
7245struct mapping_buffer;
7246typedef struct mapping_buffer {
7247 size_t capa;
7248 size_t used;
7249 struct mapping_buffer *next;
7250 OnigUChar space[FLEX_ARY_LEN];
7252
7253static void
7254mapping_buffer_free(void *p)
7255{
7256 mapping_buffer *previous_buffer;
7257 mapping_buffer *current_buffer = p;
7258 while (current_buffer) {
7259 previous_buffer = current_buffer;
7260 current_buffer = current_buffer->next;
7261 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7262 }
7263}
7264
7265static const rb_data_type_t mapping_buffer_type = {
7266 "mapping_buffer",
7267 {0, mapping_buffer_free,}
7268};
7269
7270static VALUE
7271rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7272{
7273 VALUE target;
7274
7275 const OnigUChar *source_current, *source_end;
7276 int target_length = 0;
7277 VALUE buffer_anchor;
7278 mapping_buffer *current_buffer = 0;
7279 mapping_buffer **pre_buffer;
7280 size_t buffer_count = 0;
7281 int buffer_length_or_invalid;
7282
7283 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7284
7285 source_current = (OnigUChar*)RSTRING_PTR(source);
7286 source_end = (OnigUChar*)RSTRING_END(source);
7287
7288 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7289 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7290 while (source_current < source_end) {
7291 /* increase multiplier using buffer count to converge quickly */
7292 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7293 if (CASEMAP_DEBUG) {
7294 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7295 }
7296 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7297 *pre_buffer = current_buffer;
7298 pre_buffer = &current_buffer->next;
7299 current_buffer->next = NULL;
7300 current_buffer->capa = capa;
7301 buffer_length_or_invalid = enc->case_map(flags,
7302 &source_current, source_end,
7303 current_buffer->space,
7304 current_buffer->space+current_buffer->capa,
7305 enc);
7306 if (buffer_length_or_invalid < 0) {
7307 current_buffer = DATA_PTR(buffer_anchor);
7308 DATA_PTR(buffer_anchor) = 0;
7309 mapping_buffer_free(current_buffer);
7310 rb_raise(rb_eArgError, "input string invalid");
7311 }
7312 target_length += current_buffer->used = buffer_length_or_invalid;
7313 }
7314 if (CASEMAP_DEBUG) {
7315 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7316 }
7317
7318 if (buffer_count==1) {
7319 target = rb_str_new((const char*)current_buffer->space, target_length);
7320 }
7321 else {
7322 char *target_current;
7323
7324 target = rb_str_new(0, target_length);
7325 target_current = RSTRING_PTR(target);
7326 current_buffer = DATA_PTR(buffer_anchor);
7327 while (current_buffer) {
7328 memcpy(target_current, current_buffer->space, current_buffer->used);
7329 target_current += current_buffer->used;
7330 current_buffer = current_buffer->next;
7331 }
7332 }
7333 current_buffer = DATA_PTR(buffer_anchor);
7334 DATA_PTR(buffer_anchor) = 0;
7335 mapping_buffer_free(current_buffer);
7336
7337 RB_GC_GUARD(buffer_anchor);
7338
7339 /* TODO: check about string terminator character */
7340 str_enc_copy(target, source);
7341 /*ENC_CODERANGE_SET(mapped, cr);*/
7342
7343 return target;
7344}
7345
7346static VALUE
7347rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7348{
7349 const OnigUChar *source_current, *source_end;
7350 OnigUChar *target_current, *target_end;
7351 long old_length = RSTRING_LEN(source);
7352 int length_or_invalid;
7353
7354 if (old_length == 0) return Qnil;
7355
7356 source_current = (OnigUChar*)RSTRING_PTR(source);
7357 source_end = (OnigUChar*)RSTRING_END(source);
7358 if (source == target) {
7359 target_current = (OnigUChar*)source_current;
7360 target_end = (OnigUChar*)source_end;
7361 }
7362 else {
7363 target_current = (OnigUChar*)RSTRING_PTR(target);
7364 target_end = (OnigUChar*)RSTRING_END(target);
7365 }
7366
7367 length_or_invalid = onigenc_ascii_only_case_map(flags,
7368 &source_current, source_end,
7369 target_current, target_end, enc);
7370 if (length_or_invalid < 0)
7371 rb_raise(rb_eArgError, "input string invalid");
7372 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7373 fprintf(stderr, "problem with rb_str_ascii_casemap"
7374 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7375 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7376 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7377 }
7378
7379 str_enc_copy(target, source);
7380
7381 return target;
7382}
7383
7384static bool
7385upcase_single(VALUE str)
7386{
7387 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7388 bool modified = false;
7389
7390 while (s < send) {
7391 unsigned int c = *(unsigned char*)s;
7392
7393 if ('a' <= c && c <= 'z') {
7394 *s = 'A' + (c - 'a');
7395 modified = true;
7396 }
7397 s++;
7398 }
7399 return modified;
7400}
7401
7402/*
7403 * call-seq:
7404 * upcase!(*options) -> self or nil
7405 *
7406 * Upcases the characters in +self+;
7407 * returns +self+ if any changes were made, +nil+ otherwise:
7408 *
7409 * s = 'Hello World!' # => "Hello World!"
7410 * s.upcase! # => "HELLO WORLD!"
7411 * s # => "HELLO WORLD!"
7412 * s.upcase! # => nil
7413 *
7414 * The casing may be affected by the given +options+;
7415 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7416 *
7417 * Related: String#upcase, String#downcase, String#downcase!.
7418 *
7419 */
7420
7421static VALUE
7422rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7423{
7424 rb_encoding *enc;
7425 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7426
7427 flags = check_case_options(argc, argv, flags);
7428 str_modify_keep_cr(str);
7429 enc = str_true_enc(str);
7430 if (case_option_single_p(flags, enc, str)) {
7431 if (upcase_single(str))
7432 flags |= ONIGENC_CASE_MODIFIED;
7433 }
7434 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7435 rb_str_ascii_casemap(str, str, &flags, enc);
7436 else
7437 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7438
7439 if (ONIGENC_CASE_MODIFIED&flags) return str;
7440 return Qnil;
7441}
7442
7443
7444/*
7445 * call-seq:
7446 * upcase(*options) -> string
7447 *
7448 * Returns a string containing the upcased characters in +self+:
7449 *
7450 * s = 'Hello World!' # => "Hello World!"
7451 * s.upcase # => "HELLO WORLD!"
7452 *
7453 * The casing may be affected by the given +options+;
7454 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7455 *
7456 * Related: String#upcase!, String#downcase, String#downcase!.
7457 *
7458 */
7459
7460static VALUE
7461rb_str_upcase(int argc, VALUE *argv, VALUE str)
7462{
7463 rb_encoding *enc;
7464 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7465 VALUE ret;
7466
7467 flags = check_case_options(argc, argv, flags);
7468 enc = str_true_enc(str);
7469 if (case_option_single_p(flags, enc, str)) {
7470 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7471 str_enc_copy(ret, str);
7472 upcase_single(ret);
7473 }
7474 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7475 ret = rb_str_new(0, RSTRING_LEN(str));
7476 rb_str_ascii_casemap(str, ret, &flags, enc);
7477 }
7478 else {
7479 ret = rb_str_casemap(str, &flags, enc);
7480 }
7481
7482 return ret;
7483}
7484
7485static bool
7486downcase_single(VALUE str)
7487{
7488 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7489 bool modified = false;
7490
7491 while (s < send) {
7492 unsigned int c = *(unsigned char*)s;
7493
7494 if ('A' <= c && c <= 'Z') {
7495 *s = 'a' + (c - 'A');
7496 modified = true;
7497 }
7498 s++;
7499 }
7500
7501 return modified;
7502}
7503
7504/*
7505 * call-seq:
7506 * downcase!(*options) -> self or nil
7507 *
7508 * Downcases the characters in +self+;
7509 * returns +self+ if any changes were made, +nil+ otherwise:
7510 *
7511 * s = 'Hello World!' # => "Hello World!"
7512 * s.downcase! # => "hello world!"
7513 * s # => "hello world!"
7514 * s.downcase! # => nil
7515 *
7516 * The casing may be affected by the given +options+;
7517 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7518 *
7519 * Related: String#downcase, String#upcase, String#upcase!.
7520 *
7521 */
7522
7523static VALUE
7524rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7525{
7526 rb_encoding *enc;
7527 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7528
7529 flags = check_case_options(argc, argv, flags);
7530 str_modify_keep_cr(str);
7531 enc = str_true_enc(str);
7532 if (case_option_single_p(flags, enc, str)) {
7533 if (downcase_single(str))
7534 flags |= ONIGENC_CASE_MODIFIED;
7535 }
7536 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7537 rb_str_ascii_casemap(str, str, &flags, enc);
7538 else
7539 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7540
7541 if (ONIGENC_CASE_MODIFIED&flags) return str;
7542 return Qnil;
7543}
7544
7545
7546/*
7547 * call-seq:
7548 * downcase(*options) -> string
7549 *
7550 * Returns a string containing the downcased characters in +self+:
7551 *
7552 * s = 'Hello World!' # => "Hello World!"
7553 * s.downcase # => "hello world!"
7554 *
7555 * The casing may be affected by the given +options+;
7556 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7557 *
7558 * Related: String#downcase!, String#upcase, String#upcase!.
7559 *
7560 */
7561
7562static VALUE
7563rb_str_downcase(int argc, VALUE *argv, VALUE str)
7564{
7565 rb_encoding *enc;
7566 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7567 VALUE ret;
7568
7569 flags = check_case_options(argc, argv, flags);
7570 enc = str_true_enc(str);
7571 if (case_option_single_p(flags, enc, str)) {
7572 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7573 str_enc_copy(ret, str);
7574 downcase_single(ret);
7575 }
7576 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7577 ret = rb_str_new(0, RSTRING_LEN(str));
7578 rb_str_ascii_casemap(str, ret, &flags, enc);
7579 }
7580 else {
7581 ret = rb_str_casemap(str, &flags, enc);
7582 }
7583
7584 return ret;
7585}
7586
7587
7588/*
7589 * call-seq:
7590 * capitalize!(*options) -> self or nil
7591 *
7592 * Upcases the first character in +self+;
7593 * downcases the remaining characters;
7594 * returns +self+ if any changes were made, +nil+ otherwise:
7595 *
7596 * s = 'hello World!' # => "hello World!"
7597 * s.capitalize! # => "Hello world!"
7598 * s # => "Hello world!"
7599 * s.capitalize! # => nil
7600 *
7601 * The casing may be affected by the given +options+;
7602 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7603 *
7604 * Related: String#capitalize.
7605 *
7606 */
7607
7608static VALUE
7609rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7610{
7611 rb_encoding *enc;
7612 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7613
7614 flags = check_case_options(argc, argv, flags);
7615 str_modify_keep_cr(str);
7616 enc = str_true_enc(str);
7617 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7618 if (flags&ONIGENC_CASE_ASCII_ONLY)
7619 rb_str_ascii_casemap(str, str, &flags, enc);
7620 else
7621 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7622
7623 if (ONIGENC_CASE_MODIFIED&flags) return str;
7624 return Qnil;
7625}
7626
7627
7628/*
7629 * call-seq:
7630 * capitalize(*options) -> string
7631 *
7632 * Returns a string containing the characters in +self+;
7633 * the first character is upcased;
7634 * the remaining characters are downcased:
7635 *
7636 * s = 'hello World!' # => "hello World!"
7637 * s.capitalize # => "Hello world!"
7638 *
7639 * The casing may be affected by the given +options+;
7640 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7641 *
7642 * Related: String#capitalize!.
7643 *
7644 */
7645
7646static VALUE
7647rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7648{
7649 rb_encoding *enc;
7650 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7651 VALUE ret;
7652
7653 flags = check_case_options(argc, argv, flags);
7654 enc = str_true_enc(str);
7655 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7656 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7657 ret = rb_str_new(0, RSTRING_LEN(str));
7658 rb_str_ascii_casemap(str, ret, &flags, enc);
7659 }
7660 else {
7661 ret = rb_str_casemap(str, &flags, enc);
7662 }
7663 return ret;
7664}
7665
7666
7667/*
7668 * call-seq:
7669 * swapcase!(*options) -> self or nil
7670 *
7671 * Upcases each lowercase character in +self+;
7672 * downcases each uppercase character;
7673 * returns +self+ if any changes were made, +nil+ otherwise:
7674 *
7675 * s = 'Hello World!' # => "Hello World!"
7676 * s.swapcase! # => "hELLO wORLD!"
7677 * s # => "hELLO wORLD!"
7678 * ''.swapcase! # => nil
7679 *
7680 * The casing may be affected by the given +options+;
7681 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7682 *
7683 * Related: String#swapcase.
7684 *
7685 */
7686
7687static VALUE
7688rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7689{
7690 rb_encoding *enc;
7691 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7692
7693 flags = check_case_options(argc, argv, flags);
7694 str_modify_keep_cr(str);
7695 enc = str_true_enc(str);
7696 if (flags&ONIGENC_CASE_ASCII_ONLY)
7697 rb_str_ascii_casemap(str, str, &flags, enc);
7698 else
7699 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7700
7701 if (ONIGENC_CASE_MODIFIED&flags) return str;
7702 return Qnil;
7703}
7704
7705
7706/*
7707 * call-seq:
7708 * swapcase(*options) -> string
7709 *
7710 * Returns a string containing the characters in +self+, with cases reversed;
7711 * each uppercase character is downcased;
7712 * each lowercase character is upcased:
7713 *
7714 * s = 'Hello World!' # => "Hello World!"
7715 * s.swapcase # => "hELLO wORLD!"
7716 *
7717 * The casing may be affected by the given +options+;
7718 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7719 *
7720 * Related: String#swapcase!.
7721 *
7722 */
7723
7724static VALUE
7725rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7726{
7727 rb_encoding *enc;
7728 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7729 VALUE ret;
7730
7731 flags = check_case_options(argc, argv, flags);
7732 enc = str_true_enc(str);
7733 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7734 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7735 ret = rb_str_new(0, RSTRING_LEN(str));
7736 rb_str_ascii_casemap(str, ret, &flags, enc);
7737 }
7738 else {
7739 ret = rb_str_casemap(str, &flags, enc);
7740 }
7741 return ret;
7742}
7743
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification; advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while trnext() is stepping through a range */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* last code point of the range being stepped through */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
7751
7752static unsigned int
7753trnext(struct tr *t, rb_encoding *enc)
7754{
7755 int n;
7756
7757 for (;;) {
7758 nextpart:
7759 if (!t->gen) {
7760 if (t->p == t->pend) return -1;
7761 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7762 t->p += n;
7763 }
7764 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7765 t->p += n;
7766 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7767 t->p += n;
7768 if (t->p < t->pend) {
7769 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7770 t->p += n;
7771 if (t->now > c) {
7772 if (t->now < 0x80 && c < 0x80) {
7774 "invalid range \"%c-%c\" in string transliteration",
7775 t->now, c);
7776 }
7777 else {
7778 rb_raise(rb_eArgError, "invalid range in string transliteration");
7779 }
7780 continue; /* not reached */
7781 }
7782 t->gen = 1;
7783 t->max = c;
7784 }
7785 }
7786 return t->now;
7787 }
7788 else {
7789 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7790 if (t->now == t->max) {
7791 t->gen = 0;
7792 goto nextpart;
7793 }
7794 }
7795 if (t->now < t->max) {
7796 return t->now;
7797 }
7798 else {
7799 t->gen = 0;
7800 return t->max;
7801 }
7802 }
7803 }
7804}
7805
7806static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7807
7808static VALUE
7809tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7810{
7811 const unsigned int errc = -1;
7812 unsigned int trans[256];
7813 rb_encoding *enc, *e1, *e2;
7814 struct tr trsrc, trrepl;
7815 int cflag = 0;
7816 unsigned int c, c0, last = 0;
7817 int modify = 0, i, l;
7818 unsigned char *s, *send;
7819 VALUE hash = 0;
7820 int singlebyte = single_byte_optimizable(str);
7821 int termlen;
7822 int cr;
7823
7824#define CHECK_IF_ASCII(c) \
7825 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7826 (cr = ENC_CODERANGE_VALID) : 0)
7827
7828 StringValue(src);
7829 StringValue(repl);
7830 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7831 if (RSTRING_LEN(repl) == 0) {
7832 return rb_str_delete_bang(1, &src, str);
7833 }
7834
7835 cr = ENC_CODERANGE(str);
7836 e1 = rb_enc_check(str, src);
7837 e2 = rb_enc_check(str, repl);
7838 if (e1 == e2) {
7839 enc = e1;
7840 }
7841 else {
7842 enc = rb_enc_check(src, repl);
7843 }
7844 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7845 if (RSTRING_LEN(src) > 1 &&
7846 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7847 trsrc.p + l < trsrc.pend) {
7848 cflag = 1;
7849 trsrc.p += l;
7850 }
7851 trrepl.p = RSTRING_PTR(repl);
7852 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7853 trsrc.gen = trrepl.gen = 0;
7854 trsrc.now = trrepl.now = 0;
7855 trsrc.max = trrepl.max = 0;
7856
7857 if (cflag) {
7858 for (i=0; i<256; i++) {
7859 trans[i] = 1;
7860 }
7861 while ((c = trnext(&trsrc, enc)) != errc) {
7862 if (c < 256) {
7863 trans[c] = errc;
7864 }
7865 else {
7866 if (!hash) hash = rb_hash_new();
7867 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7868 }
7869 }
7870 while ((c = trnext(&trrepl, enc)) != errc)
7871 /* retrieve last replacer */;
7872 last = trrepl.now;
7873 for (i=0; i<256; i++) {
7874 if (trans[i] != errc) {
7875 trans[i] = last;
7876 }
7877 }
7878 }
7879 else {
7880 unsigned int r;
7881
7882 for (i=0; i<256; i++) {
7883 trans[i] = errc;
7884 }
7885 while ((c = trnext(&trsrc, enc)) != errc) {
7886 r = trnext(&trrepl, enc);
7887 if (r == errc) r = trrepl.now;
7888 if (c < 256) {
7889 trans[c] = r;
7890 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7891 }
7892 else {
7893 if (!hash) hash = rb_hash_new();
7894 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7895 }
7896 }
7897 }
7898
7899 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7900 cr = ENC_CODERANGE_7BIT;
7901 str_modify_keep_cr(str);
7902 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7903 termlen = rb_enc_mbminlen(enc);
7904 if (sflag) {
7905 int clen, tlen;
7906 long offset, max = RSTRING_LEN(str);
7907 unsigned int save = -1;
7908 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7909
7910 while (s < send) {
7911 int may_modify = 0;
7912
7913 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7914 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7915
7916 s += clen;
7917 if (c < 256) {
7918 c = trans[c];
7919 }
7920 else if (hash) {
7921 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7922 if (NIL_P(tmp)) {
7923 if (cflag) c = last;
7924 else c = errc;
7925 }
7926 else if (cflag) c = errc;
7927 else c = NUM2INT(tmp);
7928 }
7929 else {
7930 c = errc;
7931 }
7932 if (c != (unsigned int)-1) {
7933 if (save == c) {
7934 CHECK_IF_ASCII(c);
7935 continue;
7936 }
7937 save = c;
7938 tlen = rb_enc_codelen(c, enc);
7939 modify = 1;
7940 }
7941 else {
7942 save = -1;
7943 c = c0;
7944 if (enc != e1) may_modify = 1;
7945 }
7946 if ((offset = t - buf) + tlen > max) {
7947 size_t MAYBE_UNUSED(old) = max + termlen;
7948 max = offset + tlen + (send - s);
7949 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7950 t = buf + offset;
7951 }
7952 rb_enc_mbcput(c, t, enc);
7953 if (may_modify && memcmp(s, t, tlen) != 0) {
7954 modify = 1;
7955 }
7956 CHECK_IF_ASCII(c);
7957 t += tlen;
7958 }
7959 if (!STR_EMBED_P(str)) {
7960 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7961 }
7962 TERM_FILL((char *)t, termlen);
7963 RSTRING(str)->as.heap.ptr = (char *)buf;
7964 RSTRING(str)->as.heap.len = t - buf;
7965 STR_SET_NOEMBED(str);
7966 RSTRING(str)->as.heap.aux.capa = max;
7967 }
7968 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
7969 while (s < send) {
7970 c = (unsigned char)*s;
7971 if (trans[c] != errc) {
7972 if (!cflag) {
7973 c = trans[c];
7974 *s = c;
7975 modify = 1;
7976 }
7977 else {
7978 *s = last;
7979 modify = 1;
7980 }
7981 }
7982 CHECK_IF_ASCII(c);
7983 s++;
7984 }
7985 }
7986 else {
7987 int clen, tlen;
7988 long offset, max = (long)((send - s) * 1.2);
7989 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7990
7991 while (s < send) {
7992 int may_modify = 0;
7993 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7994 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7995
7996 if (c < 256) {
7997 c = trans[c];
7998 }
7999 else if (hash) {
8000 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8001 if (NIL_P(tmp)) {
8002 if (cflag) c = last;
8003 else c = errc;
8004 }
8005 else if (cflag) c = errc;
8006 else c = NUM2INT(tmp);
8007 }
8008 else {
8009 c = cflag ? last : errc;
8010 }
8011 if (c != errc) {
8012 tlen = rb_enc_codelen(c, enc);
8013 modify = 1;
8014 }
8015 else {
8016 c = c0;
8017 if (enc != e1) may_modify = 1;
8018 }
8019 if ((offset = t - buf) + tlen > max) {
8020 size_t MAYBE_UNUSED(old) = max + termlen;
8021 max = offset + tlen + (long)((send - s) * 1.2);
8022 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8023 t = buf + offset;
8024 }
8025 if (s != t) {
8026 rb_enc_mbcput(c, t, enc);
8027 if (may_modify && memcmp(s, t, tlen) != 0) {
8028 modify = 1;
8029 }
8030 }
8031 CHECK_IF_ASCII(c);
8032 s += clen;
8033 t += tlen;
8034 }
8035 if (!STR_EMBED_P(str)) {
8036 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8037 }
8038 TERM_FILL((char *)t, termlen);
8039 RSTRING(str)->as.heap.ptr = (char *)buf;
8040 RSTRING(str)->as.heap.len = t - buf;
8041 STR_SET_NOEMBED(str);
8042 RSTRING(str)->as.heap.aux.capa = max;
8043 }
8044
8045 if (modify) {
8046 if (cr != ENC_CODERANGE_BROKEN)
8047 ENC_CODERANGE_SET(str, cr);
8048 rb_enc_associate(str, enc);
8049 return str;
8050 }
8051 return Qnil;
8052}
8053
8054
8055/*
8056 * call-seq:
8057 * tr!(selector, replacements) -> self or nil
8058 *
8059 * Like String#tr, but modifies +self+ in place.
8060 * Returns +self+ if any changes were made, +nil+ otherwise.
8061 *
8062 */
8063
8064static VALUE
8065rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8066{
8067 return tr_trans(str, src, repl, 0);
8068}
8069
8070
8071/*
8072 * call-seq:
8073 * tr(selector, replacements) -> new_string
8074 *
8075 * Returns a copy of +self+ with each character specified by string +selector+
8076 * translated to the corresponding character in string +replacements+.
8077 * The correspondence is _positional_:
8078 *
8079 * - Each occurrence of the first character specified by +selector+
8080 * is translated to the first character in +replacements+.
8081 * - Each occurrence of the second character specified by +selector+
8082 * is translated to the second character in +replacements+.
8083 * - And so on.
8084 *
8085 * Example:
8086 *
8087 * 'hello'.tr('el', 'ip') #=> "hippo"
8088 *
8089 * If +replacements+ is shorter than +selector+,
8090 * it is implicitly padded with its own last character:
8091 *
8092 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8093 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8094 *
8095 * Arguments +selector+ and +replacements+ must be valid character selectors
8096 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8097 * and may use any of its valid forms, including negation, ranges, and escaping:
8098 *
8099 * # Negation.
8100 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8101 * # Ranges.
8102 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8103 * # Escapes.
8104 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8105 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8106 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8107 *
8108 */
8109
8110static VALUE
8111rb_str_tr(VALUE str, VALUE src, VALUE repl)
8112{
8113 str = str_duplicate(rb_cString, str);
8114 tr_trans(str, src, repl, 0);
8115 return str;
8116}
8117
8118#define TR_TABLE_MAX (UCHAR_MAX+1)
8119#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8120static void
8121tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8122 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8123{
8124 const unsigned int errc = -1;
8125 char buf[TR_TABLE_MAX];
8126 struct tr tr;
8127 unsigned int c;
8128 VALUE table = 0, ptable = 0;
8129 int i, l, cflag = 0;
8130
8131 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8132 tr.gen = tr.now = tr.max = 0;
8133
8134 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8135 cflag = 1;
8136 tr.p += l;
8137 }
8138 if (first) {
8139 for (i=0; i<TR_TABLE_MAX; i++) {
8140 stable[i] = 1;
8141 }
8142 stable[TR_TABLE_MAX] = cflag;
8143 }
8144 else if (stable[TR_TABLE_MAX] && !cflag) {
8145 stable[TR_TABLE_MAX] = 0;
8146 }
8147 for (i=0; i<TR_TABLE_MAX; i++) {
8148 buf[i] = cflag;
8149 }
8150
8151 while ((c = trnext(&tr, enc)) != errc) {
8152 if (c < TR_TABLE_MAX) {
8153 buf[(unsigned char)c] = !cflag;
8154 }
8155 else {
8156 VALUE key = UINT2NUM(c);
8157
8158 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8159 if (cflag) {
8160 ptable = *ctablep;
8161 table = ptable ? ptable : rb_hash_new();
8162 *ctablep = table;
8163 }
8164 else {
8165 table = rb_hash_new();
8166 ptable = *tablep;
8167 *tablep = table;
8168 }
8169 }
8170 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8171 rb_hash_aset(table, key, Qtrue);
8172 }
8173 }
8174 }
8175 for (i=0; i<TR_TABLE_MAX; i++) {
8176 stable[i] = stable[i] && buf[i];
8177 }
8178 if (!table && !cflag) {
8179 *tablep = 0;
8180 }
8181}
8182
8183
8184static int
8185tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8186{
8187 if (c < TR_TABLE_MAX) {
8188 return table[c] != 0;
8189 }
8190 else {
8191 VALUE v = UINT2NUM(c);
8192
8193 if (del) {
8194 if (!NIL_P(rb_hash_lookup(del, v)) &&
8195 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8196 return TRUE;
8197 }
8198 }
8199 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8200 return FALSE;
8201 }
8202 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8203 }
8204}
8205
8206/*
8207 * call-seq:
8208 * delete!(*selectors) -> self or nil
8209 *
8210 * Like String#delete, but modifies +self+ in place.
8211 * Returns +self+ if any changes were made, +nil+ otherwise.
8212 *
8213 */
8214
8215static VALUE
8216rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8217{
8218 char squeez[TR_TABLE_SIZE];
8219 rb_encoding *enc = 0;
8220 char *s, *send, *t;
8221 VALUE del = 0, nodel = 0;
8222 int modify = 0;
8223 int i, ascompat, cr;
8224
8225 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8226 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
8227 for (i=0; i<argc; i++) {
8228 VALUE s = argv[i];
8229
8230 StringValue(s);
8231 enc = rb_enc_check(str, s);
8232 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8233 }
8234
8235 str_modify_keep_cr(str);
8236 ascompat = rb_enc_asciicompat(enc);
8237 s = t = RSTRING_PTR(str);
8238 send = RSTRING_END(str);
8239 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8240 while (s < send) {
8241 unsigned int c;
8242 int clen;
8243
8244 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8245 if (squeez[c]) {
8246 modify = 1;
8247 }
8248 else {
8249 if (t != s) *t = c;
8250 t++;
8251 }
8252 s++;
8253 }
8254 else {
8255 c = rb_enc_codepoint_len(s, send, &clen, enc);
8256
8257 if (tr_find(c, squeez, del, nodel)) {
8258 modify = 1;
8259 }
8260 else {
8261 if (t != s) rb_enc_mbcput(c, t, enc);
8262 t += clen;
8264 }
8265 s += clen;
8266 }
8267 }
8268 TERM_FILL(t, TERM_LEN(str));
8269 STR_SET_LEN(str, t - RSTRING_PTR(str));
8270 ENC_CODERANGE_SET(str, cr);
8271
8272 if (modify) return str;
8273 return Qnil;
8274}
8275
8276
8277/*
8278 * call-seq:
8279 * delete(*selectors) -> new_string
8280 *
8281 * Returns a copy of +self+ with characters specified by +selectors+ removed
8282 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8283 *
8284 * "hello".delete "l","lo" #=> "heo"
8285 * "hello".delete "lo" #=> "he"
8286 * "hello".delete "aeiou", "^e" #=> "hell"
8287 * "hello".delete "ej-m" #=> "ho"
8288 *
8289 */
8290
8291static VALUE
8292rb_str_delete(int argc, VALUE *argv, VALUE str)
8293{
8294 str = str_duplicate(rb_cString, str);
8295 rb_str_delete_bang(argc, argv, str);
8296 return str;
8297}
8298
8299
8300/*
8301 * call-seq:
8302 * squeeze!(*selectors) -> self or nil
8303 *
8304 * Like String#squeeze, but modifies +self+ in place.
8305 * Returns +self+ if any changes were made, +nil+ otherwise.
8306 */
8307
8308static VALUE
8309rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8310{
8311 char squeez[TR_TABLE_SIZE];
8312 rb_encoding *enc = 0;
8313 VALUE del = 0, nodel = 0;
8314 unsigned char *s, *send, *t;
8315 int i, modify = 0;
8316 int ascompat, singlebyte = single_byte_optimizable(str);
8317 unsigned int save;
8318
8319 if (argc == 0) {
8320 enc = STR_ENC_GET(str);
8321 }
8322 else {
8323 for (i=0; i<argc; i++) {
8324 VALUE s = argv[i];
8325
8326 StringValue(s);
8327 enc = rb_enc_check(str, s);
8328 if (singlebyte && !single_byte_optimizable(s))
8329 singlebyte = 0;
8330 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8331 }
8332 }
8333
8334 str_modify_keep_cr(str);
8335 s = t = (unsigned char *)RSTRING_PTR(str);
8336 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8337 send = (unsigned char *)RSTRING_END(str);
8338 save = -1;
8339 ascompat = rb_enc_asciicompat(enc);
8340
8341 if (singlebyte) {
8342 while (s < send) {
8343 unsigned int c = *s++;
8344 if (c != save || (argc > 0 && !squeez[c])) {
8345 *t++ = save = c;
8346 }
8347 }
8348 }
8349 else {
8350 while (s < send) {
8351 unsigned int c;
8352 int clen;
8353
8354 if (ascompat && (c = *s) < 0x80) {
8355 if (c != save || (argc > 0 && !squeez[c])) {
8356 *t++ = save = c;
8357 }
8358 s++;
8359 }
8360 else {
8361 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8362
8363 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8364 if (t != s) rb_enc_mbcput(c, t, enc);
8365 save = c;
8366 t += clen;
8367 }
8368 s += clen;
8369 }
8370 }
8371 }
8372
8373 TERM_FILL((char *)t, TERM_LEN(str));
8374 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8375 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8376 modify = 1;
8377 }
8378
8379 if (modify) return str;
8380 return Qnil;
8381}
8382
8383
8384/*
8385 * call-seq:
8386 * squeeze(*selectors) -> new_string
8387 *
8388 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8389 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8390 *
8391 * "Squeezed" means that each multiple-character run of a selected character
8392 * is squeezed down to a single character;
8393 * with no arguments given, squeezes all characters:
8394 *
8395 * "yellow moon".squeeze #=> "yelow mon"
8396 * " now is the".squeeze(" ") #=> " now is the"
8397 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8398 *
8399 */
8400
8401static VALUE
8402rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8403{
8404 str = str_duplicate(rb_cString, str);
8405 rb_str_squeeze_bang(argc, argv, str);
8406 return str;
8407}
8408
8409
8410/*
8411 * call-seq:
8412 * tr_s!(selector, replacements) -> self or nil
8413 *
8414 * Like String#tr_s, but modifies +self+ in place.
8415 * Returns +self+ if any changes were made, +nil+ otherwise.
8416 *
8417 * Related: String#squeeze!.
8418 */
8419
8420static VALUE
8421rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8422{
8423 return tr_trans(str, src, repl, 1);
8424}
8425
8426
8427/*
8428 * call-seq:
8429 * tr_s(selector, replacements) -> string
8430 *
8431 * Like String#tr, but also squeezes the modified portions of the translated string;
8432 * returns a new string (translated and squeezed).
8433 *
8434 * 'hello'.tr_s('l', 'r') #=> "hero"
8435 * 'hello'.tr_s('el', '-') #=> "h-o"
8436 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8437 *
8438 * Related: String#squeeze.
8439 *
8440 */
8441
8442static VALUE
8443rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8444{
8445 str = str_duplicate(rb_cString, str);
8446 tr_trans(str, src, repl, 1);
8447 return str;
8448}
8449
8450
8451/*
8452 * call-seq:
8453 * count(*selectors) -> integer
8454 *
8455 * Returns the total number of characters in +self+
8456 * that are specified by the given +selectors+
8457 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8458 *
8459 * a = "hello world"
8460 * a.count "lo" #=> 5
8461 * a.count "lo", "o" #=> 2
8462 * a.count "hello", "^l" #=> 4
8463 * a.count "ej-m" #=> 4
8464 *
8465 * "hello^world".count "\\^aeiou" #=> 4
8466 * "hello-world".count "a\\-eo" #=> 4
8467 *
8468 * c = "hello world\\r\\n"
8469 * c.count "\\" #=> 2
8470 * c.count "\\A" #=> 0
8471 * c.count "X-\\w" #=> 3
8472 */
8473
8474static VALUE
8475rb_str_count(int argc, VALUE *argv, VALUE str)
8476{
8477 char table[TR_TABLE_SIZE];
8478 rb_encoding *enc = 0;
8479 VALUE del = 0, nodel = 0, tstr;
8480 char *s, *send;
8481 int i;
8482 int ascompat;
8483 size_t n = 0;
8484
8485 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
8486
8487 tstr = argv[0];
8488 StringValue(tstr);
8489 enc = rb_enc_check(str, tstr);
8490 if (argc == 1) {
8491 const char *ptstr;
8492 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8493 (ptstr = RSTRING_PTR(tstr),
8494 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8495 !is_broken_string(str)) {
8496 int clen;
8497 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8498
8499 s = RSTRING_PTR(str);
8500 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8501 send = RSTRING_END(str);
8502 while (s < send) {
8503 if (*(unsigned char*)s++ == c) n++;
8504 }
8505 return SIZET2NUM(n);
8506 }
8507 }
8508
8509 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8510 for (i=1; i<argc; i++) {
8511 tstr = argv[i];
8512 StringValue(tstr);
8513 enc = rb_enc_check(str, tstr);
8514 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8515 }
8516
8517 s = RSTRING_PTR(str);
8518 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8519 send = RSTRING_END(str);
8520 ascompat = rb_enc_asciicompat(enc);
8521 while (s < send) {
8522 unsigned int c;
8523
8524 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8525 if (table[c]) {
8526 n++;
8527 }
8528 s++;
8529 }
8530 else {
8531 int clen;
8532 c = rb_enc_codepoint_len(s, send, &clen, enc);
8533 if (tr_find(c, table, del, nodel)) {
8534 n++;
8535 }
8536 s += clen;
8537 }
8538 }
8539
8540 return SIZET2NUM(n);
8541}
8542
8543static VALUE
8544rb_fs_check(VALUE val)
8545{
8546 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8547 val = rb_check_string_type(val);
8548 if (NIL_P(val)) return 0;
8549 }
8550 return val;
8551}
8552
/*
 * Byte-indexed lookup table behind ascii_isspace(): non-zero only for the
 * bytes 0x09..0x0d (tab, line feed, vertical tab, form feed, carriage
 * return) and 0x20 (space); every other entry is zero-initialized.
 */
static const char isspacetable[256] = {
    [0x09] = 1,
    [0x0a] = 1,
    [0x0b] = 1,
    [0x0c] = 1,
    [0x0d] = 1,
    [0x20] = 1,
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8573
8574static long
8575split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8576{
8577 if (empty_count >= 0 && len == 0) {
8578 return empty_count + 1;
8579 }
8580 if (empty_count > 0) {
8581 /* make different substrings */
8582 if (result) {
8583 do {
8584 rb_ary_push(result, str_new_empty_String(str));
8585 } while (--empty_count > 0);
8586 }
8587 else {
8588 do {
8589 rb_yield(str_new_empty_String(str));
8590 } while (--empty_count > 0);
8591 }
8592 }
8593 str = rb_str_subseq(str, beg, len);
8594 if (result) {
8595 rb_ary_push(result, str);
8596 }
8597 else {
8598 rb_yield(str);
8599 }
8600 return empty_count;
8601}
8602
/* Strategy used by String#split for a given separator. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on whitespace (separator is a single space) */
    SPLIT_TYPE_STRING,  /* separator is a literal string */
    SPLIT_TYPE_REGEXP,  /* separator is a regular expression */
    SPLIT_TYPE_CHARS    /* empty separator: split into characters */
} split_type_t;
8606
8607static split_type_t
8608literal_split_pattern(VALUE spat, split_type_t default_type)
8609{
8610 rb_encoding *enc = STR_ENC_GET(spat);
8611 const char *ptr;
8612 long len;
8613 RSTRING_GETMEM(spat, ptr, len);
8614 if (len == 0) {
8615 /* Special case - split into chars */
8616 return SPLIT_TYPE_CHARS;
8617 }
8618 else if (rb_enc_asciicompat(enc)) {
8619 if (len == 1 && ptr[0] == ' ') {
8620 return SPLIT_TYPE_AWK;
8621 }
8622 }
8623 else {
8624 int l;
8625 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8626 return SPLIT_TYPE_AWK;
8627 }
8628 }
8629 return default_type;
8630}
8631
8632/*
8633 * call-seq:
8634 * split(field_sep = $;, limit = nil) -> array
8635 * split(field_sep = $;, limit = nil) {|substring| ... } -> self
8636 *
8637 * :include: doc/string/split.rdoc
8638 *
8639 */
8640
8641static VALUE
8642rb_str_split_m(int argc, VALUE *argv, VALUE str)
8643{
8644 rb_encoding *enc;
8645 VALUE spat;
8646 VALUE limit;
8647 split_type_t split_type;
8648 long beg, end, i = 0, empty_count = -1;
8649 int lim = 0;
8650 VALUE result, tmp;
8651
8652 result = rb_block_given_p() ? Qfalse : Qnil;
8653 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8654 lim = NUM2INT(limit);
8655 if (lim <= 0) limit = Qnil;
8656 else if (lim == 1) {
8657 if (RSTRING_LEN(str) == 0)
8658 return result ? rb_ary_new2(0) : str;
8659 tmp = str_duplicate(rb_cString, str);
8660 if (!result) {
8661 rb_yield(tmp);
8662 return str;
8663 }
8664 return rb_ary_new3(1, tmp);
8665 }
8666 i = 1;
8667 }
8668 if (NIL_P(limit) && !lim) empty_count = 0;
8669
8670 enc = STR_ENC_GET(str);
8671 split_type = SPLIT_TYPE_REGEXP;
8672 if (!NIL_P(spat)) {
8673 spat = get_pat_quoted(spat, 0);
8674 }
8675 else if (NIL_P(spat = rb_fs)) {
8676 split_type = SPLIT_TYPE_AWK;
8677 }
8678 else if (!(spat = rb_fs_check(spat))) {
8679 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8680 }
8681 else {
8682 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8683 }
8684 if (split_type != SPLIT_TYPE_AWK) {
8685 switch (BUILTIN_TYPE(spat)) {
8686 case T_REGEXP:
8687 rb_reg_options(spat); /* check if uninitialized */
8688 tmp = RREGEXP_SRC(spat);
8689 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8690 if (split_type == SPLIT_TYPE_AWK) {
8691 spat = tmp;
8692 split_type = SPLIT_TYPE_STRING;
8693 }
8694 break;
8695
8696 case T_STRING:
8697 mustnot_broken(spat);
8698 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8699 break;
8700
8701 default:
8703 }
8704 }
8705
8706#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8707
8708 if (result) result = rb_ary_new();
8709 beg = 0;
8710 char *ptr = RSTRING_PTR(str);
8711 char *eptr = RSTRING_END(str);
8712 if (split_type == SPLIT_TYPE_AWK) {
8713 char *bptr = ptr;
8714 int skip = 1;
8715 unsigned int c;
8716
8717 end = beg;
8718 if (is_ascii_string(str)) {
8719 while (ptr < eptr) {
8720 c = (unsigned char)*ptr++;
8721 if (skip) {
8722 if (ascii_isspace(c)) {
8723 beg = ptr - bptr;
8724 }
8725 else {
8726 end = ptr - bptr;
8727 skip = 0;
8728 if (!NIL_P(limit) && lim <= i) break;
8729 }
8730 }
8731 else if (ascii_isspace(c)) {
8732 SPLIT_STR(beg, end-beg);
8733 skip = 1;
8734 beg = ptr - bptr;
8735 if (!NIL_P(limit)) ++i;
8736 }
8737 else {
8738 end = ptr - bptr;
8739 }
8740 }
8741 }
8742 else {
8743 while (ptr < eptr) {
8744 int n;
8745
8746 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8747 ptr += n;
8748 if (skip) {
8749 if (rb_isspace(c)) {
8750 beg = ptr - bptr;
8751 }
8752 else {
8753 end = ptr - bptr;
8754 skip = 0;
8755 if (!NIL_P(limit) && lim <= i) break;
8756 }
8757 }
8758 else if (rb_isspace(c)) {
8759 SPLIT_STR(beg, end-beg);
8760 skip = 1;
8761 beg = ptr - bptr;
8762 if (!NIL_P(limit)) ++i;
8763 }
8764 else {
8765 end = ptr - bptr;
8766 }
8767 }
8768 }
8769 }
8770 else if (split_type == SPLIT_TYPE_STRING) {
8771 char *str_start = ptr;
8772 char *substr_start = ptr;
8773 char *sptr = RSTRING_PTR(spat);
8774 long slen = RSTRING_LEN(spat);
8775
8776 mustnot_broken(str);
8777 enc = rb_enc_check(str, spat);
8778 while (ptr < eptr &&
8779 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8780 /* Check we are at the start of a char */
8781 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8782 if (t != ptr + end) {
8783 ptr = t;
8784 continue;
8785 }
8786 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8787 ptr += end + slen;
8788 substr_start = ptr;
8789 if (!NIL_P(limit) && lim <= ++i) break;
8790 }
8791 beg = ptr - str_start;
8792 }
8793 else if (split_type == SPLIT_TYPE_CHARS) {
8794 char *str_start = ptr;
8795 int n;
8796
8797 mustnot_broken(str);
8798 enc = rb_enc_get(str);
8799 while (ptr < eptr &&
8800 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8801 SPLIT_STR(ptr - str_start, n);
8802 ptr += n;
8803 if (!NIL_P(limit) && lim <= ++i) break;
8804 }
8805 beg = ptr - str_start;
8806 }
8807 else {
8808 long len = RSTRING_LEN(str);
8809 long start = beg;
8810 long idx;
8811 int last_null = 0;
8812 struct re_registers *regs;
8813 VALUE match = 0;
8814
8815 for (; rb_reg_search(spat, str, start, 0) >= 0;
8816 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8817 match = rb_backref_get();
8818 if (!result) rb_match_busy(match);
8819 regs = RMATCH_REGS(match);
8820 end = BEG(0);
8821 if (start == end && BEG(0) == END(0)) {
8822 if (!ptr) {
8823 SPLIT_STR(0, 0);
8824 break;
8825 }
8826 else if (last_null == 1) {
8827 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8828 beg = start;
8829 }
8830 else {
8831 if (start == len)
8832 start++;
8833 else
8834 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8835 last_null = 1;
8836 continue;
8837 }
8838 }
8839 else {
8840 SPLIT_STR(beg, end-beg);
8841 beg = start = END(0);
8842 }
8843 last_null = 0;
8844
8845 for (idx=1; idx < regs->num_regs; idx++) {
8846 if (BEG(idx) == -1) continue;
8847 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8848 }
8849 if (!NIL_P(limit) && lim <= ++i) break;
8850 }
8851 if (match) rb_match_unbusy(match);
8852 }
8853 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8854 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8855 }
8856
8857 return result ? result : str;
8858}
8859
8860VALUE
8861rb_str_split(VALUE str, const char *sep0)
8862{
8863 VALUE sep;
8864
8865 StringValue(str);
8866 sep = rb_str_new_cstr(sep0);
8867 return rb_str_split_m(1, &sep, str);
8868}
8869
8870#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8871
8872static inline int
8873enumerator_element(VALUE ary, VALUE e)
8874{
8875 if (ary) {
8876 rb_ary_push(ary, e);
8877 return 0;
8878 }
8879 else {
8880 rb_yield(e);
8881 return 1;
8882 }
8883}
8884
8885#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8886
8887static const char *
8888chomp_newline(const char *p, const char *e, rb_encoding *enc)
8889{
8890 const char *prev = rb_enc_prev_char(p, e, e, enc);
8891 if (rb_enc_is_newline(prev, e, enc)) {
8892 e = prev;
8893 prev = rb_enc_prev_char(p, e, e, enc);
8894 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8895 e = prev;
8896 }
8897 return e;
8898}
8899
8900static VALUE
8901get_rs(void)
8902{
8903 VALUE rs = rb_rs;
8904 if (!NIL_P(rs) &&
8905 (!RB_TYPE_P(rs, T_STRING) ||
8906 RSTRING_LEN(rs) != 1 ||
8907 RSTRING_PTR(rs)[0] != '\n')) {
8908 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
8909 }
8910 return rs;
8911}
8912
8913#define rb_rs get_rs()
8914
8915static VALUE
8916rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
8917{
8918 rb_encoding *enc;
8919 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
8920 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
8921 long pos, len, rslen;
8922 int rsnewline = 0;
8923
8924 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
8925 rs = rb_rs;
8926 if (!NIL_P(opts)) {
8927 static ID keywords[1];
8928 if (!keywords[0]) {
8929 keywords[0] = rb_intern_const("chomp");
8930 }
8931 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
8932 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
8933 }
8934
8935 if (NIL_P(rs)) {
8936 if (!ENUM_ELEM(ary, str)) {
8937 return ary;
8938 }
8939 else {
8940 return orig;
8941 }
8942 }
8943
8944 if (!RSTRING_LEN(str)) goto end;
8945 str = rb_str_new_frozen(str);
8946 ptr = subptr = RSTRING_PTR(str);
8947 pend = RSTRING_END(str);
8948 len = RSTRING_LEN(str);
8949 StringValue(rs);
8950 rslen = RSTRING_LEN(rs);
8951
8952 if (rs == rb_default_rs)
8953 enc = rb_enc_get(str);
8954 else
8955 enc = rb_enc_check(str, rs);
8956
8957 if (rslen == 0) {
8958 /* paragraph mode */
8959 int n;
8960 const char *eol = NULL;
8961 subend = subptr;
8962 while (subend < pend) {
8963 long chomp_rslen = 0;
8964 do {
8965 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
8966 n = 0;
8967 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
8968 if (rb_enc_is_newline(subend + n, pend, enc)) {
8969 if (eol == subend) break;
8970 subend += rslen;
8971 if (subptr) {
8972 eol = subend;
8973 chomp_rslen = -rslen;
8974 }
8975 }
8976 else {
8977 if (!subptr) subptr = subend;
8978 subend += rslen;
8979 }
8980 rslen = 0;
8981 } while (subend < pend);
8982 if (!subptr) break;
8983 if (rslen == 0) chomp_rslen = 0;
8984 line = rb_str_subseq(str, subptr - ptr,
8985 subend - subptr + (chomp ? chomp_rslen : rslen));
8986 if (ENUM_ELEM(ary, line)) {
8987 str_mod_check(str, ptr, len);
8988 }
8989 subptr = eol = NULL;
8990 }
8991 goto end;
8992 }
8993 else {
8994 rsptr = RSTRING_PTR(rs);
8995 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
8996 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
8997 rsnewline = 1;
8998 }
8999 }
9000
9001 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9002 rs = rb_str_new(rsptr, rslen);
9003 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9004 rsptr = RSTRING_PTR(rs);
9005 rslen = RSTRING_LEN(rs);
9006 }
9007
9008 while (subptr < pend) {
9009 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9010 if (pos < 0) break;
9011 hit = subptr + pos;
9012 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9013 if (hit != adjusted) {
9014 subptr = adjusted;
9015 continue;
9016 }
9017 subend = hit += rslen;
9018 if (chomp) {
9019 if (rsnewline) {
9020 subend = chomp_newline(subptr, subend, enc);
9021 }
9022 else {
9023 subend -= rslen;
9024 }
9025 }
9026 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9027 if (ENUM_ELEM(ary, line)) {
9028 str_mod_check(str, ptr, len);
9029 }
9030 subptr = hit;
9031 }
9032
9033 if (subptr != pend) {
9034 if (chomp) {
9035 if (rsnewline) {
9036 pend = chomp_newline(subptr, pend, enc);
9037 }
9038 else if (pend - subptr >= rslen &&
9039 memcmp(pend - rslen, rsptr, rslen) == 0) {
9040 pend -= rslen;
9041 }
9042 }
9043 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9044 ENUM_ELEM(ary, line);
9045 RB_GC_GUARD(str);
9046 }
9047
9048 end:
9049 if (ary)
9050 return ary;
9051 else
9052 return orig;
9053}
9054
9055/*
9056 * call-seq:
9057 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9058 * each_line(line_sep = $/, chomp: false) -> enumerator
9059 *
9060 * :include: doc/string/each_line.rdoc
9061 *
9062 */
9063
9064static VALUE
9065rb_str_each_line(int argc, VALUE *argv, VALUE str)
9066{
9067 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9068 return rb_str_enumerate_lines(argc, argv, str, 0);
9069}
9070
/*
 *  call-seq:
 *    lines(line_sep = $/, chomp: false) -> array_of_strings
 *
 *  Forms substrings ("lines") of +self+ according to the given arguments
 *  (see String#each_line for details); returns the lines in an array.
 *
 */
9079
9080static VALUE
9081rb_str_lines(int argc, VALUE *argv, VALUE str)
9082{
9083 VALUE ary = WANTARRAY("lines", 0);
9084 return rb_str_enumerate_lines(argc, argv, str, ary);
9085}
9086
9087static VALUE
9088rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9089{
9090 return LONG2FIX(RSTRING_LEN(str));
9091}
9092
9093static VALUE
9094rb_str_enumerate_bytes(VALUE str, VALUE ary)
9095{
9096 long i;
9097
9098 for (i=0; i<RSTRING_LEN(str); i++) {
9099 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9100 }
9101 if (ary)
9102 return ary;
9103 else
9104 return str;
9105}
9106
9107/*
9108 * call-seq:
9109 * each_byte {|byte| ... } -> self
9110 * each_byte -> enumerator
9111 *
9112 * :include: doc/string/each_byte.rdoc
9113 *
9114 */
9115
9116static VALUE
9117rb_str_each_byte(VALUE str)
9118{
9119 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9120 return rb_str_enumerate_bytes(str, 0);
9121}
9122
9123/*
9124 * call-seq:
9125 * bytes -> array_of_bytes
9126 *
9127 * :include: doc/string/bytes.rdoc
9128 *
9129 */
9130
9131static VALUE
9132rb_str_bytes(VALUE str)
9133{
9134 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9135 return rb_str_enumerate_bytes(str, ary);
9136}
9137
9138static VALUE
9139rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9140{
9141 return rb_str_length(str);
9142}
9143
9144static VALUE
9145rb_str_enumerate_chars(VALUE str, VALUE ary)
9146{
9147 VALUE orig = str;
9148 long i, len, n;
9149 const char *ptr;
9150 rb_encoding *enc;
9151
9152 str = rb_str_new_frozen(str);
9153 ptr = RSTRING_PTR(str);
9154 len = RSTRING_LEN(str);
9155 enc = rb_enc_get(str);
9156
9158 for (i = 0; i < len; i += n) {
9159 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9160 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9161 }
9162 }
9163 else {
9164 for (i = 0; i < len; i += n) {
9165 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9166 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9167 }
9168 }
9169 RB_GC_GUARD(str);
9170 if (ary)
9171 return ary;
9172 else
9173 return orig;
9174}
9175
9176/*
9177 * call-seq:
9178 * each_char {|c| ... } -> self
9179 * each_char -> enumerator
9180 *
9181 * :include: doc/string/each_char.rdoc
9182 *
9183 */
9184
9185static VALUE
9186rb_str_each_char(VALUE str)
9187{
9188 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9189 return rb_str_enumerate_chars(str, 0);
9190}
9191
9192/*
9193 * call-seq:
9194 * chars -> array_of_characters
9195 *
9196 * :include: doc/string/chars.rdoc
9197 *
9198 */
9199
9200static VALUE
9201rb_str_chars(VALUE str)
9202{
9203 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9204 return rb_str_enumerate_chars(str, ary);
9205}
9206
9207static VALUE
9208rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9209{
9210 VALUE orig = str;
9211 int n;
9212 unsigned int c;
9213 const char *ptr, *end;
9214 rb_encoding *enc;
9215
9216 if (single_byte_optimizable(str))
9217 return rb_str_enumerate_bytes(str, ary);
9218
9219 str = rb_str_new_frozen(str);
9220 ptr = RSTRING_PTR(str);
9221 end = RSTRING_END(str);
9222 enc = STR_ENC_GET(str);
9223
9224 while (ptr < end) {
9225 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9226 ENUM_ELEM(ary, UINT2NUM(c));
9227 ptr += n;
9228 }
9229 RB_GC_GUARD(str);
9230 if (ary)
9231 return ary;
9232 else
9233 return orig;
9234}
9235
9236/*
9237 * call-seq:
9238 * each_codepoint {|integer| ... } -> self
9239 * each_codepoint -> enumerator
9240 *
9241 * :include: doc/string/each_codepoint.rdoc
9242 *
9243 */
9244
9245static VALUE
9246rb_str_each_codepoint(VALUE str)
9247{
9248 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9249 return rb_str_enumerate_codepoints(str, 0);
9250}
9251
9252/*
9253 * call-seq:
9254 * codepoints -> array_of_integers
9255 *
9256 * :include: doc/string/codepoints.rdoc
9257 *
9258 */
9259
9260static VALUE
9261rb_str_codepoints(VALUE str)
9262{
9263 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9264 return rb_str_enumerate_codepoints(str, ary);
9265}
9266
9267static regex_t *
9268get_reg_grapheme_cluster(rb_encoding *enc)
9269{
9270 int encidx = rb_enc_to_index(enc);
9271 regex_t *reg_grapheme_cluster = NULL;
9272 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9273
9274 /* synchronize */
9275 if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
9276 reg_grapheme_cluster = reg_grapheme_cluster_utf8;
9277 }
9278 if (!reg_grapheme_cluster) {
9279 const OnigUChar source_ascii[] = "\\X";
9280 OnigErrorInfo einfo;
9281 const OnigUChar *source = source_ascii;
9282 size_t source_len = sizeof(source_ascii) - 1;
9283 switch (encidx) {
9284#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9285#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9286#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9287#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9288#define CASE_UTF(e) \
9289 case ENCINDEX_UTF_##e: { \
9290 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9291 source = source_UTF_##e; \
9292 source_len = sizeof(source_UTF_##e); \
9293 break; \
9294 }
9295 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9296#undef CASE_UTF
9297#undef CHARS_16BE
9298#undef CHARS_16LE
9299#undef CHARS_32BE
9300#undef CHARS_32LE
9301 }
9302 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9303 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9304 if (r) {
9305 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9306 onig_error_code_to_str(message, r, &einfo);
9307 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9308 }
9309 if (encidx == rb_utf8_encindex()) {
9310 reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
9311 }
9312 }
9313 return reg_grapheme_cluster;
9314}
9315
9316static VALUE
9317rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9318{
9319 size_t grapheme_cluster_count = 0;
9320 regex_t *reg_grapheme_cluster = NULL;
9321 rb_encoding *enc = get_encoding(str);
9322 const char *ptr, *end;
9323
9324 if (!rb_enc_unicode_p(enc)) {
9325 return rb_str_length(str);
9326 }
9327
9328 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9329 ptr = RSTRING_PTR(str);
9330 end = RSTRING_END(str);
9331
9332 while (ptr < end) {
9333 OnigPosition len = onig_match(reg_grapheme_cluster,
9334 (const OnigUChar *)ptr, (const OnigUChar *)end,
9335 (const OnigUChar *)ptr, NULL, 0);
9336 if (len <= 0) break;
9337 grapheme_cluster_count++;
9338 ptr += len;
9339 }
9340
9341 return SIZET2NUM(grapheme_cluster_count);
9342}
9343
9344static VALUE
9345rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9346{
9347 VALUE orig = str;
9348 regex_t *reg_grapheme_cluster = NULL;
9349 rb_encoding *enc = get_encoding(str);
9350 const char *ptr0, *ptr, *end;
9351
9352 if (!rb_enc_unicode_p(enc)) {
9353 return rb_str_enumerate_chars(str, ary);
9354 }
9355
9356 if (!ary) str = rb_str_new_frozen(str);
9357 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9358 ptr0 = ptr = RSTRING_PTR(str);
9359 end = RSTRING_END(str);
9360
9361 while (ptr < end) {
9362 OnigPosition len = onig_match(reg_grapheme_cluster,
9363 (const OnigUChar *)ptr, (const OnigUChar *)end,
9364 (const OnigUChar *)ptr, NULL, 0);
9365 if (len <= 0) break;
9366 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9367 ptr += len;
9368 }
9369 RB_GC_GUARD(str);
9370 if (ary)
9371 return ary;
9372 else
9373 return orig;
9374}
9375
9376/*
9377 * call-seq:
9378 * each_grapheme_cluster {|gc| ... } -> self
9379 * each_grapheme_cluster -> enumerator
9380 *
9381 * :include: doc/string/each_grapheme_cluster.rdoc
9382 *
9383 */
9384
9385static VALUE
9386rb_str_each_grapheme_cluster(VALUE str)
9387{
9388 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9389 return rb_str_enumerate_grapheme_clusters(str, 0);
9390}
9391
9392/*
9393 * call-seq:
9394 * grapheme_clusters -> array_of_grapheme_clusters
9395 *
9396 * :include: doc/string/grapheme_clusters.rdoc
9397 *
9398 */
9399
9400static VALUE
9401rb_str_grapheme_clusters(VALUE str)
9402{
9403 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9404 return rb_str_enumerate_grapheme_clusters(str, ary);
9405}
9406
9407static long
9408chopped_length(VALUE str)
9409{
9410 rb_encoding *enc = STR_ENC_GET(str);
9411 const char *p, *p2, *beg, *end;
9412
9413 beg = RSTRING_PTR(str);
9414 end = beg + RSTRING_LEN(str);
9415 if (beg >= end) return 0;
9416 p = rb_enc_prev_char(beg, end, end, enc);
9417 if (!p) return 0;
9418 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9419 p2 = rb_enc_prev_char(beg, p, end, enc);
9420 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9421 }
9422 return p - beg;
9423}
9424
9425/*
9426 * call-seq:
9427 * chop! -> self or nil
9428 *
9429 * Like String#chop, but modifies +self+ in place;
9430 * returns +nil+ if +self+ is empty, +self+ otherwise.
9431 *
9432 * Related: String#chomp!.
9433 */
9434
9435static VALUE
9436rb_str_chop_bang(VALUE str)
9437{
9438 str_modify_keep_cr(str);
9439 if (RSTRING_LEN(str) > 0) {
9440 long len;
9441 len = chopped_length(str);
9442 STR_SET_LEN(str, len);
9443 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9444 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9446 }
9447 return str;
9448 }
9449 return Qnil;
9450}
9451
9452
9453/*
9454 * call-seq:
9455 * chop -> new_string
9456 *
9457 * :include: doc/string/chop.rdoc
9458 *
9459 */
9460
9461static VALUE
9462rb_str_chop(VALUE str)
9463{
9464 return rb_str_subseq(str, 0, chopped_length(str));
9465}
9466
9467static long
9468smart_chomp(VALUE str, const char *e, const char *p)
9469{
9470 rb_encoding *enc = rb_enc_get(str);
9471 if (rb_enc_mbminlen(enc) > 1) {
9472 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9473 if (rb_enc_is_newline(pp, e, enc)) {
9474 e = pp;
9475 }
9476 pp = e - rb_enc_mbminlen(enc);
9477 if (pp >= p) {
9478 pp = rb_enc_left_char_head(p, pp, e, enc);
9479 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9480 e = pp;
9481 }
9482 }
9483 }
9484 else {
9485 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9486 case '\n':
9487 if (--e > p && *(e-1) == '\r') {
9488 --e;
9489 }
9490 break;
9491 case '\r':
9492 --e;
9493 break;
9494 }
9495 }
9496 return e - p;
9497}
9498
9499static long
9500chompped_length(VALUE str, VALUE rs)
9501{
9502 rb_encoding *enc;
9503 int newline;
9504 char *pp, *e, *rsptr;
9505 long rslen;
9506 char *const p = RSTRING_PTR(str);
9507 long len = RSTRING_LEN(str);
9508
9509 if (len == 0) return 0;
9510 e = p + len;
9511 if (rs == rb_default_rs) {
9512 return smart_chomp(str, e, p);
9513 }
9514
9515 enc = rb_enc_get(str);
9516 RSTRING_GETMEM(rs, rsptr, rslen);
9517 if (rslen == 0) {
9518 if (rb_enc_mbminlen(enc) > 1) {
9519 while (e > p) {
9520 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9521 if (!rb_enc_is_newline(pp, e, enc)) break;
9522 e = pp;
9523 pp -= rb_enc_mbminlen(enc);
9524 if (pp >= p) {
9525 pp = rb_enc_left_char_head(p, pp, e, enc);
9526 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9527 e = pp;
9528 }
9529 }
9530 }
9531 }
9532 else {
9533 while (e > p && *(e-1) == '\n') {
9534 --e;
9535 if (e > p && *(e-1) == '\r')
9536 --e;
9537 }
9538 }
9539 return e - p;
9540 }
9541 if (rslen > len) return len;
9542
9543 enc = rb_enc_get(rs);
9544 newline = rsptr[rslen-1];
9545 if (rslen == rb_enc_mbminlen(enc)) {
9546 if (rslen == 1) {
9547 if (newline == '\n')
9548 return smart_chomp(str, e, p);
9549 }
9550 else {
9551 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
9552 return smart_chomp(str, e, p);
9553 }
9554 }
9555
9556 enc = rb_enc_check(str, rs);
9557 if (is_broken_string(rs)) {
9558 return len;
9559 }
9560 pp = e - rslen;
9561 if (p[len-1] == newline &&
9562 (rslen <= 1 ||
9563 memcmp(rsptr, pp, rslen) == 0)) {
9564 if (rb_enc_left_char_head(p, pp, e, enc) == pp)
9565 return len - rslen;
9566 RB_GC_GUARD(rs);
9567 }
9568 return len;
9569}
9570
9576static VALUE
9577chomp_rs(int argc, const VALUE *argv)
9578{
9579 rb_check_arity(argc, 0, 1);
9580 if (argc > 0) {
9581 VALUE rs = argv[0];
9582 if (!NIL_P(rs)) StringValue(rs);
9583 return rs;
9584 }
9585 else {
9586 return rb_rs;
9587 }
9588}
9589
9590VALUE
9591rb_str_chomp_string(VALUE str, VALUE rs)
9592{
9593 long olen = RSTRING_LEN(str);
9594 long len = chompped_length(str, rs);
9595 if (len >= olen) return Qnil;
9596 str_modify_keep_cr(str);
9597 STR_SET_LEN(str, len);
9598 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9599 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9601 }
9602 return str;
9603}
9604
9605/*
9606 * call-seq:
9607 * chomp!(line_sep = $/) -> self or nil
9608 *
9609 * Like String#chomp, but modifies +self+ in place;
9610 * returns +nil+ if no modification made, +self+ otherwise.
9611 *
9612 */
9613
9614static VALUE
9615rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9616{
9617 VALUE rs;
9618 str_modifiable(str);
9619 if (RSTRING_LEN(str) == 0) return Qnil;
9620 rs = chomp_rs(argc, argv);
9621 if (NIL_P(rs)) return Qnil;
9622 return rb_str_chomp_string(str, rs);
9623}
9624
9625
9626/*
9627 * call-seq:
9628 * chomp(line_sep = $/) -> new_string
9629 *
9630 * :include: doc/string/chomp.rdoc
9631 *
9632 */
9633
9634static VALUE
9635rb_str_chomp(int argc, VALUE *argv, VALUE str)
9636{
9637 VALUE rs = chomp_rs(argc, argv);
9638 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9639 return rb_str_subseq(str, 0, chompped_length(str, rs));
9640}
9641
9642static long
9643lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9644{
9645 const char *const start = s;
9646
9647 if (!s || s >= e) return 0;
9648
9649 /* remove spaces at head */
9650 if (single_byte_optimizable(str)) {
9651 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9652 }
9653 else {
9654 while (s < e) {
9655 int n;
9656 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9657
9658 if (cc && !rb_isspace(cc)) break;
9659 s += n;
9660 }
9661 }
9662 return s - start;
9663}
9664
/*
 *  call-seq:
 *    lstrip! -> self or nil
 *
 *  Like String#lstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#rstrip!, String#strip!.
 */
9674
9675static VALUE
9676rb_str_lstrip_bang(VALUE str)
9677{
9678 rb_encoding *enc;
9679 char *start, *s;
9680 long olen, loffset;
9681
9682 str_modify_keep_cr(str);
9683 enc = STR_ENC_GET(str);
9684 RSTRING_GETMEM(str, start, olen);
9685 loffset = lstrip_offset(str, start, start+olen, enc);
9686 if (loffset > 0) {
9687 long len = olen-loffset;
9688 s = start + loffset;
9689 memmove(start, s, len);
9690 STR_SET_LEN(str, len);
9691 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9692 return str;
9693 }
9694 return Qnil;
9695}
9696
9697
9698/*
9699 * call-seq:
9700 * lstrip -> new_string
9701 *
9702 * Returns a copy of +self+ with leading whitespace removed;
9703 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9704 *
9705 * whitespace = "\x00\t\n\v\f\r "
9706 * s = whitespace + 'abc' + whitespace
9707 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9708 * s.lstrip # => "abc\u0000\t\n\v\f\r "
9709 *
9710 * Related: String#rstrip, String#strip.
9711 */
9712
9713static VALUE
9714rb_str_lstrip(VALUE str)
9715{
9716 char *start;
9717 long len, loffset;
9718 RSTRING_GETMEM(str, start, len);
9719 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9720 if (loffset <= 0) return str_duplicate(rb_cString, str);
9721 return rb_str_subseq(str, loffset, len - loffset);
9722}
9723
9724static long
9725rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9726{
9727 const char *t;
9728
9729 rb_str_check_dummy_enc(enc);
9731 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
9732 }
9733 if (!s || s >= e) return 0;
9734 t = e;
9735
9736 /* remove trailing spaces or '\0's */
9737 if (single_byte_optimizable(str)) {
9738 unsigned char c;
9739 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9740 }
9741 else {
9742 char *tp;
9743
9744 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9745 unsigned int c = rb_enc_codepoint(tp, e, enc);
9746 if (c && !rb_isspace(c)) break;
9747 t = tp;
9748 }
9749 }
9750 return e - t;
9751}
9752
/*
 *  call-seq:
 *    rstrip! -> self or nil
 *
 *  Like String#rstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#lstrip!, String#strip!.
 */
9762
9763static VALUE
9764rb_str_rstrip_bang(VALUE str)
9765{
9766 rb_encoding *enc;
9767 char *start;
9768 long olen, roffset;
9769
9770 str_modify_keep_cr(str);
9771 enc = STR_ENC_GET(str);
9772 RSTRING_GETMEM(str, start, olen);
9773 roffset = rstrip_offset(str, start, start+olen, enc);
9774 if (roffset > 0) {
9775 long len = olen - roffset;
9776
9777 STR_SET_LEN(str, len);
9778 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9779 return str;
9780 }
9781 return Qnil;
9782}
9783
9784
9785/*
9786 * call-seq:
9787 * rstrip -> new_string
9788 *
9789 * Returns a copy of the receiver with trailing whitespace removed;
9790 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9791 *
9792 * whitespace = "\x00\t\n\v\f\r "
9793 * s = whitespace + 'abc' + whitespace
9794 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9795 * s.rstrip # => "\u0000\t\n\v\f\r abc"
9796 *
9797 * Related: String#lstrip, String#strip.
9798 */
9799
9800static VALUE
9801rb_str_rstrip(VALUE str)
9802{
9803 rb_encoding *enc;
9804 char *start;
9805 long olen, roffset;
9806
9807 enc = STR_ENC_GET(str);
9808 RSTRING_GETMEM(str, start, olen);
9809 roffset = rstrip_offset(str, start, start+olen, enc);
9810
9811 if (roffset <= 0) return str_duplicate(rb_cString, str);
9812 return rb_str_subseq(str, 0, olen-roffset);
9813}
9814
9815
/*
 *  call-seq:
 *    strip! -> self or nil
 *
 *  Like String#strip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#lstrip!, String#rstrip!.
 */
9825
9826static VALUE
9827rb_str_strip_bang(VALUE str)
9828{
9829 char *start;
9830 long olen, loffset, roffset;
9831 rb_encoding *enc;
9832
9833 str_modify_keep_cr(str);
9834 enc = STR_ENC_GET(str);
9835 RSTRING_GETMEM(str, start, olen);
9836 loffset = lstrip_offset(str, start, start+olen, enc);
9837 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9838
9839 if (loffset > 0 || roffset > 0) {
9840 long len = olen-roffset;
9841 if (loffset > 0) {
9842 len -= loffset;
9843 memmove(start, start + loffset, len);
9844 }
9845 STR_SET_LEN(str, len);
9846 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9847 return str;
9848 }
9849 return Qnil;
9850}
9851
9852
9853/*
9854 * call-seq:
9855 * strip -> new_string
9856 *
9857 * Returns a copy of the receiver with leading and trailing whitespace removed;
9858 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9859 *
9860 * whitespace = "\x00\t\n\v\f\r "
9861 * s = whitespace + 'abc' + whitespace
9862 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9863 * s.strip # => "abc"
9864 *
9865 * Related: String#lstrip, String#rstrip.
9866 */
9867
9868static VALUE
9869rb_str_strip(VALUE str)
9870{
9871 char *start;
9872 long olen, loffset, roffset;
9873 rb_encoding *enc = STR_ENC_GET(str);
9874
9875 RSTRING_GETMEM(str, start, olen);
9876 loffset = lstrip_offset(str, start, start+olen, enc);
9877 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9878
9879 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
9880 return rb_str_subseq(str, loffset, olen-loffset-roffset);
9881}
9882
9883static VALUE
9884scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9885{
9886 VALUE result, match;
9887 struct re_registers *regs;
9888 int i;
9889 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9890 if (pos >= 0) {
9891 if (BUILTIN_TYPE(pat) == T_STRING) {
9892 regs = NULL;
9893 end = pos + RSTRING_LEN(pat);
9894 }
9895 else {
9896 match = rb_backref_get();
9897 regs = RMATCH_REGS(match);
9898 pos = BEG(0);
9899 end = END(0);
9900 }
9901 if (pos == end) {
9902 rb_encoding *enc = STR_ENC_GET(str);
9903 /*
9904 * Always consume at least one character of the input string
9905 */
9906 if (RSTRING_LEN(str) > end)
9907 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9908 RSTRING_END(str), enc);
9909 else
9910 *start = end + 1;
9911 }
9912 else {
9913 *start = end;
9914 }
9915 if (!regs || regs->num_regs == 1) {
9916 result = rb_str_subseq(str, pos, end - pos);
9917 return result;
9918 }
9919 result = rb_ary_new2(regs->num_regs);
9920 for (i=1; i < regs->num_regs; i++) {
9921 VALUE s = Qnil;
9922 if (BEG(i) >= 0) {
9923 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9924 }
9925 rb_ary_push(result, s);
9926 }
9927
9928 return result;
9929 }
9930 return Qnil;
9931}
9932
9933
9934/*
9935 * call-seq:
9936 * scan(string_or_regexp) -> array
9937 * scan(string_or_regexp) {|matches| ... } -> self
9938 *
9939 * Matches a pattern against +self+; the pattern is:
9940 *
9941 * - +string_or_regexp+ itself, if it is a Regexp.
9942 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
9943 *
9944 * Iterates through +self+, generating a collection of matching results:
9945 *
9946 * - If the pattern contains no groups, each result is the
9947 * matched string, <code>$&</code>.
9948 * - If the pattern contains groups, each result is an array
9949 * containing one entry per group.
9950 *
9951 * With no block given, returns an array of the results:
9952 *
9953 * s = 'cruel world'
9954 * s.scan(/\w+/) # => ["cruel", "world"]
9955 * s.scan(/.../) # => ["cru", "el ", "wor"]
9956 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
9957 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
9958 *
9959 * With a block given, calls the block with each result; returns +self+:
9960 *
9961 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
9962 * print "\n"
9963 * s.scan(/(.)(.)/) {|x,y| print y, x }
9964 * print "\n"
9965 *
9966 * Output:
9967 *
9968 * <<cruel>> <<world>>
9969 * rceu lowlr
9970 *
9971 */
9972
9973static VALUE
9974rb_str_scan(VALUE str, VALUE pat)
9975{
9976 VALUE result;
9977 long start = 0;
9978 long last = -1, prev = 0;
9979 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
9980
9981 pat = get_pat_quoted(pat, 1);
9982 mustnot_broken(str);
9983 if (!rb_block_given_p()) {
9984 VALUE ary = rb_ary_new();
9985
9986 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
9987 last = prev;
9988 prev = start;
9989 rb_ary_push(ary, result);
9990 }
9991 if (last >= 0) rb_pat_search(pat, str, last, 1);
9992 else rb_backref_set(Qnil);
9993 return ary;
9994 }
9995
9996 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
9997 last = prev;
9998 prev = start;
9999 rb_yield(result);
10000 str_mod_check(str, p, len);
10001 }
10002 if (last >= 0) rb_pat_search(pat, str, last, 1);
10003 return str;
10004}
10005
10006
10007/*
10008 * call-seq:
10009 * hex -> integer
10010 *
10011 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10012 * (with an optional sign and an optional <code>0x</code>) and returns the
10013 * corresponding number;
10014 * returns zero if there is no such leading substring:
10015 *
10016 * '0x0a'.hex # => 10
10017 * '-1234'.hex # => -4660
10018 * '0'.hex # => 0
10019 * 'non-numeric'.hex # => 0
10020 *
10021 * Related: String#oct.
10022 *
10023 */
10024
10025static VALUE
10026rb_str_hex(VALUE str)
10027{
10028 return rb_str_to_inum(str, 16, FALSE);
10029}
10030
10031
10032/*
10033 * call-seq:
10034 * oct -> integer
10035 *
10036 * Interprets the leading substring of +self+ as a string of octal digits
10037 * (with an optional sign) and returns the corresponding number;
10038 * returns zero if there is no such leading substring:
10039 *
10040 * '123'.oct # => 83
10041 * '-377'.oct # => -255
10042 * '0377non-numeric'.oct # => 255
10043 * 'non-numeric'.oct # => 0
10044 *
10045 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10046 * see Kernel#Integer.
10047 *
10048 * Related: String#hex.
10049 *
10050 */
10051
10052static VALUE
10053rb_str_oct(VALUE str)
10054{
10055 return rb_str_to_inum(str, -8, FALSE);
10056}
10057
10058#ifndef HAVE_CRYPT_R
10059# include "ruby/thread_native.h"
10060# include "ruby/atomic.h"
10061
10062static struct {
10064} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10065
10066static void
10067crypt_mutex_initialize(void)
10068{
10069}
10070#endif
10071
10072/*
10073 * call-seq:
10074 * crypt(salt_str) -> new_string
10075 *
10076 * Returns the string generated by calling <code>crypt(3)</code>
10077 * standard library function with <code>str</code> and
10078 * <code>salt_str</code>, in this order, as its arguments. Please do
10079 * not use this method any longer. It is legacy; provided only for
10080 * backward compatibility with ruby scripts in earlier days. It is
10081 * bad to use in contemporary programs for several reasons:
10082 *
10083 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10084 * run. The generated string lacks data portability.
10085 *
10086 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10087 * (i.e. silently ends up in unexpected results).
10088 *
10089 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10090 * thread safe.
10091 *
10092 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10093 * very very weak. According to its manpage, Linux's traditional
10094 * <code>crypt(3)</code> output has only 2**56 variations; too
10095 * easy to brute force today. And this is the default behaviour.
10096 *
10097 * * In order to make things robust some OSes implement so-called
10098 * "modular" usage. To go through, you have to do a complex
10099 * build-up of the <code>salt_str</code> parameter, by hand.
10100 * Failure in generation of a proper salt string tends not to
10101 * yield any errors; typos in parameters are normally not
10102 * detectable.
10103 *
10104 * * For instance, in the following example, the second invocation
10105 * of String#crypt is wrong; it has a typo in "round=" (lacks
10106 * "s"). However the call does not fail and something unexpected
10107 * is generated.
10108 *
10109 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10110 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10111 *
10112 * * Even in the "modular" mode, some hash functions are considered
10113 * archaic and no longer recommended at all; for instance module
10114 * <code>$1$</code> is officially abandoned by its author: see
10115 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10116 * instance module <code>$3$</code> is considered completely
10117 * broken: see the manpage of FreeBSD.
10118 *
10119 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10120 * written above, <code>crypt(3)</code> on Mac OS never fails.
10121 * This means even if you build up a proper salt string it
10122 * generates a traditional DES hash anyways, and there is no way
10123 * for you to be aware of.
10124 *
10125 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10126 *
10127 * If for some reason you cannot migrate to other secure contemporary
10128 * password hashing algorithms, install the string-crypt gem and
10129 * <code>require 'string/crypt'</code> to continue using it.
10130 */
10131
10132static VALUE
10133rb_str_crypt(VALUE str, VALUE salt)
10134{
10135#ifdef HAVE_CRYPT_R
10136 VALUE databuf;
10137 struct crypt_data *data;
10138# define CRYPT_END() ALLOCV_END(databuf)
10139#else
10140 extern char *crypt(const char *, const char *);
10141# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10142#endif
10143 VALUE result;
10144 const char *s, *saltp;
10145 char *res;
10146#ifdef BROKEN_CRYPT
10147 char salt_8bit_clean[3];
10148#endif
10149
10150 StringValue(salt);
10151 mustnot_wchar(str);
10152 mustnot_wchar(salt);
10153 s = StringValueCStr(str);
10154 saltp = RSTRING_PTR(salt);
10155 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10156 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10157 }
10158
10159#ifdef BROKEN_CRYPT
10160 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10161 salt_8bit_clean[0] = saltp[0] & 0x7f;
10162 salt_8bit_clean[1] = saltp[1] & 0x7f;
10163 salt_8bit_clean[2] = '\0';
10164 saltp = salt_8bit_clean;
10165 }
10166#endif
10167#ifdef HAVE_CRYPT_R
10168 data = ALLOCV(databuf, sizeof(struct crypt_data));
10169# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10170 data->initialized = 0;
10171# endif
10172 res = crypt_r(s, saltp, data);
10173#else
10174 crypt_mutex_initialize();
10175 rb_nativethread_lock_lock(&crypt_mutex.lock);
10176 res = crypt(s, saltp);
10177#endif
10178 if (!res) {
10179 int err = errno;
10180 CRYPT_END();
10181 rb_syserr_fail(err, "crypt");
10182 }
10183 result = rb_str_new_cstr(res);
10184 CRYPT_END();
10185 return result;
10186}
10187
10188
10189/*
10190 * call-seq:
10191 * ord -> integer
10192 *
10193 * :include: doc/string/ord.rdoc
10194 *
10195 */
10196
10197static VALUE
10198rb_str_ord(VALUE s)
10199{
10200 unsigned int c;
10201
10202 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10203 return UINT2NUM(c);
10204}
10205/*
10206 * call-seq:
10207 * sum(n = 16) -> integer
10208 *
10209 * :include: doc/string/sum.rdoc
10210 *
10211 */
10212
10213static VALUE
10214rb_str_sum(int argc, VALUE *argv, VALUE str)
10215{
10216 int bits = 16;
10217 char *ptr, *p, *pend;
10218 long len;
10219 VALUE sum = INT2FIX(0);
10220 unsigned long sum0 = 0;
10221
10222 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10223 bits = 0;
10224 }
10225 ptr = p = RSTRING_PTR(str);
10226 len = RSTRING_LEN(str);
10227 pend = p + len;
10228
10229 while (p < pend) {
10230 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10231 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10232 str_mod_check(str, ptr, len);
10233 sum0 = 0;
10234 }
10235 sum0 += (unsigned char)*p;
10236 p++;
10237 }
10238
10239 if (bits == 0) {
10240 if (sum0) {
10241 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10242 }
10243 }
10244 else {
10245 if (sum == INT2FIX(0)) {
10246 if (bits < (int)sizeof(long)*CHAR_BIT) {
10247 sum0 &= (((unsigned long)1)<<bits)-1;
10248 }
10249 sum = LONG2FIX(sum0);
10250 }
10251 else {
10252 VALUE mod;
10253
10254 if (sum0) {
10255 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10256 }
10257
10258 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10259 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10260 sum = rb_funcall(sum, '&', 1, mod);
10261 }
10262 }
10263 return sum;
10264}
10265
10266static VALUE
10267rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10268{
10269 rb_encoding *enc;
10270 VALUE w;
10271 long width, len, flen = 1, fclen = 1;
10272 VALUE res;
10273 char *p;
10274 const char *f = " ";
10275 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10276 VALUE pad;
10277 int singlebyte = 1, cr;
10278 int termlen;
10279
10280 rb_scan_args(argc, argv, "11", &w, &pad);
10281 enc = STR_ENC_GET(str);
10282 termlen = rb_enc_mbminlen(enc);
10283 width = NUM2LONG(w);
10284 if (argc == 2) {
10285 StringValue(pad);
10286 enc = rb_enc_check(str, pad);
10287 f = RSTRING_PTR(pad);
10288 flen = RSTRING_LEN(pad);
10289 fclen = str_strlen(pad, enc); /* rb_enc_check */
10290 singlebyte = single_byte_optimizable(pad);
10291 if (flen == 0 || fclen == 0) {
10292 rb_raise(rb_eArgError, "zero width padding");
10293 }
10294 }
10295 len = str_strlen(str, enc); /* rb_enc_check */
10296 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10297 n = width - len;
10298 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10299 rlen = n - llen;
10300 cr = ENC_CODERANGE(str);
10301 if (flen > 1) {
10302 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10303 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10304 }
10305 size = RSTRING_LEN(str);
10306 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10307 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10308 (len += llen2 + rlen2) >= LONG_MAX - size) {
10309 rb_raise(rb_eArgError, "argument too big");
10310 }
10311 len += size;
10312 res = str_new0(rb_cString, 0, len, termlen);
10313 p = RSTRING_PTR(res);
10314 if (flen <= 1) {
10315 memset(p, *f, llen);
10316 p += llen;
10317 }
10318 else {
10319 while (llen >= fclen) {
10320 memcpy(p,f,flen);
10321 p += flen;
10322 llen -= fclen;
10323 }
10324 if (llen > 0) {
10325 memcpy(p, f, llen2);
10326 p += llen2;
10327 }
10328 }
10329 memcpy(p, RSTRING_PTR(str), size);
10330 p += size;
10331 if (flen <= 1) {
10332 memset(p, *f, rlen);
10333 p += rlen;
10334 }
10335 else {
10336 while (rlen >= fclen) {
10337 memcpy(p,f,flen);
10338 p += flen;
10339 rlen -= fclen;
10340 }
10341 if (rlen > 0) {
10342 memcpy(p, f, rlen2);
10343 p += rlen2;
10344 }
10345 }
10346 TERM_FILL(p, termlen);
10347 STR_SET_LEN(res, p-RSTRING_PTR(res));
10348 rb_enc_associate(res, enc);
10349 if (argc == 2)
10350 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10351 if (cr != ENC_CODERANGE_BROKEN)
10352 ENC_CODERANGE_SET(res, cr);
10353
10354 RB_GC_GUARD(pad);
10355 return res;
10356}
10357
10358
10359/*
10360 * call-seq:
10361 * ljust(size, pad_string = ' ') -> new_string
10362 *
10363 * :include: doc/string/ljust.rdoc
10364 *
10365 * Related: String#rjust, String#center.
10366 *
10367 */
10368
10369static VALUE
10370rb_str_ljust(int argc, VALUE *argv, VALUE str)
10371{
10372 return rb_str_justify(argc, argv, str, 'l');
10373}
10374
10375/*
10376 * call-seq:
10377 * rjust(size, pad_string = ' ') -> new_string
10378 *
10379 * :include: doc/string/rjust.rdoc
10380 *
10381 * Related: String#ljust, String#center.
10382 *
10383 */
10384
10385static VALUE
10386rb_str_rjust(int argc, VALUE *argv, VALUE str)
10387{
10388 return rb_str_justify(argc, argv, str, 'r');
10389}
10390
10391
10392/*
10393 * call-seq:
10394 * center(size, pad_string = ' ') -> new_string
10395 *
10396 * :include: doc/string/center.rdoc
10397 *
10398 * Related: String#ljust, String#rjust.
10399 *
10400 */
10401
10402static VALUE
10403rb_str_center(int argc, VALUE *argv, VALUE str)
10404{
10405 return rb_str_justify(argc, argv, str, 'c');
10406}
10407
10408/*
10409 * call-seq:
10410 * partition(string_or_regexp) -> [head, match, tail]
10411 *
10412 * :include: doc/string/partition.rdoc
10413 *
10414 */
10415
10416static VALUE
10417rb_str_partition(VALUE str, VALUE sep)
10418{
10419 long pos;
10420
10421 sep = get_pat_quoted(sep, 0);
10422 if (RB_TYPE_P(sep, T_REGEXP)) {
10423 if (rb_reg_search(sep, str, 0, 0) < 0) {
10424 goto failed;
10425 }
10426 VALUE match = rb_backref_get();
10427 struct re_registers *regs = RMATCH_REGS(match);
10428
10429 pos = BEG(0);
10430 sep = rb_str_subseq(str, pos, END(0) - pos);
10431 }
10432 else {
10433 pos = rb_str_index(str, sep, 0);
10434 if (pos < 0) goto failed;
10435 }
10436 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10437 sep,
10438 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10439 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10440
10441 failed:
10442 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10443}
10444
10445/*
10446 * call-seq:
10447 * rpartition(sep) -> [head, match, tail]
10448 *
10449 * :include: doc/string/rpartition.rdoc
10450 *
10451 */
10452
10453static VALUE
10454rb_str_rpartition(VALUE str, VALUE sep)
10455{
10456 long pos = RSTRING_LEN(str);
10457
10458 sep = get_pat_quoted(sep, 0);
10459 if (RB_TYPE_P(sep, T_REGEXP)) {
10460 if (rb_reg_search(sep, str, pos, 1) < 0) {
10461 goto failed;
10462 }
10463 VALUE match = rb_backref_get();
10464 struct re_registers *regs = RMATCH_REGS(match);
10465
10466 pos = BEG(0);
10467 sep = rb_str_subseq(str, pos, END(0) - pos);
10468 }
10469 else {
10470 pos = rb_str_sublen(str, pos);
10471 pos = rb_str_rindex(str, sep, pos);
10472 if (pos < 0) {
10473 goto failed;
10474 }
10475 pos = rb_str_offset(str, pos);
10476 }
10477
10478 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10479 sep,
10480 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10481 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10482 failed:
10483 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10484}
10485
10486/*
10487 * call-seq:
10488 * start_with?(*string_or_regexp) -> true or false
10489 *
10490 * :include: doc/string/start_with_p.rdoc
10491 *
10492 */
10493
10494static VALUE
10495rb_str_start_with(int argc, VALUE *argv, VALUE str)
10496{
10497 int i;
10498
10499 for (i=0; i<argc; i++) {
10500 VALUE tmp = argv[i];
10501 if (RB_TYPE_P(tmp, T_REGEXP)) {
10502 if (rb_reg_start_with_p(tmp, str))
10503 return Qtrue;
10504 }
10505 else {
10506 StringValue(tmp);
10507 rb_enc_check(str, tmp);
10508 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
10509 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10510 return Qtrue;
10511 }
10512 }
10513 return Qfalse;
10514}
10515
10516/*
10517 * call-seq:
10518 * end_with?(*strings) -> true or false
10519 *
10520 * :include: doc/string/end_with_p.rdoc
10521 *
10522 */
10523
10524static VALUE
10525rb_str_end_with(int argc, VALUE *argv, VALUE str)
10526{
10527 int i;
10528 char *p, *s, *e;
10529 rb_encoding *enc;
10530
10531 for (i=0; i<argc; i++) {
10532 VALUE tmp = argv[i];
10533 long slen, tlen;
10534 StringValue(tmp);
10535 enc = rb_enc_check(str, tmp);
10536 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10537 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10538 p = RSTRING_PTR(str);
10539 e = p + slen;
10540 s = e - tlen;
10541 if (rb_enc_left_char_head(p, s, e, enc) != s)
10542 continue;
10543 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10544 return Qtrue;
10545 }
10546 return Qfalse;
10547}
10548
10558static long
10559deleted_prefix_length(VALUE str, VALUE prefix)
10560{
10561 char *strptr, *prefixptr;
10562 long olen, prefixlen;
10563
10564 StringValue(prefix);
10565 if (is_broken_string(prefix)) return 0;
10566 rb_enc_check(str, prefix);
10567
10568 /* return 0 if not start with prefix */
10569 prefixlen = RSTRING_LEN(prefix);
10570 if (prefixlen <= 0) return 0;
10571 olen = RSTRING_LEN(str);
10572 if (olen < prefixlen) return 0;
10573 strptr = RSTRING_PTR(str);
10574 prefixptr = RSTRING_PTR(prefix);
10575 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10576
10577 return prefixlen;
10578}
10579
10580/*
10581 * call-seq:
10582 * delete_prefix!(prefix) -> self or nil
10583 *
10584 * Like String#delete_prefix, except that +self+ is modified in place.
10585 * Returns +self+ if the prefix is removed, +nil+ otherwise.
10586 *
10587 */
10588
10589static VALUE
10590rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10591{
10592 long prefixlen;
10593 str_modify_keep_cr(str);
10594
10595 prefixlen = deleted_prefix_length(str, prefix);
10596 if (prefixlen <= 0) return Qnil;
10597
10598 return rb_str_drop_bytes(str, prefixlen);
10599}
10600
10601/*
10602 * call-seq:
10603 * delete_prefix(prefix) -> new_string
10604 *
10605 * :include: doc/string/delete_prefix.rdoc
10606 *
10607 */
10608
10609static VALUE
10610rb_str_delete_prefix(VALUE str, VALUE prefix)
10611{
10612 long prefixlen;
10613
10614 prefixlen = deleted_prefix_length(str, prefix);
10615 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10616
10617 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10618}
10619
10629static long
10630deleted_suffix_length(VALUE str, VALUE suffix)
10631{
10632 char *strptr, *suffixptr, *s;
10633 long olen, suffixlen;
10634 rb_encoding *enc;
10635
10636 StringValue(suffix);
10637 if (is_broken_string(suffix)) return 0;
10638 enc = rb_enc_check(str, suffix);
10639
10640 /* return 0 if not start with suffix */
10641 suffixlen = RSTRING_LEN(suffix);
10642 if (suffixlen <= 0) return 0;
10643 olen = RSTRING_LEN(str);
10644 if (olen < suffixlen) return 0;
10645 strptr = RSTRING_PTR(str);
10646 suffixptr = RSTRING_PTR(suffix);
10647 s = strptr + olen - suffixlen;
10648 if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10649 if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10650
10651 return suffixlen;
10652}
10653
10654/*
10655 * call-seq:
10656 * delete_suffix!(suffix) -> self or nil
10657 *
10658 * Like String#delete_suffix, except that +self+ is modified in place.
10659 * Returns +self+ if the suffix is removed, +nil+ otherwise.
10660 *
10661 */
10662
10663static VALUE
10664rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10665{
10666 long olen, suffixlen, len;
10667 str_modifiable(str);
10668
10669 suffixlen = deleted_suffix_length(str, suffix);
10670 if (suffixlen <= 0) return Qnil;
10671
10672 olen = RSTRING_LEN(str);
10673 str_modify_keep_cr(str);
10674 len = olen - suffixlen;
10675 STR_SET_LEN(str, len);
10676 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10677 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10679 }
10680 return str;
10681}
10682
10683/*
10684 * call-seq:
10685 * delete_suffix(suffix) -> new_string
10686 *
10687 * :include: doc/string/delete_suffix.rdoc
10688 *
10689 */
10690
10691static VALUE
10692rb_str_delete_suffix(VALUE str, VALUE suffix)
10693{
10694 long suffixlen;
10695
10696 suffixlen = deleted_suffix_length(str, suffix);
10697 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10698
10699 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10700}
10701
10702void
10703rb_str_setter(VALUE val, ID id, VALUE *var)
10704{
10705 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10706 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10707 }
10708 *var = val;
10709}
10710
10711static void
10712rb_fs_setter(VALUE val, ID id, VALUE *var)
10713{
10714 val = rb_fs_check(val);
10715 if (!val) {
10717 "value of %"PRIsVALUE" must be String or Regexp",
10718 rb_id2str(id));
10719 }
10720 if (!NIL_P(val)) {
10721 rb_warn_deprecated("`$;'", NULL);
10722 }
10723 *var = val;
10724}
10725
10726
10727/*
10728 * call-seq:
10729 * force_encoding(encoding) -> self
10730 *
10731 * :include: doc/string/force_encoding.rdoc
10732 *
10733 */
10734
10735static VALUE
10736rb_str_force_encoding(VALUE str, VALUE enc)
10737{
10738 str_modifiable(str);
10739 rb_enc_associate(str, rb_to_encoding(enc));
10741 return str;
10742}
10743
10744/*
10745 * call-seq:
10746 * b -> string
10747 *
10748 * :include: doc/string/b.rdoc
10749 *
10750 */
10751
10752static VALUE
10753rb_str_b(VALUE str)
10754{
10755 VALUE str2;
10756 if (FL_TEST(str, STR_NOEMBED)) {
10757 str2 = str_alloc_heap(rb_cString);
10758 }
10759 else {
10760 str2 = str_alloc_embed(rb_cString, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
10761 }
10762 str_replace_shared_without_enc(str2, str);
10763
10764 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
10765 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
10766 // If we know the receiver's code range then we know the result's code range.
10767 int cr = ENC_CODERANGE(str);
10768 switch (cr) {
10769 case ENC_CODERANGE_7BIT:
10771 break;
10775 break;
10776 default:
10777 ENC_CODERANGE_CLEAR(str2);
10778 break;
10779 }
10780 }
10781
10782 return str2;
10783}
10784
10785/*
10786 * call-seq:
10787 * valid_encoding? -> true or false
10788 *
10789 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
10790 *
10791 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
10792 * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
10793 * "\x80".force_encoding("UTF-8").valid_encoding? # => false
10794 */
10795
10796static VALUE
10797rb_str_valid_encoding_p(VALUE str)
10798{
10799 int cr = rb_enc_str_coderange(str);
10800
10801 return RBOOL(cr != ENC_CODERANGE_BROKEN);
10802}
10803
10804/*
10805 * call-seq:
10806 * ascii_only? -> true or false
10807 *
10808 * Returns +true+ if +self+ contains only ASCII characters,
10809 * +false+ otherwise:
10810 *
10811 * 'abc'.ascii_only? # => true
10812 * "abc\u{6666}".ascii_only? # => false
10813 *
10814 */
10815
10816static VALUE
10817rb_str_is_ascii_only_p(VALUE str)
10818{
10819 int cr = rb_enc_str_coderange(str);
10820
10821 return RBOOL(cr == ENC_CODERANGE_7BIT);
10822}
10823
10824VALUE
10826{
10827 static const char ellipsis[] = "...";
10828 const long ellipsislen = sizeof(ellipsis) - 1;
10829 rb_encoding *const enc = rb_enc_get(str);
10830 const long blen = RSTRING_LEN(str);
10831 const char *const p = RSTRING_PTR(str), *e = p + blen;
10832 VALUE estr, ret = 0;
10833
10834 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10835 if (len * rb_enc_mbminlen(enc) >= blen ||
10836 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10837 ret = str;
10838 }
10839 else if (len <= ellipsislen ||
10840 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10841 if (rb_enc_asciicompat(enc)) {
10842 ret = rb_str_new(ellipsis, len);
10843 rb_enc_associate(ret, enc);
10844 }
10845 else {
10846 estr = rb_usascii_str_new(ellipsis, len);
10847 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10848 }
10849 }
10850 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10851 rb_str_cat(ret, ellipsis, ellipsislen);
10852 }
10853 else {
10854 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10855 rb_enc_from_encoding(enc), 0, Qnil);
10856 rb_str_append(ret, estr);
10857 }
10858 return ret;
10859}
10860
10861static VALUE
10862str_compat_and_valid(VALUE str, rb_encoding *enc)
10863{
10864 int cr;
10865 str = StringValue(str);
10866 cr = rb_enc_str_coderange(str);
10867 if (cr == ENC_CODERANGE_BROKEN) {
10868 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10869 }
10870 else {
10871 rb_encoding *e = STR_ENC_GET(str);
10872 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10873 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10874 rb_enc_name(enc), rb_enc_name(e));
10875 }
10876 }
10877 return str;
10878}
10879
10880static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10881
10882VALUE
10884{
10885 rb_encoding *enc = STR_ENC_GET(str);
10886 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10887}
10888
10889VALUE
10890rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
10891{
10892 int cr = ENC_CODERANGE_UNKNOWN;
10893 if (enc == STR_ENC_GET(str)) {
10894 /* cached coderange makes sense only when enc equals the
10895 * actual encoding of str */
10896 cr = ENC_CODERANGE(str);
10897 }
10898 return enc_str_scrub(enc, str, repl, cr);
10899}
10900
10901static VALUE
10902enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10903{
10904 int encidx;
10905 VALUE buf = Qnil;
10906 const char *rep, *p, *e, *p1, *sp;
10907 long replen = -1;
10908 long slen;
10909
10910 if (rb_block_given_p()) {
10911 if (!NIL_P(repl))
10912 rb_raise(rb_eArgError, "both of block and replacement given");
10913 replen = 0;
10914 }
10915
10916 if (ENC_CODERANGE_CLEAN_P(cr))
10917 return Qnil;
10918
10919 if (!NIL_P(repl)) {
10920 repl = str_compat_and_valid(repl, enc);
10921 }
10922
10923 if (rb_enc_dummy_p(enc)) {
10924 return Qnil;
10925 }
10926 encidx = rb_enc_to_index(enc);
10927
10928#define DEFAULT_REPLACE_CHAR(str) do { \
10929 static const char replace[sizeof(str)-1] = str; \
10930 rep = replace; replen = (int)sizeof(replace); \
10931 } while (0)
10932
10933 slen = RSTRING_LEN(str);
10934 p = RSTRING_PTR(str);
10935 e = RSTRING_END(str);
10936 p1 = p;
10937 sp = p;
10938
10939 if (rb_enc_asciicompat(enc)) {
10940 int rep7bit_p;
10941 if (!replen) {
10942 rep = NULL;
10943 rep7bit_p = FALSE;
10944 }
10945 else if (!NIL_P(repl)) {
10946 rep = RSTRING_PTR(repl);
10947 replen = RSTRING_LEN(repl);
10948 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
10949 }
10950 else if (encidx == rb_utf8_encindex()) {
10951 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10952 rep7bit_p = FALSE;
10953 }
10954 else {
10955 DEFAULT_REPLACE_CHAR("?");
10956 rep7bit_p = TRUE;
10957 }
10958 cr = ENC_CODERANGE_7BIT;
10959
10960 p = search_nonascii(p, e);
10961 if (!p) {
10962 p = e;
10963 }
10964 while (p < e) {
10965 int ret = rb_enc_precise_mbclen(p, e, enc);
10966 if (MBCLEN_NEEDMORE_P(ret)) {
10967 break;
10968 }
10969 else if (MBCLEN_CHARFOUND_P(ret)) {
10971 p += MBCLEN_CHARFOUND_LEN(ret);
10972 }
10973 else if (MBCLEN_INVALID_P(ret)) {
10974 /*
10975 * p1~p: valid ascii/multibyte chars
10976 * p ~e: invalid bytes + unknown bytes
10977 */
10978 long clen = rb_enc_mbmaxlen(enc);
10979 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
10980 if (p > p1) {
10981 rb_str_buf_cat(buf, p1, p - p1);
10982 }
10983
10984 if (e - p < clen) clen = e - p;
10985 if (clen <= 2) {
10986 clen = 1;
10987 }
10988 else {
10989 const char *q = p;
10990 clen--;
10991 for (; clen > 1; clen--) {
10992 ret = rb_enc_precise_mbclen(q, q + clen, enc);
10993 if (MBCLEN_NEEDMORE_P(ret)) break;
10994 if (MBCLEN_INVALID_P(ret)) continue;
10996 }
10997 }
10998 if (rep) {
10999 rb_str_buf_cat(buf, rep, replen);
11000 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11001 }
11002 else {
11003 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11004 str_mod_check(str, sp, slen);
11005 repl = str_compat_and_valid(repl, enc);
11006 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11009 }
11010 p += clen;
11011 p1 = p;
11012 p = search_nonascii(p, e);
11013 if (!p) {
11014 p = e;
11015 break;
11016 }
11017 }
11018 else {
11020 }
11021 }
11022 if (NIL_P(buf)) {
11023 if (p == e) {
11024 ENC_CODERANGE_SET(str, cr);
11025 return Qnil;
11026 }
11027 buf = rb_str_buf_new(RSTRING_LEN(str));
11028 }
11029 if (p1 < p) {
11030 rb_str_buf_cat(buf, p1, p - p1);
11031 }
11032 if (p < e) {
11033 if (rep) {
11034 rb_str_buf_cat(buf, rep, replen);
11035 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11036 }
11037 else {
11038 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11039 str_mod_check(str, sp, slen);
11040 repl = str_compat_and_valid(repl, enc);
11041 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11044 }
11045 }
11046 }
11047 else {
11048 /* ASCII incompatible */
11049 long mbminlen = rb_enc_mbminlen(enc);
11050 if (!replen) {
11051 rep = NULL;
11052 }
11053 else if (!NIL_P(repl)) {
11054 rep = RSTRING_PTR(repl);
11055 replen = RSTRING_LEN(repl);
11056 }
11057 else if (encidx == ENCINDEX_UTF_16BE) {
11058 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11059 }
11060 else if (encidx == ENCINDEX_UTF_16LE) {
11061 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11062 }
11063 else if (encidx == ENCINDEX_UTF_32BE) {
11064 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11065 }
11066 else if (encidx == ENCINDEX_UTF_32LE) {
11067 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11068 }
11069 else {
11070 DEFAULT_REPLACE_CHAR("?");
11071 }
11072
11073 while (p < e) {
11074 int ret = rb_enc_precise_mbclen(p, e, enc);
11075 if (MBCLEN_NEEDMORE_P(ret)) {
11076 break;
11077 }
11078 else if (MBCLEN_CHARFOUND_P(ret)) {
11079 p += MBCLEN_CHARFOUND_LEN(ret);
11080 }
11081 else if (MBCLEN_INVALID_P(ret)) {
11082 const char *q = p;
11083 long clen = rb_enc_mbmaxlen(enc);
11084 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11085 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11086
11087 if (e - p < clen) clen = e - p;
11088 if (clen <= mbminlen * 2) {
11089 clen = mbminlen;
11090 }
11091 else {
11092 clen -= mbminlen;
11093 for (; clen > mbminlen; clen-=mbminlen) {
11094 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11095 if (MBCLEN_NEEDMORE_P(ret)) break;
11096 if (MBCLEN_INVALID_P(ret)) continue;
11098 }
11099 }
11100 if (rep) {
11101 rb_str_buf_cat(buf, rep, replen);
11102 }
11103 else {
11104 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11105 str_mod_check(str, sp, slen);
11106 repl = str_compat_and_valid(repl, enc);
11107 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11108 }
11109 p += clen;
11110 p1 = p;
11111 }
11112 else {
11114 }
11115 }
11116 if (NIL_P(buf)) {
11117 if (p == e) {
11119 return Qnil;
11120 }
11121 buf = rb_str_buf_new(RSTRING_LEN(str));
11122 }
11123 if (p1 < p) {
11124 rb_str_buf_cat(buf, p1, p - p1);
11125 }
11126 if (p < e) {
11127 if (rep) {
11128 rb_str_buf_cat(buf, rep, replen);
11129 }
11130 else {
11131 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11132 str_mod_check(str, sp, slen);
11133 repl = str_compat_and_valid(repl, enc);
11134 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11135 }
11136 }
11138 }
11139 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11140 return buf;
11141}
11142
11143/*
11144 * call-seq:
11145 * scrub(replacement_string = default_replacement) -> new_string
11146 * scrub{|bytes| ... } -> new_string
11147 *
11148 * :include: doc/string/scrub.rdoc
11149 *
11150 */
11151static VALUE
11152str_scrub(int argc, VALUE *argv, VALUE str)
11153{
11154 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11155 VALUE new = rb_str_scrub(str, repl);
11156 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11157}
11158
11159/*
11160 * call-seq:
11161 * scrub! -> self
11162 * scrub!(replacement_string = default_replacement) -> self
11163 * scrub!{|bytes| ... } -> self
11164 *
11165 * Like String#scrub, except that any replacements are made in +self+.
11166 *
11167 */
11168static VALUE
11169str_scrub_bang(int argc, VALUE *argv, VALUE str)
11170{
11171 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11172 VALUE new = rb_str_scrub(str, repl);
11173 if (!NIL_P(new)) rb_str_replace(str, new);
11174 return str;
11175}
11176
11177static ID id_normalize;
11178static ID id_normalized_p;
11179static VALUE mUnicodeNormalize;
11180
11181static VALUE
11182unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11183{
11184 static int UnicodeNormalizeRequired = 0;
11185 VALUE argv2[2];
11186
11187 if (!UnicodeNormalizeRequired) {
11188 rb_require("unicode_normalize/normalize.rb");
11189 UnicodeNormalizeRequired = 1;
11190 }
11191 argv2[0] = str;
11192 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11193 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11194}
11195
11196/*
11197 * call-seq:
11198 * unicode_normalize(form = :nfc) -> string
11199 *
11200 * Returns a copy of +self+ with
11201 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11202 *
11203 * Argument +form+ must be one of the following symbols
11204 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11205 *
11206 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11207 * - +:nfd+: Canonical decomposition.
11208 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11209 * - +:nfkd+: Compatibility decomposition.
11210 *
11211 * The encoding of +self+ must be one of:
11212 *
11213 * - Encoding::UTF_8
11214 * - Encoding::UTF_16BE
11215 * - Encoding::UTF_16LE
11216 * - Encoding::UTF_32BE
11217 * - Encoding::UTF_32LE
11218 * - Encoding::GB18030
11219 * - Encoding::UCS_2BE
11220 * - Encoding::UCS_4BE
11221 *
11222 * Examples:
11223 *
11224 * "a\u0300".unicode_normalize # => "\u00E0" (precomposed)
11225 * "\u00E0".unicode_normalize(:nfd) # => "a\u0300" (decomposed)
11226 *
11227 * Related: String#unicode_normalize!, String#unicode_normalized?.
11228 */
11229static VALUE
11230rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11231{
11232 return unicode_normalize_common(argc, argv, str, id_normalize);
11233}
11234
11235/*
11236 * call-seq:
11237 * unicode_normalize!(form = :nfc) -> self
11238 *
11239 * Like String#unicode_normalize, except that the normalization
11240 * is performed on +self+.
11241 *
11242 * Related: String#unicode_normalize, String#unicode_normalized?.
11243 *
11244 */
11245static VALUE
11246rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11247{
11248 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11249}
11250
11251/* call-seq:
11252 * unicode_normalized?(form = :nfc) -> true or false
11253 *
11254 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11255 * +false+ otherwise.
11256 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11257 *
11258 * Examples:
11259 *
11260 * "a\u0300".unicode_normalized? # => false
11261 * "a\u0300".unicode_normalized?(:nfd) # => true
11262 * "\u00E0".unicode_normalized? # => true
11263 * "\u00E0".unicode_normalized?(:nfd) # => false
11264 *
11265 *
11266 * Raises an exception if +self+ is not in a Unicode encoding:
11267 *
11268 * s = "\xE0".force_encoding('ISO-8859-1')
11269 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11270 *
11271 * Related: String#unicode_normalize, String#unicode_normalize!.
11272 *
11273 */
11274static VALUE
11275rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11276{
11277 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11278}
11279
11280/**********************************************************************
11281 * Document-class: Symbol
11282 *
11283 * Symbol objects represent named identifiers inside the Ruby interpreter.
11284 *
11285 * You can create a \Symbol object explicitly with:
11286 *
11287 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11288 *
11289 * The same Symbol object will be
11290 * created for a given name or string for the duration of a program's
11291 * execution, regardless of the context or meaning of that name. Thus
11292 * if <code>Fred</code> is a constant in one context, a method in
11293 * another, and a class in a third, the Symbol <code>:Fred</code>
11294 * will be the same object in all three contexts.
11295 *
11296 * module One
11297 * class Fred
11298 * end
11299 * $f1 = :Fred
11300 * end
11301 * module Two
11302 * Fred = 1
11303 * $f2 = :Fred
11304 * end
11305 * def Fred()
11306 * end
11307 * $f3 = :Fred
11308 * $f1.object_id #=> 2514190
11309 * $f2.object_id #=> 2514190
11310 * $f3.object_id #=> 2514190
11311 *
11312 * Constant, method, and variable names are returned as symbols:
11313 *
11314 * module One
11315 * Two = 2
11316 * def three; 3 end
11317 * @four = 4
11318 * @@five = 5
11319 * $six = 6
11320 * end
11321 * seven = 7
11322 *
11323 * One.constants
11324 * # => [:Two]
11325 * One.instance_methods(true)
11326 * # => [:three]
11327 * One.instance_variables
11328 * # => [:@four]
11329 * One.class_variables
11330 * # => [:@@five]
11331 * global_variables.grep(/six/)
11332 * # => [:$six]
11333 * local_variables
11334 * # => [:seven]
11335 *
11336 * Symbol objects are different from String objects in that
11337 * Symbol objects represent identifiers, while String objects
11338 * represent text or data.
11339 *
11340 * == What's Here
11341 *
11342 * First, what's elsewhere. \Class \Symbol:
11343 *
11344 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11345 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11346 *
11347 * Here, class \Symbol provides methods that are useful for:
11348 *
11349 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11350 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11351 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11352 *
11353 * === Methods for Querying
11354 *
11355 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11356 * - #=~: Returns the index of the first substring in symbol that matches a
11357 * given Regexp or other object; returns +nil+ if no match is found.
11358 * - #[], #slice : Returns a substring of symbol
11359 * determined by a given index, start/length, or range, or string.
11360 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11361 * - #encoding: Returns the Encoding object that represents the encoding
11362 * of symbol.
11363 * - #end_with?: Returns +true+ if symbol ends with
11364 * any of the given strings.
11365 * - #match: Returns a MatchData object if symbol
11366 * matches a given Regexp; +nil+ otherwise.
11367 * - #match?: Returns +true+ if symbol
11368 * matches a given Regexp; +false+ otherwise.
11369 * - #length, #size: Returns the number of characters in symbol.
11370 * - #start_with?: Returns +true+ if symbol starts with
11371 * any of the given strings.
11372 *
11373 * === Methods for Comparing
11374 *
11375 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
11376 * or larger than symbol.
11377 * - #==, #===: Returns +true+ if a given symbol has the same content and
11378 * encoding.
11379 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
11380 * symbol is smaller than, equal to, or larger than symbol.
11381 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
11382 * after Unicode case folding; +false+ otherwise.
11383 *
11384 * === Methods for Converting
11385 *
11386 * - #capitalize: Returns symbol with the first character upcased
11387 * and all other characters downcased.
11388 * - #downcase: Returns symbol with all characters downcased.
11389 * - #inspect: Returns the string representation of +self+ as a symbol literal.
11390 * - #name: Returns the frozen string corresponding to symbol.
11391 * - #succ, #next: Returns the symbol that is the successor to symbol.
11392 * - #swapcase: Returns symbol with all upcase characters downcased
11393 * and all downcase characters upcased.
11394 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
11395 * - #to_s, #id2name: Returns the string corresponding to +self+.
11396 * - #to_sym, #intern: Returns +self+.
11397 * - #upcase: Returns symbol with all characters upcased.
11398 *
11399 */
11400
11401
11402/*
11403 * call-seq:
11404 * symbol == object -> true or false
11405 *
11406 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
11407 *
11408 * Symbol#=== is an alias for Symbol#==.
11409 *
11410 */
11411
11412#define sym_equal rb_obj_equal
11413
11414static int
11415sym_printable(const char *s, const char *send, rb_encoding *enc)
11416{
11417 while (s < send) {
11418 int n;
11419 int c = rb_enc_precise_mbclen(s, send, enc);
11420
11421 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11422 n = MBCLEN_CHARFOUND_LEN(c);
11423 c = rb_enc_mbc_to_codepoint(s, send, enc);
11424 if (!rb_enc_isprint(c, enc)) return FALSE;
11425 s += n;
11426 }
11427 return TRUE;
11428}
11429
11430int
11431rb_str_symname_p(VALUE sym)
11432{
11433 rb_encoding *enc;
11434 const char *ptr;
11435 long len;
11436 rb_encoding *resenc = rb_default_internal_encoding();
11437
11438 if (resenc == NULL) resenc = rb_default_external_encoding();
11439 enc = STR_ENC_GET(sym);
11440 ptr = RSTRING_PTR(sym);
11441 len = RSTRING_LEN(sym);
11442 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11443 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11444 return FALSE;
11445 }
11446 return TRUE;
11447}
11448
11449VALUE
11450rb_str_quote_unprintable(VALUE str)
11451{
11452 rb_encoding *enc;
11453 const char *ptr;
11454 long len;
11455 rb_encoding *resenc;
11456
11457 Check_Type(str, T_STRING);
11458 resenc = rb_default_internal_encoding();
11459 if (resenc == NULL) resenc = rb_default_external_encoding();
11460 enc = STR_ENC_GET(str);
11461 ptr = RSTRING_PTR(str);
11462 len = RSTRING_LEN(str);
11463 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11464 !sym_printable(ptr, ptr + len, enc)) {
11465 return rb_str_escape(str);
11466 }
11467 return str;
11468}
11469
11470MJIT_FUNC_EXPORTED VALUE
11471rb_id_quote_unprintable(ID id)
11472{
11473 VALUE str = rb_id2str(id);
11474 if (!rb_str_symname_p(str)) {
11475 return rb_str_escape(str);
11476 }
11477 return str;
11478}
11479
11480/*
11481 * call-seq:
11482 * inspect -> string
11483 *
11484 * Returns a string representation of +self+ (including the leading colon):
11485 *
11486 * :foo.inspect # => ":foo"
11487 *
11488 * Related: Symbol#to_s, Symbol#name.
11489 *
11490 */
11491
11492static VALUE
11493sym_inspect(VALUE sym)
11494{
11495 VALUE str = rb_sym2str(sym);
11496 const char *ptr;
11497 long len;
11498 char *dest;
11499
11500 if (!rb_str_symname_p(str)) {
11501 str = rb_str_inspect(str);
11502 len = RSTRING_LEN(str);
11503 rb_str_resize(str, len + 1);
11504 dest = RSTRING_PTR(str);
11505 memmove(dest + 1, dest, len);
11506 }
11507 else {
11508 rb_encoding *enc = STR_ENC_GET(str);
11509 RSTRING_GETMEM(str, ptr, len);
11510 str = rb_enc_str_new(0, len + 1, enc);
11511 dest = RSTRING_PTR(str);
11512 memcpy(dest + 1, ptr, len);
11513 }
11514 dest[0] = ':';
11515 return str;
11516}
11517
11518/*
11519 * call-seq:
11520 * to_s -> string
11521 *
11522 * Returns a string representation of +self+ (not including the leading colon):
11523 *
11524 * :foo.to_s # => "foo"
11525 *
11526 * Symbol#id2name is an alias for Symbol#to_s.
11527 *
11528 * Related: Symbol#inspect, Symbol#name.
11529 */
11530
11531VALUE
11533{
11534 return str_new_shared(rb_cString, rb_sym2str(sym));
11535}
11536
11537MJIT_FUNC_EXPORTED VALUE
11538rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11539{
11540 VALUE obj;
11541
11542 if (argc < 1) {
11543 rb_raise(rb_eArgError, "no receiver given");
11544 }
11545 obj = argv[0];
11546 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11547}
11548
11549/*
11550 * call-seq:
11551 * succ
11552 *
11553 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
11554 *
11555 * :foo.succ # => :fop
11556 *
11557 * Symbol#next is an alias for Symbol#succ.
11558 *
11559 * Related: String#succ.
11560 */
11561
11562static VALUE
11563sym_succ(VALUE sym)
11564{
11565 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11566}
11567
11568/*
11569 * call-seq:
11570 * symbol <=> object -> -1, 0, +1, or nil
11571 *
11572 * If +object+ is a symbol,
11573 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
11574 *
11575 * :bar <=> :foo # => -1
11576 * :foo <=> :foo # => 0
11577 * :foo <=> :bar # => 1
11578 *
11579 * Otherwise, returns +nil+:
11580 *
11581 * :foo <=> 'bar' # => nil
11582 *
11583 * Related: String#<=>.
11584 */
11585
11586static VALUE
11587sym_cmp(VALUE sym, VALUE other)
11588{
11589 if (!SYMBOL_P(other)) {
11590 return Qnil;
11591 }
11592 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11593}
11594
11595/*
11596 * call-seq:
11597 * casecmp(object) -> -1, 0, 1, or nil
11598 *
11599 * :include: doc/symbol/casecmp.rdoc
11600 *
11601 */
11602
11603static VALUE
11604sym_casecmp(VALUE sym, VALUE other)
11605{
11606 if (!SYMBOL_P(other)) {
11607 return Qnil;
11608 }
11609 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11610}
11611
11612/*
11613 * call-seq:
11614 * casecmp?(object) -> true, false, or nil
11615 *
11616 * :include: doc/symbol/casecmp_p.rdoc
11617 *
11618 */
11619
11620static VALUE
11621sym_casecmp_p(VALUE sym, VALUE other)
11622{
11623 if (!SYMBOL_P(other)) {
11624 return Qnil;
11625 }
11626 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11627}
11628
11629/*
11630 * call-seq:
11631 * symbol =~ object -> integer or nil
11632 *
11633 * Equivalent to <tt>symbol.to_s =~ object</tt>,
11634 * including possible updates to global variables;
11635 * see String#=~.
11636 *
11637 */
11638
11639static VALUE
11640sym_match(VALUE sym, VALUE other)
11641{
11642 return rb_str_match(rb_sym2str(sym), other);
11643}
11644
11645/*
11646 * call-seq:
11647 * match(pattern, offset = 0) -> matchdata or nil
11648 * match(pattern, offset = 0) {|matchdata| } -> object
11649 *
11650 * Equivalent to <tt>self.to_s.match</tt>,
11651 * including possible updates to global variables;
11652 * see String#match.
11653 *
11654 */
11655
11656static VALUE
11657sym_match_m(int argc, VALUE *argv, VALUE sym)
11658{
11659 return rb_str_match_m(argc, argv, rb_sym2str(sym));
11660}
11661
11662/*
11663 * call-seq:
11664 * match?(pattern, offset) -> true or false
11665 *
11666 * Equivalent to <tt>sym.to_s.match?</tt>;
11667 * see String#match?.
11668 *
11669 */
11670
11671static VALUE
11672sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11673{
11674 return rb_str_match_m_p(argc, argv, sym);
11675}
11676
11677/*
11678 * call-seq:
11679 * symbol[index] -> string or nil
11680 * symbol[start, length] -> string or nil
11681 * symbol[range] -> string or nil
11682 * symbol[regexp, capture = 0] -> string or nil
11683 * symbol[substring] -> string or nil
11684 *
11685 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
11686 *
11687 */
11688
11689static VALUE
11690sym_aref(int argc, VALUE *argv, VALUE sym)
11691{
11692 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11693}
11694
11695/*
11696 * call-seq:
11697 * length -> integer
11698 *
11699 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
11700 *
11701 * Symbol#size is an alias for Symbol#length.
11702 *
11703 */
11704
11705static VALUE
11706sym_length(VALUE sym)
11707{
11708 return rb_str_length(rb_sym2str(sym));
11709}
11710
11711/*
11712 * call-seq:
11713 * empty? -> true or false
11714 *
11715 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
11716 *
11717 */
11718
11719static VALUE
11720sym_empty(VALUE sym)
11721{
11722 return rb_str_empty(rb_sym2str(sym));
11723}
11724
11725/*
11726 * call-seq:
11727 * upcase(*options) -> symbol
11728 *
11729 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11730 *
11731 * See String#upcase.
11732 *
11733 */
11734
11735static VALUE
11736sym_upcase(int argc, VALUE *argv, VALUE sym)
11737{
11738 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11739}
11740
11741/*
11742 * call-seq:
11743 * downcase(*options) -> symbol
11744 *
11745 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11746 *
11747 * See String#downcase.
11748 *
11749 * Related: Symbol#upcase.
11750 *
11751 */
11752
11753static VALUE
11754sym_downcase(int argc, VALUE *argv, VALUE sym)
11755{
11756 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11757}
11758
11759/*
11760 * call-seq:
11761 * capitalize(*options) -> symbol
11762 *
11763 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11764 *
11765 * See String#capitalize.
11766 *
11767 */
11768
11769static VALUE
11770sym_capitalize(int argc, VALUE *argv, VALUE sym)
11771{
11772 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11773}
11774
11775/*
11776 * call-seq:
11777 * swapcase(*options) -> symbol
11778 *
11779 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11780 *
11781 * See String#swapcase.
11782 *
11783 */
11784
11785static VALUE
11786sym_swapcase(int argc, VALUE *argv, VALUE sym)
11787{
11788 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11789}
11790
11791/*
11792 * call-seq:
11793 * start_with?(*string_or_regexp) -> true or false
11794 *
11795 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
11796 *
11797 */
11798
11799static VALUE
11800sym_start_with(int argc, VALUE *argv, VALUE sym)
11801{
11802 return rb_str_start_with(argc, argv, rb_sym2str(sym));
11803}
11804
11805/*
11806 * call-seq:
11807 * end_with?(*string_or_regexp) -> true or false
11808 *
11809 *
11810 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
11811 *
11812 */
11813
11814static VALUE
11815sym_end_with(int argc, VALUE *argv, VALUE sym)
11816{
11817 return rb_str_end_with(argc, argv, rb_sym2str(sym));
11818}
11819
11820/*
11821 * call-seq:
11822 * encoding -> encoding
11823 *
11824 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
11825 *
11826 */
11827
11828static VALUE
11829sym_encoding(VALUE sym)
11830{
11831 return rb_obj_encoding(rb_sym2str(sym));
11832}
11833
11834static VALUE
11835string_for_symbol(VALUE name)
11836{
11837 if (!RB_TYPE_P(name, T_STRING)) {
11838 VALUE tmp = rb_check_string_type(name);
11839 if (NIL_P(tmp)) {
11840 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11841 name);
11842 }
11843 name = tmp;
11844 }
11845 return name;
11846}
11847
11848ID
11850{
11851 if (SYMBOL_P(name)) {
11852 return SYM2ID(name);
11853 }
11854 name = string_for_symbol(name);
11855 return rb_intern_str(name);
11856}
11857
11858VALUE
11860{
11861 if (SYMBOL_P(name)) {
11862 return name;
11863 }
11864 name = string_for_symbol(name);
11865 return rb_str_intern(name);
11866}
11867
11868/*
11869 * call-seq:
11870 * Symbol.all_symbols -> array_of_symbols
11871 *
11872 * Returns an array of all symbols currently in Ruby's symbol table:
11873 *
11874 * Symbol.all_symbols.size # => 9334
11875 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
11876 *
11877 */
11878
11879static VALUE
11880sym_all_symbols(VALUE _)
11881{
11882 return rb_sym_all_symbols();
11883}
11884
11885VALUE
11887{
11888 return rb_fstring(str);
11889}
11890
11891VALUE
11892rb_interned_str(const char *ptr, long len)
11893{
11894 struct RString fake_str;
11895 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
11896}
11897
11898VALUE
11900{
11901 return rb_interned_str(ptr, strlen(ptr));
11902}
11903
11904VALUE
11905rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
11906{
11907 if (UNLIKELY(rb_enc_autoload_p(enc))) {
11908 rb_enc_autoload(enc);
11909 }
11910
11911 struct RString fake_str;
11912 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
11913}
11914
11915VALUE
11917{
11918 return rb_enc_interned_str(ptr, strlen(ptr), enc);
11919}
11920
11921void
11922Init_String(void)
11923{
11925 assert(rb_vm_fstring_table());
11926 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
11928 rb_define_alloc_func(rb_cString, empty_str_alloc);
11929 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
11930 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
11931 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
11932 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
11933 rb_define_method(rb_cString, "==", rb_str_equal, 1);
11934 rb_define_method(rb_cString, "===", rb_str_equal, 1);
11935 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
11936 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
11937 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
11938 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
11939 rb_define_method(rb_cString, "+", rb_str_plus, 1);
11940 rb_define_method(rb_cString, "*", rb_str_times, 1);
11941 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
11942 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
11943 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
11944 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
11945 rb_define_method(rb_cString, "length", rb_str_length, 0);
11946 rb_define_method(rb_cString, "size", rb_str_length, 0);
11947 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
11948 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
11949 rb_define_method(rb_cString, "=~", rb_str_match, 1);
11950 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
11951 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
11952 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
11953 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
11954 rb_define_method(rb_cString, "next", rb_str_succ, 0);
11955 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
11956 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
11957 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
11958 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
11959 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
11960 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
11961 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
11962 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
11963 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
11964 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
11965 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
11966 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
11967 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
11968 rb_define_method(rb_cString, "scrub", str_scrub, -1);
11969 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
11970 rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
11971 rb_define_method(rb_cString, "+@", str_uplus, 0);
11972 rb_define_method(rb_cString, "-@", str_uminus, 0);
11973 rb_define_alias(rb_cString, "dedup", "-@");
11974
11975 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
11976 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
11977 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
11978 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
11979 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
11980 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
11981 rb_define_method(rb_cString, "undump", str_undump, 0);
11982
11983 sym_ascii = ID2SYM(rb_intern_const("ascii"));
11984 sym_turkic = ID2SYM(rb_intern_const("turkic"));
11985 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
11986 sym_fold = ID2SYM(rb_intern_const("fold"));
11987
11988 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
11989 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
11990 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
11991 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
11992
11993 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
11994 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
11995 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
11996 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
11997
11998 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
11999 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12000 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12001 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12002 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12003 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12004 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12005 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12006 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12007 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12008 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12009 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
12010 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12011 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12012 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12013 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12014 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12015
12016 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12017 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12018 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12019
12020 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12021
12022 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12023 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12024 rb_define_method(rb_cString, "center", rb_str_center, -1);
12025
12026 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12027 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12028 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12029 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12030 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12031 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12032 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12033 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12034 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12035
12036 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12037 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12038 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12039 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12040 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12041 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12042 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12043 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12044 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12045
12046 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12047 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12048 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12049 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12050 rb_define_method(rb_cString, "count", rb_str_count, -1);
12051
12052 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12053 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12054 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12055 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12056
12057 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12058 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12059 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12060 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12061 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12062
12063 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12064
12065 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12066 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12067
12068 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12069 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12070
12071 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12072 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12073 rb_define_method(rb_cString, "b", rb_str_b, 0);
12074 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12075 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12076
12077 /* define UnicodeNormalize module here so that we don't have to look it up */
12078 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12079 id_normalize = rb_intern_const("normalize");
12080 id_normalized_p = rb_intern_const("normalized?");
12081
12082 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12083 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12084 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12085
12086 rb_fs = Qnil;
12087 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12088 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12089 rb_gc_register_address(&rb_fs);
12090
12095 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12096
12097 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12098 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12099 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12100 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
12101 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12102 rb_define_method(rb_cSymbol, "name", rb_sym2str, 0); /* in symbol.c */
12103 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12104 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12105 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12106
12107 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12108 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12109 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12110 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12111
12112 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12113 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12114 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12115 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12116 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12117 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12118 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12119
12120 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12121 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12122 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12123 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12124
12125 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12126 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12127
12128 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12129}
#define RUBY_ASSERT(expr)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition: assert.h:177
#define RUBY_ASSERT_ALWAYS(expr)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition: assert.h:167
Atomic operations.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition: ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
Definition: cxxanyargs.hpp:670
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
Definition: cxxanyargs.hpp:685
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition: sprintf.c:1200
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition: fl_type.h:356
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition: class.c:1090
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition: class.c:888
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition: class.c:998
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition: class.c:2249
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition: class.c:2073
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition: eval.c:864
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition: class.c:2328
#define TYPE(_)
Old name of rb_type.
Definition: value_type.h:107
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition: encoding.h:105
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition: value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition: coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition: coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition: fl_type.h:142
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition: string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition: fl_type.h:67
#define ALLOCV
Old name of RB_ALLOCV.
Definition: memory.h:398
#define ISSPACE
Old name of rb_isspace.
Definition: ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition: value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition: coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition: coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition: xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition: long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition: fl_type.h:145
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition: string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition: assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition: symbol.h:44
#define OBJ_FREEZE_RAW
Old name of RB_OBJ_FREEZE_RAW.
Definition: fl_type.h:144
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition: fl_type.h:143
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition: value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition: assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition: symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition: coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition: globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition: coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition: size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition: fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition: xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition: encoding.h:108
#define LONG2FIX
Old name of RB_INT2FIX.
Definition: long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition: ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition: coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition: memory.h:395
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition: memory.h:393
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition: encoding.h:533
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition: fl_type.h:140
#define FL_SET
Old name of RB_FL_SET.
Definition: fl_type.h:137
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition: array.h:652
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition: encoding.h:66
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition: long.h:50
#define ISALPHA
Old name of rb_isalpha.
Definition: ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition: encoding.h:534
#define ISASCII
Old name of rb_isascii.
Definition: ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition: ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition: st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition: encoding.h:535
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition: fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition: int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition: long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition: coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition: util.h:97
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition: encoding.h:532
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition: fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition: double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition: ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition: value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition: encoding.h:67
#define FL_TEST
Old name of RB_FL_TEST.
Definition: fl_type.h:139
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition: fl_type.h:68
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition: long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition: encoding.h:107
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition: coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition: fl_type.h:141
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition: int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition: encoding.h:109
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition: symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition: array.h:651
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition: coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition: coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition: fl_type.h:138
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition: value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition: fl_type.h:146
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition: value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition: encoding.h:68
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition: error.h:48
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it always reports, regardless of the runtime -W flag.
Definition: error.c:421
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
Definition: error.c:3148
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:684
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition: error.c:3260
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition: error.c:794
VALUE rb_eRangeError
RangeError exception.
Definition: error.c:1095
VALUE rb_eTypeError
TypeError exception.
Definition: error.c:1091
void rb_fatal(const char *fmt,...)
Raises the unsung "fatal" exception.
Definition: error.c:3199
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition: error.c:1098
VALUE rb_eRuntimeError
RuntimeError exception.
Definition: error.c:1089
VALUE rb_eArgError
ArgumentError exception.
Definition: error.c:1092
VALUE rb_eIndexError
IndexError exception.
Definition: error.c:1093
VALUE rb_ensure(VALUE(*b_proc)(VALUE), VALUE data1, VALUE(*e_proc)(VALUE), VALUE data2)
An equivalent to ensure clause.
Definition: eval.c:993
VALUE rb_cObject
Documented in include/ruby/internal/globals.h.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition: object.c:589
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition: object.c:1939
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition: object.c:1194
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition: object.c:3412
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition: object.c:190
VALUE rb_cSymbol
Symbol class.
Definition: string.c:80
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition: object.c:122
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition: object.c:1182
VALUE rb_mComparable
Comparable module.
Definition: compar.c:19
VALUE rb_cString
String class.
Definition: string.c:79
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition: object.c:3022
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition: rgengc.h:220
Encoding-related APIs.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition: string.c:1208
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition: string.c:821
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition: string.c:1074
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition: string.c:2715
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition: string.c:1093
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns an "f"string.
Definition: string.c:11905
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition: re.c:249
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition: string.c:2060
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition: string.c:3260
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition: string.c:1313
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition: string.c:1214
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition: string.c:833
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns an "f"string.
Definition: string.c:11916
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition: string.c:719
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition: symbol.c:407
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition: transcode.c:1453
rb_econv_result_t
return value of rb_econv_convert()
Definition: transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition: transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition: transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition: transcode.c:2630
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition: transcode.c:2884
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition: transcode.c:1709
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition: vm_eval.c:1190
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition: enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition: enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition: error.h:35
#define rb_check_frozen
Just another name of rb_check_frozen_inline.
Definition: error.h:264
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition: string.c:604
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition: io.c:200
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition: vm.c:1662
VALUE rb_sym_all_symbols(void)
Collects every single bit of the symbols that have ever been interned in the entire history of the current process.
Definition: symbol.c:1009
void rb_backref_set(VALUE md)
Updates $~.
Definition: vm.c:1668
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition: range.c:1578
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition: re.c:1229
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition: re.c:4107
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition: re.c:3590
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition: re.c:1435
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition: re.c:1861
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby string instead of a C string.
Definition: string.c:11886
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition: string.c:1571
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition: string.c:1376
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition: string.c:2211
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition: string.h:1583
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition: string.c:3323
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition: string.c:1289
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition: string.c:11532
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition: string.c:2283
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition: string.c:1265
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition: string.c:1565
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition: string.c:2743
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition: string.c:4793
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition: string.c:3547
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition: string.c:2825
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition: string.c:10825
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition: random.c:1741
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition: string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition: string.c:1618
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition: string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition: string.c:1056
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition: string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition: string.c:871
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition: string.c:1382
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition: string.c:1834
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
Definition: string.c:2437
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition: string.c:3537
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition: string.c:3149
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition: string.c:2149
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition: string.c:1840
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition: string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition: string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition: string.c:5995
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition: string.c:2833
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition: string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:11899
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition: string.c:1295
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition: string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition: string.c:3291
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition: string.c:2790
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition: string.c:3649
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3019
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition: string.c:6677
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition: string.c:2488
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition: string.c:11892
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition: string.c:3603
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition: string.c:3423
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition: string.c:3578
#define rb_strlen_lit(str)
Length of a string literal.
Definition: string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition: string.c:3267
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition: string.c:2941
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition: string.c:5297
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition: string.c:10883
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition: string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition: string.c:1513
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition: string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition: string.c:2639
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition: string.c:2920
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition: string.c:3002
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3036
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition: string.c:1068
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition: string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition: string.c:2445
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition: string.c:6791
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition: string.c:1277
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition: string.c:1532
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition: string.c:2163
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition: string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition: string.c:5223
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition: string.c:8861
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition: string.c:1062
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver is an instance of RString.
Definition: symbol.c:844
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition: string.c:1682
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition: vm_method.c:2823
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition: vm_method.c:1159
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition: symbol.c:942
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition: string.c:11859
ID rb_to_id(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition: string.c:11849
ID rb_intern_str(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition: symbol.c:795
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition: re.c:1765
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition: re.c:3369
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition: re.c:4351
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition: sprintf.c:214
VALUE rb_yield(VALUE val)
Yields the block.
Definition: vm_eval.c:1358
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition: memory.h:366
#define ALLOCA_N(type, n)
Definition: memory.h:286
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition: memory.h:354
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition: memory.h:161
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
Definition: cxxanyargs.hpp:136
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition: rarray.h:69
#define RBASIC(obj)
Convenient casting macro.
Definition: rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition: rdata.h:71
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition: rgengc.h:107
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition: rmatch.h:139
@ RSTRING_EMBED_LEN_MAX
Max possible number of characters that can be embedded.
Definition: rstring.h:215
#define StringValue(v)
Ensures that the parameter object is a String.
Definition: rstring.h:72
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition: string.c:1307
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition: string.c:2616
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition: rstring.h:574
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition: string.c:2500
#define RSTRING(obj)
Convenient casting macro.
Definition: rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition: string.c:1301
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition: string.c:2511
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition: string.c:1609
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition: rstring.h:95
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition: rtypeddata.h:441
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition: load.c:1306
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition: stdarg.h:35
VALUE flags
Per-object flags.
Definition: rbasic.h:77
Ruby's String.
Definition: rstring.h:231
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use separated memory region for contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
Definition: rstring.h:234
long capa
Capacity of *ptr.
Definition: rstring.h:268
struct RString::@50::@52 embed
Embedded contents.
char ary[RSTRING_EMBED_LEN_MAX+1]
When a string is short enough, it uses this area to store the contents themselves.
Definition: rstring.h:298
long len
Length of the string, not including terminating NUL character.
Definition: rstring.h:250
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
Definition: rstring.h:276
char * ptr
Pointer to the contents of the string.
Definition: rstring.h:258
Definition: st.h:79
Definition: string.c:7746
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition: thread.c:299
uintptr_t VALUE
Type that represents a Ruby object.
Definition: value.h:40
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition: value.h:52