# $Release Version: 1.1.2 $
# $Id: scanf.rb 44961 2014-02-15 01:33:03Z ryan $
# scanf is an implementation of the C function scanf(3), modified as necessary
# for Ruby compatibility.
# the methods provided are String#scanf, IO#scanf, and
# Kernel#scanf. Kernel#scanf is a wrapper around STDIN.scanf. IO#scanf
# can be used on any IO stream, including file handles and sockets.
# scanf can be called either with or without a block.
# Scanf scans an input string or stream according to a <b>format</b>, as
# described below in Conversions, and returns an array of matches between
# the format and the input. The format is defined in a string, and is
# similar (though not identical) to the formats used in Kernel#printf and
# The format may contain <b>conversion specifiers</b>, which tell scanf
# what form (type) each particular matched substring should be converted
# to (e.g., decimal integer, floating point number, literal string,
# etc.) The matches and conversions take place from left to right, and
# the conversions themselves are returned as an array.
# The format string may also contain characters other than those in the
# conversion specifiers. White space (blanks, tabs, or newlines) in the
# format string matches any amount of white space, including none, in
# the input. Everything else matches only itself.
# Scanning stops, and scanf returns, when any input character fails to
# match the specifications in the format string, or when input is
# exhausted, or when everything in the format string has been
# matched. All matches found up to the stopping point are returned in
# the return array (or yielded to the block, if a block was given).
# # String#scanf and IO#scanf take a single argument, the format string
# array = a_string.scanf("%d%s")
# array = an_io.scanf("%d%s")
# # Kernel#scanf reads from STDIN
# When called with a block, scanf keeps scanning the input, cycling back
# to the beginning of the format string, and yields a new array of
# conversions to the block every time the format string is matched
# (including partial matches, but not including complete failures). The
# actual return value of scanf when called with a block is an array
# containing the results of all the executions of the block.
# str = "123 abc 456 def 789 ghi"
# str.scanf("%d%s") { |num,str| [ num * 2, str.upcase ] }
# # => [[246, "ABC"], [912, "DEF"], [1578, "GHI"]]
# The single argument to scanf is a format string, which generally
# includes one or more conversion specifiers. Conversion specifiers
# begin with the percent character ('%') and include information about
# what scanf should next scan for (string, decimal number, single
# There may be an optional maximum field width, expressed as a decimal
# integer, between the % and the conversion. If no width is given, a
# default of `infinity' is used (with the exception of the %c specifier;
# see below). Otherwise, given a field width of <em>n</em> for a given
# conversion, at most <em>n</em> characters are scanned in processing
# that conversion. Before conversion begins, most conversions skip
# white space in the input string; this white space is not counted
# against the field width.
# The following conversions are available.
# Matches a literal `%'. That is, `%%' in the format string matches a
# single input `%' character. No conversion is done, and the resulting
# '%' is not included in the return array.
# Matches an optionally signed decimal integer.
# Matches an optionally signed integer. The integer is read in base
# 16 if it begins with `0x' or `0X', in base 8 if it begins with `0',
# and in base 10 other- wise. Only characters that correspond to the
# Matches an optionally signed octal integer.
# Matches an optionally signed hexadecimal integer,
# [a, e, f, g, A, E, F, G]
# Matches an optionally signed floating-point number.
# Matches a sequence of non-white-space character. The input string stops at
# white space or at the maximum field width, whichever occurs first.
# Matches a single character, or a sequence of <em>n</em> characters if a
# field width of <em>n</em> is specified. The usual skip of leading white
# space is suppressed. To skip white space first, use an explicit space in
# Matches a nonempty sequence of characters from the specified set
# of accepted characters. The usual skip of leading white space is
# suppressed. This bracketed sub-expression is interpreted exactly like a
# character class in a Ruby regular expression. (In fact, it is placed as-is
# in a regular expression.) The matching against the input string ends with
# the appearance of a character not in (or, with a circumflex, in) the set,
# or when the field width runs out, whichever comes first.
# === Assignment suppression
# To require that a particular match occur, but without including the result
# in the return array, place the <b>assignment suppression flag</b>, which is
# the star character ('*'), immediately after the leading '%' of a format
# specifier (just before the field width, if any).
# == scanf for Ruby compared with scanf in C
# scanf for Ruby is based on the C function scanf(3), but with modifications,
# dictated mainly by the underlying differences between the languages.
# === Unimplemented flags and specifiers
# * The only flag implemented in scanf for Ruby is '<tt>*</tt>' (ignore
# upcoming conversion). Many of the flags available in C versions of
# scanf(3) have to do with the type of upcoming pointer arguments, and are
# * The <tt>n</tt> specifier (store number of characters consumed so far in
# next pointer) is not implemented.
# * The <tt>p</tt> specifier (match a pointer value) is not implemented.
# In scanf for Ruby, all of these specifiers scan for an optionally signed
# integer, rather than for an unsigned integer like their C counterparts.
# scanf for Ruby returns an array of successful conversions, whereas
# scanf(3) returns the number of conversions successfully
# completed. (See below for more details on scanf for Ruby's return
# Without a block, scanf returns an array containing all the conversions
# it has found. If none are found, scanf will return an empty array. An
# unsuccessful match is never ignored, but rather always signals the end
# of the scanning operation. If the first unsuccessful match takes place
# after one or more successful matches have already taken place, the
# returned array will contain the results of those successful matches.
# With a block scanf returns a 'map'-like array of transformations from
# the block -- that is, an array reflecting what the block did with each
# yielded result from the iterative scanf operation. (See "Block
# == Current limitations and bugs
# When using IO#scanf under Windows, make sure you open your files in
# File.open("filename", "rb")
# so that scanf can keep track of characters correctly.
# Support for character classes is reasonably complete (since it
# essentially piggy-backs on Ruby's regular expression handling of
# character classes), but users are advised that character class testing
# has not been exhaustive, and that they should exercise some caution
# in using any of the more complex and/or arcane character class
# == License and copyright
# Copyright:: (c) 2002-2003 David Alan Black
# License:: Distributed on the same licensing terms as Ruby itself
# This software is provided "as is" and without any express or implied
# warranties, including, without limitation, the implied warranties of
# merchantability and fitness for a particular purpose.
# == Credits and acknowledgements
# scanf was developed as the major activity of the Austin Ruby Codefest
# (Austin, Texas, August 2002).
# Principal author:: David Alan Black (mailto:dblack@superlink.net)
# Co-author:: Hal Fulton (mailto:hal9000@hypermetrics.com)
# Project contributors:: Nolan Darilek, Jason Johnston
# Thanks to Hal Fulton for hosting the Codefest.
# Thanks to Matz for suggestions about the class design.
# Thanks to Gavin Sinclair for some feedback on the documentation.
# The text for parts of this document, especially the Description and
# Conversions sections, above, were adapted from the Linux Programmer's
# Manual manpage for scanf(3), dated 1995-11-01.
# == Bugs and bug reports
# scanf for Ruby is based on something of an amalgam of C scanf
# implementations and documentation, rather than on a single canonical
# description. Suggestions for features and behaviors which appear in
# other scanfs, and would be meaningful in Ruby, are welcome, as are
# reports of suspicious behaviors and/or bugs. (Please see "Credits and
# acknowledgements", above, for email addresses.)
# ===Rationale behind scanf for Ruby
# The impetus for a scanf implementation in Ruby comes chiefly from the fact
# that existing pattern matching operations, such as Regexp#match and
# String#scan, return all results as strings, which have to be converted to
# integers or floats explicitly in cases where what's ultimately wanted are
# integer or float values.
# ===Design of scanf for Ruby
# scanf for Ruby is essentially a <format string>-to-<regular
# When scanf is called, a FormatString object is generated from the
# format string ("%d%s...") argument. The FormatString object breaks the
# format string down into atoms ("%d", "%5f", "blah", etc.), and from
# each atom it creates a FormatSpecifier object, which it
# Each FormatSpecifier has a regular expression fragment and a "handler"
# associated with it. For example, the regular expression fragment
# associated with the format "%d" is "([-+]?\d+)", and the handler
# associated with it is a wrapper around String#to_i. scanf itself calls
# FormatString#match, passing in the input string. FormatString#match
# iterates through its FormatSpecifiers; for each one, it matches the
# corresponding regular expression fragment against the string. If
# there's a match, it sends the matched string to the handler associated
# with the FormatSpecifier.
# Thus, to follow up the "%d" example: if "123" occurs in the input
# string when a FormatSpecifier consisting of "%d" is reached, the "123"
# will be matched against "([-+]?\d+)", and the matched string will be
# rendered into an integer by a call to to_i.
# The rendered match is then saved to an accumulator array, and the
# input string is reduced to the post-match substring. Thus the string
# is "eaten" from the left as the FormatSpecifiers are applied in
# sequence. (This is done to a duplicate string; the original string is
# As soon as a regular expression fragment fails to match the string, or
# when the FormatString object runs out of FormatSpecifiers, scanning
# stops and results accumulated so far are returned in an array.
attr_reader :re_string, :matched_string, :conversion, :matched
def skip; /^\s*%\*/.match(@spec_string); end
return nil unless s &&! skip
if /\A(?<sign>[-+]?)0[xX](?<frac>\.\h+|\h+(?:\.\h*)?)[pP](?<exp>[-+]\d+)/ =~ s
f += f2.hex / (16.0 ** len)
(sign == ?- ? -1 : 1) * Math.ldexp(f, exp.to_i)
elsif /\A([-+]?\d+)\.([eE][-+]\d+)/ =~ s
def extract_decimal(s); s.to_i if s &&! skip; end
def extract_hex(s); s.hex if s &&! skip; end
def extract_octal(s); s.oct if s &&! skip; end
def extract_integer(s); Integer(s) if s &&! skip; end
def extract_plain(s); s unless skip; end
def nil_proc(s); nil; end
/(?:\A|\S)%\*?\d*c|%\d*\[/.match(@spec_string)
when /%\*?(\[\[:[a-z]+:\]\])/
[ "(#{$1}+)", :extract_plain ]
when /%\*?(\d+)(\[\[:[a-z]+:\]\])/
[ "(#{$2}{1,#{$1}})", :extract_plain ]
if /^\^/.match(yes) then no = yes[1..-1] else no = '^' + yes end
[ "([#{yes}]+)(?=[#{no}]|\\z)", :extract_plain ]
when /%\*?(\d+)\[([^\]]*)\]/
[ "([#{yes}]{1,#{w}})", :extract_plain ]
[ "([-+]?(?:(?:0[0-7]+)|(?:0[Xx]#{h}+)|(?:[1-9]\\d*)))", :extract_integer ]
if n > 1 then s += "[1-9]\\d{1,#{n-1}}|" end
if n > 1 then s += "0[0-7]{1,#{n-1}}|" end
if n > 2 then s += "[-+]0[0-7]{1,#{n-2}}|" end
if n > 2 then s += "[-+][1-9]\\d{1,#{n-2}}|" end
if n > 2 then s += "0[Xx]#{h}{1,#{n-2}}|" end
if n > 3 then s += "[-+]0[Xx]#{h}{1,#{n-3}}|" end
[ '([-+]?\d+)', :extract_decimal ]
if n > 1 then s += "[-+]\\d{1,#{n-1}}|" end
[ "([-+]?(?:0[Xx])?#{h}+)", :extract_hex ]
if n > 3 then s += "[-+]0[Xx]#{h}{1,#{n-3}}|" end
if n > 2 then s += "0[Xx]#{h}{1,#{n-2}}|" end
if n > 1 then s += "[-+]#{h}{1,#{n-1}}|" end
[ '([-+]?[0-7]+)', :extract_octal ]
[ "([-+][0-7]{1,#{$1.to_i-1}}|[0-7]{1,#{$1}})", :extract_octal ]
[ '([-+]?(?:0[xX](?:\.\h+|\h+(?:\.\h*)?)[pP][-+]\d+|\d+(?![\d.])|\d*\.\d*(?:[eE][-+]?\d+)?))', :extract_float ]
when /%\*?(\d+)[aefgAEFG]/
[ '(?=[-+]?(?:0[xX](?:\.\h+|\h+(?:\.\h*)?)[pP][-+]\d+|\d+(?![\d.])|\d*\.\d*(?:[eE][-+]?\d+)?))' +
"(\\S{1,#{$1}})", :extract_float ]
[ "(\\S{1,#{$1}})", :extract_plain ]
[ '(\S+)', :extract_plain ]
[ "\\s*(.)", :extract_plain ]
[ "(.)", :extract_plain ]
# %5c (whitespace issues are handled by the count_*_space? methods)
[ "(.{1,#{$1}})", :extract_plain ]
[ "(#{Regexp.escape(@spec_string)})", :nil_proc ]
@re_string = '\A' + @re_string
Regexp.new(@re_string,Regexp::MULTILINE)
s.sub!(/\A\s+/,'') unless count_space?
@conversion = send(@handler, res[1])
@matched_string = @conversion.to_s
@spec_string[/%\*?\d*([a-z\[])/, 1]
w = @spec_string[/%\*?(\d+)/, 1]
return false unless @matched
cc_no_width = letter == '[' &&! width
c_or_cc_width = (letter == 'c' || letter == '[') && width
width_left = c_or_cc_width && (matched_string.size < width)
return width_left || cc_no_width
attr_reader :string_left, :last_spec_tried,
:last_match_tried, :matched_count, :space
SPECIFIERS = 'diuXxofFeEgGscaA'
# possible space, followed by...
# percent sign, followed by...
# another percent sign, or...