diff options
author | Pierre Weis <Pierre.Weis@inria.fr> | 2010-05-02 18:38:35 +0000 |
---|---|---|
committer | Pierre Weis <Pierre.Weis@inria.fr> | 2010-05-02 18:38:35 +0000 |
commit | 8fdedc90353173fbc6c9f59ce7a7e32d97f2baf8 (patch) | |
tree | 3d377f31de018c4f2aed788f1d2e4398913eebcd /stdlib | |
parent | ec31aa44145d64613dedc83ec2f2bd5fd9f219c8 (diff) |
PR#4983. Test suite succesfully passed.
git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@10344 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02
Diffstat (limited to 'stdlib')
-rw-r--r-- | stdlib/scanf.ml | 972 |
1 files changed, 531 insertions, 441 deletions
diff --git a/stdlib/scanf.ml b/stdlib/scanf.ml index c9652b588..ac87aad64 100644 --- a/stdlib/scanf.ml +++ b/stdlib/scanf.ml @@ -18,379 +18,379 @@ (* Scanning buffers. *) module type SCANNING = sig -type in_channel;; - -type scanbuf = in_channel;; - -val stdin : in_channel;; -(* The scanning buffer reading from [Pervasives.stdin]. - [stdib] is equivalent to [Scanning.from_channel Pervasives.stdin]. *) - -val stdib : in_channel;; -(* An alias for [Scanf.stdin], the scanning buffer reading from - [Pervasives.stdin]. *) - -val next_char : scanbuf -> char;; -(* [Scanning.next_char ib] advance the scanning buffer for - one character. - If no more character can be read, sets a end of file condition and - returns '\000'. *) - -val invalidate_current_char : scanbuf -> unit;; -(* [Scanning.invalidate_current_char ib] mark the current_char as already - scanned. *) - -val peek_char : scanbuf -> char;; -(* [Scanning.peek_char ib] returns the current char available in - the buffer or reads one if necessary (when the current character is - already scanned). - If no character can be read, sets an end of file condition and - returns '\000'. *) - -val checked_peek_char : scanbuf -> char;; -(* Same as above but always returns a valid char or fails: - instead of returning a null char when the reading method of the - input buffer has reached an end of file, the function raises exception - [End_of_file]. *) - -val store_char : int -> scanbuf -> char -> int;; -(* [Scanning.store_char lim ib c] adds [c] to the token buffer - of the scanning buffer. It also advances the scanning buffer for one - character and returns [lim - 1], indicating the new limit - for the length of the current token. *) - -val skip_char : int -> scanbuf -> int;; -(* [Scanning.skip_char lim ib] ignores the current character. *) - -val ignore_char : int -> scanbuf -> int;; -(* [Scanning.ignore_char ib lim] ignores the current character and - decrements the limit. *) - -val token : scanbuf -> string;; -(* [Scanning.token ib] returns the string stored into the token - buffer of the scanning buffer: it returns the token matched by the - format. *) - -val reset_token : scanbuf -> unit;; -(* [Scanning.reset_token ib] resets the token buffer of - the given scanning buffer. *) - -val char_count : scanbuf -> int;; -(* [Scanning.char_count ib] returns the number of characters - read so far from the given buffer. *) - -val line_count : scanbuf -> int;; -(* [Scanning.line_count ib] returns the number of new line - characters read so far from the given buffer. *) - -val token_count : scanbuf -> int;; -(* [Scanning.token_count ib] returns the number of tokens read - so far from [ib]. *) - -val eof : scanbuf -> bool;; -(* [Scanning.eof ib] returns the end of input condition - of the given buffer. *) - -val end_of_input : scanbuf -> bool;; -(* [Scanning.end_of_input ib] tests the end of input condition - of the given buffer (if no char has ever been read, an attempt to - read one is performed). *) - -val beginning_of_input : scanbuf -> bool;; -(* [Scanning.beginning_of_input ib] tests the beginning of input - condition of the given buffer. *) - -val name_of_input : scanbuf -> string;; -(* [Scanning.name_of_input ib] returns the name of the character - source for input buffer [ib]. *) - -val open_in : string -> scanbuf;; -val open_in_bin : string -> scanbuf;; -val from_file : string -> scanbuf;; -val from_file_bin : string -> scanbuf;; -val from_string : string -> scanbuf;; -val from_function : (unit -> char) -> scanbuf;; -val from_channel : Pervasives.in_channel -> scanbuf;; - -val close_in : scanbuf -> unit;; + type in_channel;; + + type scanbuf = in_channel;; + + val stdin : in_channel;; + (* The scanning buffer reading from [Pervasives.stdin]. + [stdib] is equivalent to [Scanning.from_channel Pervasives.stdin]. *) + + val stdib : in_channel;; + (* An alias for [Scanf.stdin], the scanning buffer reading from + [Pervasives.stdin]. *) + + val next_char : scanbuf -> char;; + (* [Scanning.next_char ib] advance the scanning buffer for + one character. + If no more character can be read, sets a end of file condition and + returns '\000'. *) + + val invalidate_current_char : scanbuf -> unit;; + (* [Scanning.invalidate_current_char ib] mark the current_char as already + scanned. *) + + val peek_char : scanbuf -> char;; + (* [Scanning.peek_char ib] returns the current char available in + the buffer or reads one if necessary (when the current character is + already scanned). + If no character can be read, sets an end of file condition and + returns '\000'. *) + + val checked_peek_char : scanbuf -> char;; + (* Same as above but always returns a valid char or fails: + instead of returning a null char when the reading method of the + input buffer has reached an end of file, the function raises exception + [End_of_file]. *) + + val store_char : int -> scanbuf -> char -> int;; + (* [Scanning.store_char lim ib c] adds [c] to the token buffer + of the scanning buffer. It also advances the scanning buffer for one + character and returns [lim - 1], indicating the new limit + for the length of the current token. *) + + val skip_char : int -> scanbuf -> int;; + (* [Scanning.skip_char lim ib] ignores the current character. *) + + val ignore_char : int -> scanbuf -> int;; + (* [Scanning.ignore_char ib lim] ignores the current character and + decrements the limit. *) + + val token : scanbuf -> string;; + (* [Scanning.token ib] returns the string stored into the token + buffer of the scanning buffer: it returns the token matched by the + format. *) + + val reset_token : scanbuf -> unit;; + (* [Scanning.reset_token ib] resets the token buffer of + the given scanning buffer. *) + + val char_count : scanbuf -> int;; + (* [Scanning.char_count ib] returns the number of characters + read so far from the given buffer. *) + + val line_count : scanbuf -> int;; + (* [Scanning.line_count ib] returns the number of new line + characters read so far from the given buffer. *) + + val token_count : scanbuf -> int;; + (* [Scanning.token_count ib] returns the number of tokens read + so far from [ib]. *) + + val eof : scanbuf -> bool;; + (* [Scanning.eof ib] returns the end of input condition + of the given buffer. *) + + val end_of_input : scanbuf -> bool;; + (* [Scanning.end_of_input ib] tests the end of input condition + of the given buffer (if no char has ever been read, an attempt to + read one is performed). *) + + val beginning_of_input : scanbuf -> bool;; + (* [Scanning.beginning_of_input ib] tests the beginning of input + condition of the given buffer. *) + + val name_of_input : scanbuf -> string;; + (* [Scanning.name_of_input ib] returns the name of the character + source for input buffer [ib]. *) + + val open_in : string -> scanbuf;; + val open_in_bin : string -> scanbuf;; + val from_file : string -> scanbuf;; + val from_file_bin : string -> scanbuf;; + val from_string : string -> scanbuf;; + val from_function : (unit -> char) -> scanbuf;; + val from_channel : Pervasives.in_channel -> scanbuf;; + + val close_in : scanbuf -> unit;; end ;; module Scanning : SCANNING = struct -(* The run-time library for scanf. *) -type in_channel_name = - | From_file of string * Pervasives.in_channel - | From_string - | From_function - | From_channel of Pervasives.in_channel -;; - -type in_channel = { - mutable eof : bool; - mutable current_char : char; - mutable current_char_is_valid : bool; - mutable char_count : int; - mutable line_count : int; - mutable token_count : int; - mutable get_next_char : unit -> char; - tokbuf : Buffer.t; - input_name : in_channel_name; -} -;; - -type scanbuf = in_channel;; - -let null_char = '\000';; - -(* Reads a new character from input buffer. Next_char never fails, - even in case of end of input: it then simply sets the end of file - condition. *) -let next_char ib = - try - let c = ib.get_next_char () in - ib.current_char <- c; - ib.current_char_is_valid <- true; - ib.char_count <- succ ib.char_count; - if c = '\n' then ib.line_count <- succ ib.line_count; - c with - | End_of_file -> - let c = null_char in - ib.current_char <- c; - ib.current_char_is_valid <- false; - ib.eof <- true; + (* The run-time library for scanf. *) + type in_channel_name = + | From_file of string * Pervasives.in_channel + | From_string + | From_function + | From_channel of Pervasives.in_channel + ;; + + type in_channel = { + mutable eof : bool; + mutable current_char : char; + mutable current_char_is_valid : bool; + mutable char_count : int; + mutable line_count : int; + mutable token_count : int; + mutable get_next_char : unit -> char; + tokbuf : Buffer.t; + input_name : in_channel_name; + } + ;; + + type scanbuf = in_channel;; + + let null_char = '\000';; + + (* Reads a new character from input buffer. Next_char never fails, + even in case of end of input: it then simply sets the end of file + condition. *) + let next_char ib = + try + let c = ib.get_next_char () in + ib.current_char <- c; + ib.current_char_is_valid <- true; + ib.char_count <- succ ib.char_count; + if c = '\n' then ib.line_count <- succ ib.line_count; + c with + | End_of_file -> + let c = null_char in + ib.current_char <- c; + ib.current_char_is_valid <- false; + ib.eof <- true; + c + ;; + + let peek_char ib = + if ib.current_char_is_valid then ib.current_char else next_char ib;; + + (* Returns a valid current char for the input buffer. In particular + no irrelevant null character (as set by [next_char] in case of end + of input) is returned, since [End_of_file] is raised when + [next_char] sets the end of file condition while trying to read a + new character. *) + let checked_peek_char ib = + let c = peek_char ib in + if ib.eof then raise End_of_file; c -;; - -let peek_char ib = - if ib.current_char_is_valid then ib.current_char else next_char ib;; - -(* Returns a valid current char for the input buffer. In particular - no irrelevant null character (as set by [next_char] in case of end - of input) is returned, since [End_of_file] is raised when - [next_char] sets the end of file condition while trying to read a - new character. *) -let checked_peek_char ib = - let c = peek_char ib in - if ib.eof then raise End_of_file; - c -;; - -let end_of_input ib = - ignore (peek_char ib); - ib.eof -;; - -let eof ib = ib.eof;; - -let beginning_of_input ib = ib.char_count = 0;; -let name_of_input ib = - match ib.input_name with - | From_file (fname, _ic) -> fname - | From_string -> "unnamed character string" - | From_function -> "unnamed function" - | From_channel _ic -> "unnamed pervasives input channel" -;; - -let char_count ib = - if ib.current_char_is_valid then ib.char_count - 1 else ib.char_count -;; -let line_count ib = ib.line_count;; -let reset_token ib = Buffer.reset ib.tokbuf;; -let invalidate_current_char ib = ib.current_char_is_valid <- false;; - -let token ib = - let tokbuf = ib.tokbuf in - let tok = Buffer.contents tokbuf in - Buffer.clear tokbuf; - ib.token_count <- succ ib.token_count; - tok -;; - -let token_count ib = ib.token_count;; - -let skip_char max ib = - invalidate_current_char ib; - max -;; - -let ignore_char max ib = skip_char (max - 1) ib;; - -let store_char max ib c = - Buffer.add_char ib.tokbuf c; - ignore_char max ib -;; - -let default_token_buffer_size = 1024;; - -let create iname next = { - eof = false; - current_char = null_char; - current_char_is_valid = false; - char_count = 0; - line_count = 0; - token_count = 0; - get_next_char = next; - tokbuf = Buffer.create default_token_buffer_size; - input_name = iname; -} -;; - -let from_string s = - let i = ref 0 in - let len = String.length s in - let next () = - if !i >= len then raise End_of_file else - let c = s.[!i] in - incr i; - c in - create From_string next -;; - -let from_function = create From_function;; - -(* Scanning from an input channel. *) - -(* Position of the problem: - - We cannot prevent the scanning mechanism to use one lookahead character, - if needed by the semantics of the format string specifications (e.g. a - trailing ``skip space'' specification in the format string); in this case, - the mandatory lookahead character is indeed read from the input and not - used to return the token read. It is thus mandatory to be able to store - an unused lookahead character somewhere to get it as the first character - of the next scan. - - To circumvent this problem, all the scanning functions get a low level - input buffer argument where they store the lookahead character when - needed; additionally, the input buffer is the only source of character of - a scanner. The [scanbuf] input buffers are defined in module {!Scanning}. - - Now we understand that it is extremely important that related successive - calls to scanners indeed read from the same input buffer. In effect, if a - scanner [scan1] is reading from [ib1] and stores an unused lookahead - character [c1] into its input buffer [ib1], then another scanner [scan2] - not reading from the same buffer [ib1] will miss the character [c], - seemingly vanished in the air from the point of view of [scan2]. - - This mechanism works perfectly to read from strings, from files, and from - functions, since in those cases, allocating two buffers reading from the - same source is unnatural. - - Still, there is a difficulty in the case of scanning from an input - channel. In effect, when scanning from an input channel [ic], this channel - may not have been allocated from within this library. Hence, it may be - shared (two functions of the user's program may successively read from - [ic]). This is highly error prone since, one of the function may seek the - input channel, while the other function has still an unused lookahead - character in its input buffer. In conclusion, you should never mix direct - low level reading and high level scanning from the same input channel. - - This phenomenon of reading mess is even worse when one defines more than - one scanning buffer reading from the same input channel - [ic]. Unfortunately, we have no simple way to get rid of this problem - (unless the basic input channel API is modified to offer a ``consider this - char as unread'' procedure to keep back the unused lookahead character as - available in the input channel for further reading). - - To prevent some of the confusion the scanning buffer allocation function - is a memo function that never allocates two different scanning buffers for - the same input channel. This way, the user can naively perform successive - call to [fscanf] below, without allocating a new scanning buffer at each - invocation and hence preserving the expected semantics. - - As mentioned above, a more ambitious fix could be to change the input - channel API to allow arbitrary mixing of direct and formatted reading from - input channels. *) - -(* Perform bufferized input to improve efficiency. *) -let file_buffer_size = ref 1024;; - -(* The scanner closes the input channel at end of input. *) -let scan_close_at_end ic = close_in ic; raise End_of_file;; - -(* The scanner does not close the input channel at end of input: - it just raises [End_of_file]. *) -let scan_raise_at_end _ic = raise End_of_file;; - -let from_ic scan_close_ic iname ic = - let len = !file_buffer_size in - let buf = String.create len in - let i = ref 0 in - let lim = ref 0 in - let eof = ref false in - let next () = - if !i < !lim then begin let c = buf.[!i] in incr i; c end else - if !eof then raise End_of_file else begin - lim := input ic buf 0 len; - if !lim = 0 then begin eof := true; scan_close_ic ic end else begin - i := 1; - buf.[0] - end - end in - create iname next -;; - -let from_ic_close_at_end = from_ic scan_close_at_end;; - -(* The scanning buffer reading from [Pervasives.stdin]. - One could try to define [stdib] as a scanning buffer reading a character at a - time (no bufferization at all), but unfortunately the top-level - interaction would be wrong. - This is due to some kind of ``race condition'' when reading from [Pervasives.stdin], - since the interactive compiler and [scanf] will simultaneously read the - material they need from [Pervasives.stdin]; then, confusion will result from what should - be read by the top-level and what should be read by [scanf]. - This is even more complicated by the one character lookahead that [scanf] - is sometimes obliged to maintain: the lookahead character will be available - for the next ([scanf]) entry, seemingly coming from nowhere. - Also no [End_of_file] is raised when reading from stdin: if not enough - characters have been read, we simply ask to read more. *) -let stdin = - from_ic scan_raise_at_end - (From_file ("-", Pervasives.stdin)) Pervasives.stdin -;; - -let stdib = stdin;; - -let open_in fname = - match fname with - | "-" -> stdin - | fname -> - let ic = open_in fname in - from_ic_close_at_end (From_file (fname, ic)) ic -;; - -let open_in_bin fname = - match fname with - | "-" -> stdin - | fname -> - let ic = open_in_bin fname in - from_ic_close_at_end (From_file (fname, ic)) ic -;; - -let from_file = open_in;; -let from_file_bin = open_in_bin;; - -let memo_from_ic = - let memo = ref [] in - (fun scan_close_ic ic -> - try List.assq ic !memo with - | Not_found -> - let ib = from_ic scan_close_ic (From_channel ic) ic in - memo := (ic, ib) :: !memo; - ib) -;; - -let from_channel = memo_from_ic scan_raise_at_end;; - -let close_in ib = - match ib.input_name with - | From_file (_fname, ic) -> Pervasives.close_in ic - | From_string | From_function -> () - | From_channel ic -> Pervasives.close_in ic -;; + ;; + + let end_of_input ib = + ignore (peek_char ib); + ib.eof + ;; + + let eof ib = ib.eof;; + + let beginning_of_input ib = ib.char_count = 0;; + let name_of_input ib = + match ib.input_name with + | From_file (fname, _ic) -> fname + | From_string -> "unnamed character string" + | From_function -> "unnamed function" + | From_channel _ic -> "unnamed pervasives input channel" + ;; + + let char_count ib = + if ib.current_char_is_valid then ib.char_count - 1 else ib.char_count + ;; + let line_count ib = ib.line_count;; + let reset_token ib = Buffer.reset ib.tokbuf;; + let invalidate_current_char ib = ib.current_char_is_valid <- false;; + + let token ib = + let tokbuf = ib.tokbuf in + let tok = Buffer.contents tokbuf in + Buffer.clear tokbuf; + ib.token_count <- succ ib.token_count; + tok + ;; + + let token_count ib = ib.token_count;; + + let skip_char max ib = + invalidate_current_char ib; + max + ;; + + let ignore_char max ib = skip_char (max - 1) ib;; + + let store_char max ib c = + Buffer.add_char ib.tokbuf c; + ignore_char max ib + ;; + + let default_token_buffer_size = 1024;; + + let create iname next = { + eof = false; + current_char = null_char; + current_char_is_valid = false; + char_count = 0; + line_count = 0; + token_count = 0; + get_next_char = next; + tokbuf = Buffer.create default_token_buffer_size; + input_name = iname; + } + ;; + + let from_string s = + let i = ref 0 in + let len = String.length s in + let next () = + if !i >= len then raise End_of_file else + let c = s.[!i] in + incr i; + c in + create From_string next + ;; + + let from_function = create From_function;; + + (* Scanning from an input channel. *) + + (* Position of the problem: + + We cannot prevent the scanning mechanism to use one lookahead character, + if needed by the semantics of the format string specifications (e.g. a + trailing ``skip space'' specification in the format string); in this case, + the mandatory lookahead character is indeed read from the input and not + used to return the token read. It is thus mandatory to be able to store + an unused lookahead character somewhere to get it as the first character + of the next scan. + + To circumvent this problem, all the scanning functions get a low level + input buffer argument where they store the lookahead character when + needed; additionally, the input buffer is the only source of character of + a scanner. The [scanbuf] input buffers are defined in module {!Scanning}. + + Now we understand that it is extremely important that related successive + calls to scanners indeed read from the same input buffer. In effect, if a + scanner [scan1] is reading from [ib1] and stores an unused lookahead + character [c1] into its input buffer [ib1], then another scanner [scan2] + not reading from the same buffer [ib1] will miss the character [c], + seemingly vanished in the air from the point of view of [scan2]. + + This mechanism works perfectly to read from strings, from files, and from + functions, since in those cases, allocating two buffers reading from the + same source is unnatural. + + Still, there is a difficulty in the case of scanning from an input + channel. In effect, when scanning from an input channel [ic], this channel + may not have been allocated from within this library. Hence, it may be + shared (two functions of the user's program may successively read from + [ic]). This is highly error prone since, one of the function may seek the + input channel, while the other function has still an unused lookahead + character in its input buffer. In conclusion, you should never mix direct + low level reading and high level scanning from the same input channel. + + This phenomenon of reading mess is even worse when one defines more than + one scanning buffer reading from the same input channel + [ic]. Unfortunately, we have no simple way to get rid of this problem + (unless the basic input channel API is modified to offer a ``consider this + char as unread'' procedure to keep back the unused lookahead character as + available in the input channel for further reading). + + To prevent some of the confusion the scanning buffer allocation function + is a memo function that never allocates two different scanning buffers for + the same input channel. This way, the user can naively perform successive + call to [fscanf] below, without allocating a new scanning buffer at each + invocation and hence preserving the expected semantics. + + As mentioned above, a more ambitious fix could be to change the input + channel API to allow arbitrary mixing of direct and formatted reading from + input channels. *) + + (* Perform bufferized input to improve efficiency. *) + let file_buffer_size = ref 1024;; + + (* The scanner closes the input channel at end of input. *) + let scan_close_at_end ic = close_in ic; raise End_of_file;; + + (* The scanner does not close the input channel at end of input: + it just raises [End_of_file]. *) + let scan_raise_at_end _ic = raise End_of_file;; + + let from_ic scan_close_ic iname ic = + let len = !file_buffer_size in + let buf = String.create len in + let i = ref 0 in + let lim = ref 0 in + let eof = ref false in + let next () = + if !i < !lim then begin let c = buf.[!i] in incr i; c end else + if !eof then raise End_of_file else begin + lim := input ic buf 0 len; + if !lim = 0 then begin eof := true; scan_close_ic ic end else begin + i := 1; + buf.[0] + end + end in + create iname next + ;; + + let from_ic_close_at_end = from_ic scan_close_at_end;; + + (* The scanning buffer reading from [Pervasives.stdin]. + One could try to define [stdib] as a scanning buffer reading a character at a + time (no bufferization at all), but unfortunately the top-level + interaction would be wrong. + This is due to some kind of ``race condition'' when reading from [Pervasives.stdin], + since the interactive compiler and [scanf] will simultaneously read the + material they need from [Pervasives.stdin]; then, confusion will result from what should + be read by the top-level and what should be read by [scanf]. + This is even more complicated by the one character lookahead that [scanf] + is sometimes obliged to maintain: the lookahead character will be available + for the next ([scanf]) entry, seemingly coming from nowhere. + Also no [End_of_file] is raised when reading from stdin: if not enough + characters have been read, we simply ask to read more. *) + let stdin = + from_ic scan_raise_at_end + (From_file ("-", Pervasives.stdin)) Pervasives.stdin + ;; + + let stdib = stdin;; + + let open_in fname = + match fname with + | "-" -> stdin + | fname -> + let ic = open_in fname in + from_ic_close_at_end (From_file (fname, ic)) ic + ;; + + let open_in_bin fname = + match fname with + | "-" -> stdin + | fname -> + let ic = open_in_bin fname in + from_ic_close_at_end (From_file (fname, ic)) ic + ;; + + let from_file = open_in;; + let from_file_bin = open_in_bin;; + + let memo_from_ic = + let memo = ref [] in + (fun scan_close_ic ic -> + try List.assq ic !memo with + | Not_found -> + let ib = from_ic scan_close_ic (From_channel ic) ic in + memo := (ic, ib) :: !memo; + ib) + ;; + + let from_channel = memo_from_ic scan_raise_at_end;; + + let close_in ib = + match ib.input_name with + | From_file (_fname, ic) -> Pervasives.close_in ic + | From_string | From_function -> () + | From_channel ic -> Pervasives.close_in ic + ;; end ;; @@ -414,6 +414,39 @@ let bad_input_escape c = bad_input (Printf.sprintf "illegal escape character %C" c) ;; +let bad_input_char message c = + bad_input (Printf.sprintf "found character %C which is not %s" c message) +;; + +let bad_token_length message = + bad_input + (Printf.sprintf + "scanning of %s failed: \ + the specified length was too short for token" message) +;; + +let bad_end_of_input message = + bad_input + (Printf.sprintf + "scanning of %s failed: \ + premature end of file occurred before end of token" message) +;; + +let int_max = function + | None -> max_int + | Some max -> max +;; + +let int_min = function + | None -> 0 + | Some max -> max +;; + +let float_min = function + | None -> max_int + | Some min -> min +;; + module Sformat = Printf.CamlinternalPr.Sformat;; module Tformat = Printf.CamlinternalPr.Tformat;; @@ -549,14 +582,15 @@ let token_int64 conv ib = int64_of_string (token_int_literal conv ib);; (* Scanning numbers. *) -(* Digits scanning functions suppose that one character has been - checked and is available, since they return at end of file with the - currently found token selected. The digits scanning functions scan - a possibly empty sequence of digits, (hence a successful scanning - from one of those functions does not imply that the token is a - well-formed number: to get a true number, it is mandatory to check - that at least one digit is available before calling a digit - scanning function). *) +(* Digits scanning functions suppose that one character has been checked and + is available, since they return at end of file with the currently found + token selected. + + Put it in another way, the digits scanning functions scan for a possibly + empty sequence of digits, (hence, a successful scanning from one of those + functions does not imply that the token is a well-formed number: to get a + true number, it is mandatory to check that at least one valid digit is + available before calling one of the digit scanning functions). *) (* The decimal case is treated especially for optimization purposes. *) let rec scan_decimal_digits max ib = @@ -574,6 +608,7 @@ let rec scan_decimal_digits max ib = ;; let scan_decimal_digits_plus max ib = + if max = 0 then bad_token_length "decimal digits" else let c = Scanning.checked_peek_char ib in match c with | '0' .. '9' -> @@ -599,7 +634,11 @@ let scan_digits_plus digitp max ib = scan_digits max | _ -> max in + (* Ensure we have got enough width left, + and read at list one digit. *) + if max = 0 then bad_token_length "digits" else let c = Scanning.checked_peek_char ib in + if digitp c then let max = Scanning.store_char max ib c in scan_digits max @@ -711,20 +750,51 @@ let scan_int_part max ib = scan_decimal_digits max ib ;; -let scan_float max ib = +(* + For the time being we have (as found in scanf.mli): + The field width is composed of an optional integer literal + indicating the maximal width of the token to read. + Unfortunately, the type-checker let the user write an optional precision, + since this is valid for printf format strings. + + Thus, the next step for Scanf is to support a full width indication, more + or less similar to the one for printf, possibly extended to the + specification of a [max, min] range for the width of the token read for + strings. Something like the following spec for scanf.mli: + + The optional [width] is an integer indicating the maximal + width of the token read. For instance, [%6d] reads an integer, + having at most 6 characters. + + The optional [precision] is a dot [.] followed by an integer: + - in the floating point number conversions ([%f], [%e], [%g], [%F], [%E], and + [%F] conversions, the [precision] indicates the maximum number of digits + that may follow the decimal point. For instance, [%.4f] reads a [float] + with at most 4 fractional digits, + - in the string conversions ([%s], [%S], [%\[ range \]]), and in the + integer number conversions ([%i], [%d], [%u], [%x], [%o], and their + [int32], [int64], and [native_int] correspondent), the + [precision] indicates the required minimum width of the token read, + - on all other conversions, the width and precision are meaningless and + ignored (FIXME: lead to a runtime error ? type checking error ?). + +*) +let scan_float max max_frac_part ib = let max = scan_int_part max ib in - if max = 0 then max else + if max = 0 then max, max_frac_part else let c = Scanning.peek_char ib in - if Scanning.eof ib then max else + if Scanning.eof ib then max, max_frac_part else match c with | '.' -> let max = Scanning.store_char max ib c in - let max = scan_frac_part max ib in - scan_exp_part max ib - | c -> scan_exp_part max ib + let max_precision = min max max_frac_part in + let max = max - (max_precision - scan_frac_part max_precision ib) in + scan_exp_part max ib, max_frac_part + | c -> + scan_exp_part max ib, max_frac_part ;; -let scan_Float max ib = +let scan_Float max max_frac_part ib = let max = scan_optionally_signed_decimal_int max ib in if max = 0 then bad_float () else let c = Scanning.peek_char ib in @@ -732,6 +802,8 @@ let scan_Float max ib = match c with | '.' -> let max = Scanning.store_char max ib c in + let max_precision = min max max_frac_part in + let max = max - (max_precision - scan_frac_part max_precision ib) in let max = scan_frac_part max ib in scan_exp_part max ib | 'e' | 'E' -> @@ -761,6 +833,9 @@ let scan_string stp max ib = (* Scan a char: peek strictly one character in the input, whatsoever. *) let scan_char max ib = + (* The case max = 0 could not happen here, since it is tested before + calling scan_char, in the main scanning function. + if max = 0 then bad_token_length "a character" else *) Scanning.store_char max ib (Scanning.checked_peek_char ib) ;; @@ -817,18 +892,14 @@ let char_for_hexadecimal_code c1 c2 = (* Called in particular when encountering '\\' as starter of a char. Stops before the corresponding '\''. *) let check_next_char message max ib = - if max = 0 then - bad_input (Printf.sprintf "no more room to scan a %s token" message) else + if max = 0 then bad_token_length message else let c = Scanning.peek_char ib in - if Scanning.eof ib then - bad_input - (Printf.sprintf - "premature end of file while scanning a %s token" message) else + if Scanning.eof ib then bad_end_of_input message else c ;; -let check_next_char_for_char = check_next_char "char";; -let check_next_char_for_string = check_next_char "string";; +let check_next_char_for_char = check_next_char "a Char";; +let check_next_char_for_string = check_next_char "a String";; let scan_backslash_char max ib = match check_next_char_for_char max ib with @@ -861,7 +932,7 @@ let scan_backslash_char max ib = let scan_Char max ib = let rec find_start max = - match check_next_char_for_char max ib with + match Scanning.checked_peek_char ib with | '\'' -> find_char (Scanning.ignore_char max ib) | c -> character_mismatch '\'' c @@ -882,7 +953,7 @@ let scan_Char max ib = let scan_String max ib = let rec find_start max = - match check_next_char_for_string max ib with + match Scanning.checked_peek_char ib with | '\"' -> find_stop (Scanning.ignore_char max ib) | c -> character_mismatch '\"' c @@ -913,10 +984,7 @@ let scan_String max ib = (* Scan a boolean (a Caml token). *) let scan_bool max ib = - if max < 4 then - bad_input - (Printf.sprintf - "not enough room left (%d characters) to scan a boolean" max) else + if max < 4 then bad_token_length "a boolean" else let c = Scanning.checked_peek_char ib in let m = match c with @@ -1179,7 +1247,7 @@ let rec skip_whites ib = let scanf_bad_input ib = function | Scan_failure s | Failure s -> let i = Scanning.char_count ib in - bad_input (Printf.sprintf "scanf: bad input at char number %i: %s" i s) + bad_input (Printf.sprintf "scanf: bad input at char number %i: ``%s''" i s) | x -> raise x ;; @@ -1193,24 +1261,24 @@ let list_iter_i f l = let ascanf sc fmt = let ac = Tformat.ac_of_format fmt in - match ac.Tformat.ac_rdrs with - | 0 -> - Obj.magic (fun f -> sc fmt [||] f) - | 1 -> - Obj.magic (fun x f -> sc fmt [| Obj.repr x |] f) - | 2 -> - Obj.magic (fun x y f -> sc fmt [| Obj.repr x; Obj.repr y; |] f) - | 3 -> - Obj.magic - (fun x y z f -> sc fmt [| Obj.repr x; Obj.repr y; Obj.repr z; |] f) - | nargs -> - let rec loop i args = - if i >= nargs then - let a = Array.make nargs (Obj.repr 0) in - list_iter_i (fun i arg -> a.(nargs - i - 1) <- arg) args; - Obj.magic (fun f -> sc fmt a f) - else Obj.magic (fun x -> loop (succ i) (x :: args)) in - loop 0 [] + match ac.Tformat.ac_rdrs with + | 0 -> + Obj.magic (fun f -> sc fmt [||] f) + | 1 -> + Obj.magic (fun x f -> sc fmt [| Obj.repr x |] f) + | 2 -> + Obj.magic (fun x y f -> sc fmt [| Obj.repr x; Obj.repr y; |] f) + | 3 -> + Obj.magic + (fun x y z f -> sc fmt [| Obj.repr x; Obj.repr y; Obj.repr z; |] f) + | nargs -> + let rec loop i args = + if i >= nargs then + let a = Array.make nargs (Obj.repr 0) in + list_iter_i (fun i arg -> a.(nargs - i - 1) <- arg) args; + Obj.magic (fun f -> sc fmt a f) + else Obj.magic (fun x -> loop (succ i) (x :: args)) in + loop 0 [] ;; (* The [scan_format] main scanning function. @@ -1251,7 +1319,7 @@ let scan_format ib ef fmt rv f = if i > lim then ir, f else match Sformat.get fmt i with | ' ' -> skip_whites ib; scan_fmt ir f (succ i) - | '%' -> scan_conversion false max_int ir f (succ i) + | '%' -> scan_skip ir f (succ i) | '@' -> let i = succ i in if i > lim then incomplete_format fmt else begin @@ -1259,8 +1327,42 @@ let scan_format ib ef fmt rv f = scan_fmt ir f (succ i) end | c -> check_char ib c; scan_fmt ir f (succ i) - and scan_conversion skip max ir f i = + and scan_skip ir f i = + if i > lim then ir, f else + match Sformat.get fmt i with + | '_' -> scan_limits true ir f (succ i) + | _ -> scan_limits false ir f i + + and scan_limits skip ir f i = + if i > lim then ir, f else + let max_opt, min_opt, i = + match Sformat.get fmt i with + | '0' .. '9' as conv -> + let rec read_width accu i = + if i > lim then accu, i else + match Sformat.get fmt i with + | '0' .. '9' as c -> + let accu = 10 * accu + decimal_value_of_char c in + read_width accu (succ i) + | _ -> accu, i in + + let max, i = read_width (decimal_value_of_char conv) (succ i) in + + if i > lim then incomplete_format fmt else + begin + match Sformat.get fmt i with + | '.' -> + let min, i = read_width 0 (succ i) in + (Some max, Some min, i) + | _ -> Some max, None, i + end + | _ -> None, None, i in + + scan_conversion skip max_opt min_opt ir f i + + and scan_conversion skip max_opt min_opt ir f i = let stack = if skip then no_stack else stack in + let max = int_max max_opt in match Sformat.get fmt i with | '%' as conv -> check_char ib conv; scan_fmt ir f (succ i) @@ -1276,13 +1378,14 @@ let scan_format ib ef fmt rv f = let i, stp = scan_fmt_stoppers (succ i) in let _x = scan_chars_in_char_set stp char_set max ib in scan_fmt ir (stack f (token_string ib)) (succ i) - | 'c' when max = 0 -> + | ('c' | 'C') when max = 0 -> let c = Scanning.checked_peek_char ib in scan_fmt ir (stack f c) (succ i) - | 'c' | 'C' as conv -> - if max <> 1 && max <> max_int then bad_conversion fmt i conv else - let _x = - if conv = 'c' then scan_char max ib else scan_Char max ib in + | 'c' -> + let _x = scan_char max ib in + scan_fmt ir (stack f (token_char ib)) (succ i) + | 'C' -> + let _x = scan_Char max ib in scan_fmt ir (stack f (token_char ib)) (succ i) | 'd' | 'i' | 'o' | 'u' | 'x' | 'X' as conv -> let _x = scan_int_conv conv max ib in @@ -1290,11 +1393,16 @@ let scan_format ib ef fmt rv f = | 'N' as conv -> scan_fmt ir (stack f (get_count conv ib)) (succ i) | 'f' | 'e' | 'E' | 'g' | 'G' -> - let _x = scan_float max ib in + let min = float_min min_opt in + let _x = scan_float max min ib in scan_fmt ir (stack f (token_float ib)) (succ i) | 'F' -> - let _x = scan_Float max ib in + let min = float_min min_opt in + let _x = scan_Float max min ib in scan_fmt ir (stack f (token_float ib)) (succ i) +(* | 'B' | 'b' when max = Some 0 -> + let _x = scan_bool max ib in + scan_fmt ir (stack f (token_int ib)) (succ i) *) | 'B' | 'b' -> let _x = scan_bool max ib in scan_fmt ir (stack f (token_bool ib)) (succ i) @@ -1323,24 +1431,6 @@ let scan_format ib ef fmt rv f = else bad_input "end of input not found" | ',' -> scan_fmt ir f (succ i) - | '_' -> - if i > lim then incomplete_format fmt else - scan_conversion true max ir f (succ i) - | '0' .. '9' as conv -> - let rec read_width accu i = - if i > lim then accu, i else - match Sformat.get fmt i with - | '0' .. '9' as c -> - let accu = 10 * accu + decimal_value_of_char c in - read_width accu (succ i) - | _ -> accu, i in - let max, i = read_width (decimal_value_of_char conv) (succ i) in - if i > lim then incomplete_format fmt else begin - match Sformat.get fmt i with - | '.' -> - let p, i = read_width 0 (succ i) in - scan_conversion skip (succ (max + p)) ir f i - | _ -> scan_conversion skip max ir f i end | '(' | '{' as conv (* ')' '}' *) -> let i = succ i in (* Find the static specification for the format to read. *) |