jhr.bib

@inproceedings{3cps-design,
  author = {Benjamin Quiring and John Reppy and Olin Shivers},
  title = {3CPS: The Design of an Environment-focussed Intermediate Representation},
  booktitle = {33rd International Symposium on Implementation and Application of
    Functional Languages (IFL 2021)},
  year = 2021,
  month = sep,
  location = {Online (Hosted by Radboud University, Netherlands)},
  note = {Paper accepted for the conference post-proceedings},
  abstract = {
    We describe the design of 3CPS,
    a compiler intermediate representation (IR)
    we have developed for use in compiling
    call-by-value functional languages such as SML, OCaml, Scheme, and Lisp.
    The language is a low-level form designed in tandem
    with a matching suite of static analyses.
    It reflects our belief that the core task of an optimising
    compiler for a functional language is
    to reason about the environment structure of the program.
    Our IR is distinguished by the presence of
    \emph{extent annotations}, added to all variables (and verified by
    static analysis). These annotations are defined in terms of the
    semantics of the IR, but they directly tell the compiler what machine
    resources are needed to implement the environment structure of each
    annotated variable.
  },
  topic = {sml,implementation}
}
@inproceedings{smlnj-llvm-backend,
  author = {Kavon Farvardin and John Reppy},
  title = {A New Backend for {Standard} {ML} of {New} {Jersey}},
  booktitle = {32nd International Symposium on Implementation and Application of
    Functional Languages (IFL 2020)},
  year = 2020,
  month = sep,
  doi = {10.1145/3462172.3462191},
  location = {Online (Hosted by the University of Kent, UK)},
  pdf = {2020/ifl-smlnj-llvm.pdf},
  note = {Awarded the \textbf{Peter Landin Prize} for best paper.},
  abstract = {
    This paper describes the design and implementation of a new
    backend for the Standard ML of New Jersey (SML/NJ) system that is based
    on the LLVM compiler infrastructure.
    We first describe the history and design of the current backend, which is based
    on the \textsc{MLRisc} framework.
    While \textsc{MLRisc} has many similarities to LLVM, it provides a lower-level,
    policy agnostic, approach to code generation that enables customization
    of the code generator for non-standard runtime models (\textit{i.e.}, register
    pinning, calling conventions, \textit{etc}).
    In particular, SML/NJ uses a stackless runtime model based on continuation-passing
    style with heap-allocated continuation closures.
    This feature, and others, pose challenges to building a backend using LLVM.
    We describe these challenges and how we address them in our backend.
  },
  topic = {sml,implementation}
}
@inproceedings{stacks-and-continuations,
  author = {Farvardin, Kavon and Reppy, John},
  title = {From Folklore to Fact: Comparing Implementations of Stacks and Continuations},
  booktitle = {Proceedings of the {SIGPLAN} 2020 Conference on Programming
					Language Design and Implementation},
  year = 2020,
  month = jun,
  publisher = {ACM},
  address = {New York, NY},
  doi = {10.1145/3385412.3385994},
  pages = {75--90},
  pdf = {2020/pldi-stacks.pdf},
  note = {Awarded \textbf{Distinguished Paper}.},
  abstract = {
    The efficient implementation of function calls and non-local control transfers
    is a critical part of modern language implementations and is important in the
    implementation of everything from recursion, higher-order functions, concurrency
    and coroutines, to task-based parallelism.
    In a compiler, these features can be supported by a variety of mechanisms,
    including call stacks, segmented stacks, and heap-allocated continuation
    closures.

    An implementor of a high-level language with advanced control features might ask
    the question ``what is the best choice for my implementation?''
    Unfortunately, the current literature does not provide much guidance, since
    previous studies suffer from various flaws in methodology and are outdated for
    modern hardware.
    In the absence of recent, well-normalized measurements and a holistic overview
    of their implementation specifics, the path of least resistance when choosing a
    strategy is to trust folklore, but the folklore is also suspect.

    This paper attempts to remedy this situation by providing an
    ``apples-to-apples'' comparison of six different approaches to implementing
    call stacks and continuations.
    This comparison uses the same source language, compiler pipeline, LLVM-backend,
    and runtime system, with the only differences being those required by the
    differences in implementation strategy.
    We compare the implementation challenges of the different approaches, their
    sequential performance, and their suitability to support advanced control
    mechanisms, including supporting heavily threaded code.
    In addition to the comparison of implementation strategies, the paper's
    contributions also include a number of useful implementation techniques that we
    discovered along the way.
  },
  topic = {concurrency,manticore,sml,implementation}
}
@article{the-history-of-sml,
  author = {David MacQueen and Robert Harper and John Reppy},
  title = {The History of {Standard} {ML}},
  journal = {Proceedings of the {ACM} on Programming Languages},
  volume = 4,
  number = {HOPL},
  articleno = {86},
  year = {2020},
  month = jun,
  publisher = {ACM},
  address = {New York, NY},
  numpages = 94,
  doi = {10.1145/3386336},
  pdf = {2020/hopl-sml-history.pdf},
  abstract = {
    The ML family of strict functional languages, which includes F\#,
    OCaml, and Standard ML,
    evolved from the \emph{Meta Language}
    of the LCF theorem proving system developed by Robin Milner and his
    research group at the University of Edinburgh in the 1970s.
    This paper focuses on the history of Standard ML, which plays a
    central r\^{o}le in this family of languages, as it
    was the first to include the complete set of features that
    we now associate with the name ``ML'' (\emph{i.e.}, polymorphic type
    inference, datatypes with pattern matching, modules, exceptions,
    and mutable state).

    Standard ML, and the ML family of languages, have had enormous
    influence on the world of programming language design and theory.
    ML is the foremost exemplar of a functional programming language with
    strict evaluation (call-by-value) and static typing. The use of
    parametric polymorphism in its type system, together with the
    automatic inference of such types, has influenced a wide variety of modern
    languages (where polymorphism is often referred to as \emph{generics}).
    It has popularized the idea of datatypes with
    associated case analysis by pattern matching.
    The module system of Standard ML extends the notion of
    type-level parameterization to large-scale programming with the
    notion of parametric modules, or \emph{functors}.

    Standard ML also set a precedent by being a language whose design
    included a formal definition with an associated metatheory of mathematical
    proofs (such as soundness of the type system).
    A formal definition was one of the explicit goals from the beginning
    of the project.
    While some previous languages had rigorous definitions, these
    definitions were not integral to the design process, and the formal part
    was limited to the language syntax and possibly dynamic semantics or
    static semantics, but not both.

    The paper covers the early history of ML, the subsequent efforts to define
    a \emph{standard} ML language, and the development of its major features
    and its formal definition.
    We also review the impact that the language had on programming-language
    research.
  },
  topic = {sml}
}
@inproceedings{point-methods,
  author = {Teodoro Collin and Charisee Chiw and {L. Ridgway} Scott
	and John Reppy and Gordon Kindlmann},
  title = {Point Movement in a {DSL} for Higher-Order {FEM} Visualization},
  booktitle = {Proceedings of the 2019 {IEEE} Visualization Conference ({VIS} '19)},
  year = 2019,
  month = oct,
  location = {Vancouver, BC, Canada},
  pages = {281--285},
  doi = {10.1109/VISUAL.2019.8933623},
  pdf = {2019/vis-diderot-fem.pdf},
  abstract = {
    Scientific visualization tools tend to be flexible in some ways (\textit{e.g.},
    for exploring isovalues) while restricted in other ways, such as working
    only on regular grids, or only on unstructured meshes (as used in the
    finite element method, FEM).
    Our work seeks to expose the common structure
    of visualization methods, apart from the specifics of how the fields being
    visualized are formed.
    Recognizing that previous approaches to FEM visualization depend on
    efficiently updating computed positions within a mesh, we took an
    existing visualization domain-specific language, and added a mesh
    position type and associated arithmetic operators.
    These are orthogonal to the visualization method itself, so existing
    programs for visualizing regular grid data work, with minimal changes,
    on higher-order FEM data.
    We reproduce the efficiency gains of an earlier guided search method
    of mesh position update for computing streamlines, and we demonstrate
    a novel ability to uniformly sample ridge surfaces of higher-order
    FEM solutions defined on curved meshes.
  },
  topic = {diderot}
}
@inproceedings{shapes-and-flattening,
  author = {John Reppy and Joe Wingerter},
  title = {Shapes and Flattening},
  booktitle = {31st International Symposium on Implementation and Application of
    Functional Languages (IFL 2019)},
  year = 2019,
  month = sep,
  location = {Singapore, Singapore},
  publisher = {ACM},
  address = {New York, NY},
  pdf = {2019/ifl-shapes-final.pdf},
  doi = {10.1145/3412932.3412946},
  abstract = {
    \textsc{Nesl} is a first-order functional language with
    an apply-to-each construct and other parallel primitives that
    enable the expression of irregular nested data-parallel (NDP) algorithms.
    To compile \textsc{Nesl}, Blelloch and others developed a global flattening transformation
    that maps irregular NDP code into regular flat data parallel (FDP) code
    suitable for executing on SIMD or SIMT architectures, such as GPUs.

    While flattening solves the problem of mapping irregular parallelism into
    a regular model, it requires significant additional optimizations to produce
    performant code.
    Nessie is a compiler for \textsc{Nesl} that generates CUDA code for running on
    Nvidia GPUs.
    The Nessie compiler relies on a fairly complicated \emph{shape analysis}
    that is performed on the FDP code produced by the flattening transformation.
    Shape analysis plays a key r\^{o}le in the compiler as it is the enabler
    of fusion optimizations, smart kernel scheduling, and other optimizations.

    In this paper, we present a new approach to the shape analysis problem for
    \textsc{Nesl} that is both simpler to implement and provides better quality shape
    information.
    The key idea is to analyze the NDP representation of the program and then preserve
    shape information through the flattening transformation.
  },
  topic = {nessie,implementation,concurrency}
}
@inproceedings{compiling-successor-ml-guards,
  author = {John Reppy and Mona Zahir},
  title = {Compiling {Successor} {ML} Pattern Guards},
  booktitle = {Proceedings of the 2019 {ACM} {SIGPLAN} {ML} Family Workshop},
  year = 2019,
  month = aug,
  pdf = {2019/ml-guards.pdf},
  abstract = {
    Successor ML is a collection of proposed language
    extensions to Standard ML.  A number of these extensions
    address pattern matching; including adding richer record patterns,
    or-patterns, and pattern guards.  Pattern guards in
    Successor ML are more general than those found in other
    languages, which raises some interesting implementation
    issues.

    This paper describes the approach to pattern guards that we are developing
    as part of an effort to add Successor ML features to the Standard ML of New
    Jersey system.  We present our approach in a
    way that is applicable to either backtracking
    or decision-tree implementations of pattern matching.
  },
  topic = {implementation,sml}
}
@article{diderot:eurovis18,
  author = {Gordon L. Kindlmann and Charisee Chiw and Tri Huynh and Attila Gyulassy
	and John Reppy and Peer-Timo Bremer},
  title = {Rendering and Extracting Extremal Features in 3D Fields},
  editor = {Jeffrey Heer and Heike Leitte and Timo Ropinski},
  journal = {{Computer} {Graphics} {Forum} ({Proceedings}
					of the {Eurographics}/{IEEE-VGTC}
					{Conference} on {Visualization} ``{EuroVis}'')},
  volume = {37},
  number = {3},
  year = {2018},
  month = jun,
  pages = {525--536},
  note = {Awarded \textbf{Best Paper}.},
  doi = {10.1111/cgf.13439},
  pdf = {2018/eurovis.pdf},
  abstract = {
    Visualizing and extracting three-dimensional features is important for many
    computational science applications, each with their own feature definitions and data types.
    While some are simple to state and implement (e.g., isosurfaces), others
    require more complicated mathematics (e.g., multiple derivatives, curvature,
    eigenvectors, etc.).
    Correctly implementing mathematical definitions is difficult, so experimenting
    with new features requires substantial investments.
    Furthermore, traditional interpolants rarely support the necessary derivatives,
    and approximations can reduce numerical stability.
    Our new approach directly translates mathematical notation into practical
    visualization and feature extraction, with minimal mental and implementation
    overhead.
    Using a mathematically expressive domain-specific language, Diderot, we
    compute direct volume renderings and particle-based feature samplings for
    a range of mathematical features. Non-expert users can experiment with
    feature definitions without any exposure to meshes, interpolants, derivative
    computation, etc.
    We demonstrate high-quality results on notoriously difficult features, such
    as ridges and vortex cores, using working code simple enough to be presented
    in its entirety.
  },
  topic = {diderot}
}
@inproceedings{compiling-w-cont-n-llvm:eptcs18,
  author = {Farvardin, Kavon and Reppy, John},
  title = {Compiling with Continuations and {LLVM}},
  year = {2018},
  editor = {Asai, Kenichi and Shinwell, Mark},
  booktitle = {Proceedings {ML} Family Workshop / {OCaml} Users and Developers workshops
		(September 22-23, 2016)},
  location = {Nara, Japan},
  series = {Electronic Proceedings in Theoretical Computer Science},
  volume = {285},
  publisher = {Open Publishing Association},
  pages = {131--142},
  doi = {10.4204/EPTCS.285.5},
  pdf = {2018/eptcs-llvm.pdf},
  abstract = {
    LLVM is an infrastructure for code generation and low-level optimizations,
    which has been gaining popularity as a backend for both research and industrial
    compilers, including many compilers for functional languages.
    While LLVM provides a relatively easy path to high-quality native code,
    its design is based on a traditional runtime model which is not well suited
    to alternative compilation strategies used in high-level language compilers,
    such as the use of heap-allocated continuation closures.

    This paper describes a new LLVM-based backend that supports heap-allocated
    continuation closures, which enables constant-time callcc and very-lightweight
    multithreading.
    The backend has been implemented in the Parallel ML compiler, which is part
    of the Manticore system, but the results should be useful for other compilers,
    such as Standard ML of New Jersey, that use heap-allocated continuation closures.
  },
  topic = {concurrency,manticore,sml}
}
@inproceedings{datm:ast17,
  author = {Charisee Chiw and Gordon Kindlmann and John Reppy},
  title = {{DATm}: {Diderot}'s Automated Testing Model},
  booktitle = {The 12th {IEEE}/{ACM} International Workshop
					on Automation of Software Test ({AST} 2017)},
  year = 2017,
  month = may,
  pages = {45--51},
  pdf = {2017/datm.pdf},
  abstract = {
    Diderot is a parallel domain-specific language for
    the analysis and visualization of multidimensional scientific images,
    such as those produced by CT and MRI scanners.
    Diderot is designed to support algorithms that are based on differential
    tensor calculus and produces a higher-order mathematical model, which allows
    direct manipulation of tensor fields.
    One of the main challenges of the Diderot implementation is bridging this
    semantic gap by effectively translating high-level mathematical notation of
    tensor calculus into efficient low-level code in the target language.

    A key question for a high-level language, such as Diderot, is how do we
    know that the implementation is correct.
    We have previously presented and defended a core set of rewriting rules,
    but the full translation from source to executable requires much more work.
    In this paper, we present \textbf{DATm}, Diderot's automated testing model
    to check the correctness of the core operations in the programming language.
    \textbf{DATm} can automatically create test programs, and predict what the outcome
    should be.
    We measure the accuracy of the computations written in the Diderot language,
    based on how accurately the output of the program represents the mathematical
    equivalent of the computations.

    This paper describes a model for testing a high-level language based on correctness.
    It introduces the pipeline for \textbf{DATm}, a tool that can automatically create
    and test tens of thousands of Diderot test programs and that has found numerous bugs.
    We make a case for the necessity of extensive testing by describing bugs that are
    deep in the compiler, and only could be found with a unique application of operations.
    Lastly, we demonstrate that the model can be used to create other types of tests
    by visual verification.
  },
  topic = {diderot}
}
@inproceedings{compiling-w-cont-n-llvm:ml16,
  author = {Kavon Farvardin and John Reppy},
  title = {Compiling with continuations and {LLVM}},
  booktitle = {Proceedings of the 2016 {ACM} {SIGPLAN} Workshop on {ML}},
  year = 2016,
  month = sep,
  pdf = {2016/ml-llvm.pdf},
  abstract = {
    This paper describes a new LLVM-based backend for the Parallel ML compiler
    (part of the Manticore system).
    This backend is novel in that it supports heap-allocated first-class
    continuations (a first for LLVM), which, in turn enables language features,
    such as \texttt{callcc}, lightweight concurrency mechanisms, and PML's
    parallelism features.
  },
  topic = {concurrency,manticore,sml}
}
@misc{lambdacu:cpc16,
  author = {John Reppy and Joe Wingerter},
  title = {$\lambda_{\textit{cu}}$ --- An Intermediate Representation for Compiling Nested Data Parallelism},
  howpublished = {Presented at the \textit{Compilers for Parallel Computing Workshop {(CPC '16)}}},
  year = 2016,
  month = jul,
  note = {Valladolid, Spain},
  pdf = {2016/cpc16-lambdacu.pdf},
  abstract = {
      Modern GPUs provide supercomputer-level performance at commodity prices, but
      they are notoriously hard to program.
      GPUs enable a vast degree of parallelism,
      but only a small set of control-flow and data access patterns are efficient to run on GPU architectures.
      In order to provide programmers with a familiar, high-level programming paradigm that
      nonetheless maps efficiently to GPU hardware,
      we have been exploring the use of \emph{Nested Data Parallelism} (NDP),
      specifically the first-order functional language NESL.

      NESL, originally designed for SIMD architectures, is a functional language with
      an apply-to-each construct and other parallel primitives that
      enables the expression of irregular parallel algorithms;
      Blelloch and others developed a global flattening transformation
      that maps irregular NDP code into regular flat data parallel (FDP) code
      suitable for SIMD execution.
      Our prior work on the Nessie compiler targeted SIMT GPUs via CUDA,
      establishing the feasibility of such a translation,
      but with poor performance compared to tuned CUDA implementations by human experts,
      primarily due to allocation of and memory traffic to temporary arrays.

      In this work, we focus on a compiler IR, called $\lambda_{\textit{cu}}$
      that we have designed to support effective optimization of the FDP code
      produced by the flattening transformation.
      $\lambda_{\textit{cu}}$ is a three-level language consisting of a top-level representation for
      the CPU-level control flow, a mid-level language for representing the iteration
      structure of GPU kernels, and a low-level language for representing the computations
      performed on the GPU.

      $\lambda_{\textit{cu}}$ facilitates fusion of parallel operations
      by expressing iteration structures with a language of combinators,
      which obey a set of fusion rules described in this paper.
      Some fusion optimizations are mutually exclusive,
      so following Robinson \textit{et al.}
      we use an ILP solver to determine the optimal fusion of kernels
      and then perform the recommended fusions.
      Final generation of CUDA code is performed on each fused group of combinators,
      linking CUDA kernels using a backbone of generated C++ which directs program progress on the CPU.
    },
  topic = {concurrency,implementation,nessie}
}
@article{diderot-dsl:vis15,
  author = {Gordon Kindlmann and Charisee Chiw and Lamont Samuels and Nick Seltzer and John Reppy},
  title = {{Diderot}: A Domain-Specific Language for Portable Parallel Scientific Visualization and Image Analysis},
  journal = {{IEEE} Transactions on Visualization and Computer Graphics},
  publisher = {{IEEE} Computer Society Press},
  address = {Los Alamitos, CA},
  year = 2015,
  month = oct,
  pages = {867--876},
  doi = {10.1109/TVCG.2015.2467449},
  pdf = {2015/vis-diderot.pdf},
  abstract = {
      Many algorithms for scientific visualization and image analysis are
      rooted in the world of continuous scalar, vector, and tensor fields,
      but are programmed in low-level languages and libraries that obscure
      their mathematical foundations.  Diderot is a parallel domain-specific
      language that is designed to bridge this semantic gap by providing the
      programmer with a high-level, mathematical programming notation that
      allows direct expression of mathematical concepts in code.
      Furthermore, Diderot provides parallel performance that takes
      advantage of modern multicore processors and GPUs. The high-level
      notation allows a concise and natural expression of the algorithms and
      the parallelism allows efficient execution on real-world datasets.
    },
  topic = {design,concurrency,diderot}
}
@misc{nessie:cpc15,
  author = {John Reppy and Nora Sandler},
  title = {Nessie: A {NESL} to {CUDA} Compiler},
  howpublished = {Presented at the \textit{Compilers for Parallel Computing Workshop {(CPC '15)}}},
  year = 2015,
  month = jan,
  note = {Imperial College, London, UK},
  pdf = {2015/cpc-nessie.pdf},
  abstract = {
      Modern GPUs provide supercomputer-level performance at commodity prices, but
      they are notoriously hard to program.
      To address this problem, we have been exploring the use of
      \emph{Nested Data Parallelism} (NDP), and
      specifically the first-order functional language Nesl, as a
      way to raise the level of abstraction for programming GPUs.
      This paper describes a new compiler for the Nesl language that generates CUDA code.
      Specifically, we describe three aspects of the compiler that address some of the
      challenges of generating efficient NDP code for GPUs.
    },
  topic = {concurrency,implementation,nessie}
}
@misc{diderot:cpc15,
  author = {John Reppy and Lamont Samuels},
  title = {Bulk-Synchronous Communication Mechanisms in {Diderot}},
  howpublished = {Presented at the \textit{Compilers for Parallel Computing Workshop {(CPC '15)}}},
  year = 2015,
  month = jan,
  note = {Imperial College, London, UK},
  pdf = {2015/cpc-diderot.pdf},
  abstract = {
      Diderot is a parallel domain-specific language designed to provide biomedical researchers
      with a high-level mathematical programming model where they can use familiar tensor calculus
      notations directly in code without dealing with underlying low-level implementation details.
      These operations are executed as parallel independent computations. We use a bulk synchronous
      parallel model (BSP) to execute these independent computations as autonomous lightweight
      threads called strands.
      The current BSP model of Diderot limits strand creation to initialization time and does
      not provide any mechanisms for communicating between strands. For algorithms, such as
      particle systems, where strands are used to explore the image space, it is useful to be
      able to create new strands dynamically and share data between strands.

      In this paper, we present an updated BSP model with three new features:  a spatial
      mechanism that retrieves nearby strands based on their geometric position in space, a
      global mechanism for global computations (i.e., parallel reductions) over sets of strands
      and a mechanism for dynamically allocating new strands.  We also illustrate through examples
      how to express these features in the Diderot language. More generally, by providing a
      communication system with these new mechanisms, we can effectively increase the class of
      applications that Diderot can support.
    },
  topic = {design,concurrency,diderot}
}
@inproceedings{reflow,
  author = {Lars Bergstrom and Matthew Fluet and Mike Rainey and Matthew Le
	and John Reppy and Nora Sandler},
  title = {Practical and Effective Higher-Order Optimizations},
  booktitle = {Proceedings of the 19th {ACM} {SIGPLAN} International
    Conference on Functional Programming (ICFP 2014)},
  publisher = {ACM},
  address = {New York, NY},
  year = 2014,
  month = sep,
  doi = {10.1145/2628136.2628153},
  pdf = {2014/icfp-reflow.pdf},
  abstract = {
      Inlining is an optimization that replaces a call to a function with that function's
      body.
      This optimization not only reduces the overhead of a function call, but can
      expose additional optimization opportunities to the compiler, such as removing
      redundant operations or unused conditional branches.
      Another optimization, copy propagation, replaces a redundant copy of a still-live
      variable with the original.
      Copy propagation can reduce the total number of live variables, reducing register
      pressure and memory usage, and possibly
      eliminating redundant memory-to-memory copies.
      In practice, both of these optimizations are implemented in nearly every
      modern compiler.

      These two optimizations are practical to implement and effective in first-order
      languages, but in languages with lexically-scoped first-class functions (aka, closures),
      these optimizations are not available to code programmed in a higher-order style.
      With higher-order functions, the analysis challenge has been that the
      environment at the call site must be the same as at the closure capture location,
      up to the free variables, or the meaning of the program may change.
      Olin Shivers' 1991 dissertation called this family of optimizations
      \emph{Super-$\beta$} and he proposed one analysis technique, called \emph{reflow},
      to support these optimizations.
      Unfortunately, reflow has proven too expensive to implement in practice.
      Because these higher-order optimizations are not available in functional-language
      compilers, programmers studiously avoid uses of
      higher-order values that cannot be optimized (particularly in compiler
      benchmarks).

      This paper provides the first practical and effective technique for
      Super-$\beta$ (higher-order) inlining and copy propagation, which
      we call unchanged variable analysis.
      We show that this technique is practical by implementing it in the
      context of a real compiler for an ML-family language and showing
      that the required analyses have costs below 3\% of the total
      compilation time.
      This technique's effectiveness is shown through a set of benchmarks
      and example programs, where this analysis exposes additional potential
      optimization sites.
    },
  topic = {implementation,manticore}
}
@inproceedings{sml3d,
  author = {John Reppy},
  title = {{SML3d}: 3D Graphics for {Standard} {ML}},
  booktitle = {Proceedings of the 2014 {ACM} {SIGPLAN} Workshop on {ML}},
  year = 2014,
  month = sep,
  pdf = {2014/ml-sml3d.pdf},
  abstract = {
    The SML3d system is a collection of libraries designed to support real-time 3D
    graphics programming in Standard ML (SML).
    This paper gives an overview of the system and briefly highlights some of the more
    interesting aspects of its design and implementation.
  },
  topic = {gui,sml}
}
@inproceedings{data-only-flattening,
  author = {Lars Bergstrom and Matthew Fluet and Mike Rainey and John Reppy
        and Stephen Rosen and Adam Shaw},
  title = {Data-Only Flattening for Nested Data Parallelism},
  booktitle = {Proceedings of the 2013 {ACM} {SIGPLAN} Symposium on Principles \&
                                        Practice of Parallel Programming ({PPoPP} 2013)},
  publisher = {ACM},
  address = {New York, NY},
  year = 2013,
  month = feb,
  pages = {81--92},
  doi = {10.1145/2442516.2442525},
  pdf = {2013/ppopp-flat.pdf},
  abstract = {
      Data parallelism has proven to be an effective technique for
      high-level programming of a certain class of parallel applications,
      but it is not well suited to irregular parallel computations.
      Blelloch and others proposed \emph{nested data parallelism} (NDP) as a
      language mechanism for programming irregular parallel applications
      in a declarative data-parallel style.
      The key to this approach is a compiler transformation that
      \emph{flattens} the NDP computation and data structures into a form
      that can be executed efficiently on a wide-vector SIMD architecture.
      Unfortunately, this technique is ill suited to execution on today's
      multicore machines.
      We present a new technique, called \emph{data-only flattening}, for
      the compilation of NDP, which is suitable for multicore architectures.
      Data-only flattening transforms nested data structures in order to
      expose programs to various optimizations while leaving control
      structures intact.
      We present a formal semantics of data-only flattening in a core
      language with a rewriting system.
      We demonstrate the effectiveness of this technique in the Parallel~ML
      implementation and we report encouraging experimental results across
      various benchmark applications.
    },
  topic = {implementation,concurrency,manticore}
}
@article{lazy-tree-splitting:jfp,
  author = {Lars Bergstrom and Matthew Fluet and Mike Rainey and John Reppy and Adam Shaw},
  title = {Lazy Tree Splitting},
  journal = {Journal of Functional Programming},
  publisher = {Cambridge University Press},
  address = {Cambridge, England},
  volume = 22,
  number = {4--5},
  pages = {382--438},
  year = 2012,
  month = sep,
  abstract = {
    Nested data-parallelism (NDP) is a language mechanism that supports
    programming irregular parallel applications in a declarative
    style.
    In this paper, we describe the implementation of NDP in Parallel ML (PML), which
    is part of the Manticore system.
    One of the main challenges of implementing NDP is managing the parallel decomposition
    of work.
    If we have too many small chunks of work, the overhead will be too high, but if we do
    not have enough chunks of work, processors will be idle.
    Recently the technique of Lazy Binary Splitting was proposed to address this problem
    for nested parallel loops over flat arrays.
    We have adapted this technique to our implementation of NDP, which uses binary trees
    to represent parallel arrays.
    This new technique, which we call \emph{Lazy Tree Splitting} (LTS), has the key
    advantage of \emph{performance robustness};
    \textit{i.e.}, that it does not require tuning to get the best performance for
    each program.
    We describe the implementation of the standard NDP operations using LTS and we present
    experimental data that demonstrates the scalability of LTS across a range of benchmarks.
  },
  topic = {implementation,concurrency,manticore}
}
@inproceedings{nesl-gpu,
  author = {Lars Bergstrom and John Reppy},
  title = {Nested Data-Parallelism on the {GPU}},
  booktitle = {Proceedings of the 17th {ACM} {SIGPLAN} International
    Conference on Functional Programming (ICFP 2012)},
  publisher = {ACM},
  address = {New York, NY},
  year = 2012,
  month = sep,
  pages = {247--258},
  pdf = {2012/icfp-gpu.pdf},
  abstract = {
      Graphics processing units (GPUs) provide both memory bandwidth and arithmetic
      performance far greater than that available on CPUs but, because of their
      \emph{Single-Instruction-Multiple-Data} (SIMD) architecture, they are hard to
      program.
      Most of the programs ported to GPUs thus far use traditional data-level
      parallelism, performing only operations that operate uniformly over vectors.

      NESL is a first-order functional language that was designed to allow
      programmers to write irregular-parallel programs --- such as parallel
      divide-and-conquer algorithms --- for wide-vector parallel computers.
      This paper presents our port of the NESL implementation to work on GPUs and
      provides empirical evidence that nested data-parallelism (NDP) on GPUs
      significantly outperforms CPU-based implementations and matches or beats newer
      GPU languages that support only flat parallelism.
      While our performance does not match that of hand-tuned CUDA programs, we argue
      that the notational conciseness of NESL is worth the loss in performance.
      This work provides the first language implementation that directly supports NDP
      on a GPU.
    },
  topic = {concurrency,implementation,nessie}
}
@inproceedings{diderot-parallel-dsl,
  author = {Charisee Chiw and Gordon Kindlmann and John Reppy and Lamont Samuels and Nick Seltzer},
  title = {Diderot: A Parallel {DSL} for Image Analysis and Visualization},
  booktitle = {Proceedings of the {SIGPLAN} 2012 Conference on Programming
    Language Design and Implementation},
  publisher = {ACM},
  address = {New York, NY},
  year = 2012,
  month = jun,
  pages = {111--120},
  pdf = {2012/pldi-diderot.pdf},
  abstract = {
    Research scientists and medical professionals use imaging technology, such as
    \emph{computed tomography} (CT) and
    \emph{magnetic resonance imaging} (MRI) to measure a wide
    variety of biological and physical objects.
    The increasing sophistication of imaging technology creates demand for
    equally sophisticated computational techniques to analyze and visualize the
    image data.
    Analysis and visualization codes are often crafted for a specific experiment
    or set of images, thus imaging scientists need support for quickly developing
    codes that are reliable, robust, and efficient.

    In this paper, we present the design and implementation of Diderot, which is a
    parallel domain-specific language for biomedical image analysis and visualization.
    Diderot supports a high-level model of computation that is based on continuous tensor fields.
    These tensor fields are reconstructed from
    discrete image data using separable convolution kernels, but may also be defined by
    applying higher-order operations, such as differentiation.
    Early experiments demonstrate that Diderot provides both a high-level concise notation
    for image analysis and visualization algorithms, as well as high sequential and parallel
    performance.
  },
  topic = {design,concurrency,diderot}
}
@inproceedings{gc-for-multicore-numa,
  author = {Sven Auhagen and Lars Bergstrom and Matthew Fluet and John Reppy},
  title = {Garbage Collection for Multicore {NUMA} Machines},
  booktitle = {Proceedings of the {ACM} {SIGPLAN} Workshop on
    Memory Systems Performance and Correctness ({MSPC} 2011)},
  publisher = {ACM},
  address = {New York, NY},
  year = 2011,
  month = jun,
  pages = {51--57},
  pdf = {2011/mspc-numa-gc.pdf},
  abstract = {
    Modern high-end machines feature multiple processor packages, each of which
    contains multiple independent cores and integrated memory controllers connected
    directly to dedicated physical RAM.
    These packages are connected via a shared bus, creating a system with a
    heterogeneous memory hierarchy.
    Since this shared bus has less bandwidth than the sum of the links to memory,
    aggregate memory bandwidth is higher when parallel threads all access memory
    local to their processor package than when they access memory attached to a
    remote package.
    This bandwidth limitation has traditionally limited the scalability of modern
    functional language implementations, which seldom scale well past 8~cores, even
    on small benchmarks.

    This work presents a garbage collector integrated with our strict, parallel
    functional language implementation, Manticore, and shows that it scales
    effectively on both a 48-core AMD Opteron machine and a 32-core Intel Xeon
    machine.
  },
  topic = {manticore,implementation}
}
@article{implicit-threaded-parallel-jfp,
  author = {Matthew Fluet and Mike Rainey and John Reppy and Adam Shaw},
  title = {Implicitly threaded parallelism in {Manticore}},
  journal = {Journal of Functional Programming},
  publisher = {Cambridge University Press},
  address = {Cambridge, England},
  volume = 20,
  number = {5--6},
  pages = {537--576},
  year = 2011,
  doi = {10.1017/S0956796810000201},
  abstract = {
    The increasing availability of commodity multicore processors is making parallel computing
    ever more widespread. In order to exploit its potential, programmers need languages that
    make the benefits of parallelism accessible and understandable.
    Previous parallel languages have traditionally been intended for large-scale scientific
    computing, and they tend not to be well suited to programming the applications one typically
    finds on a desktop system.
    Thus, we need new parallel-language designs that address a broader spectrum of applications.
    The Manticore project is our effort to address this need. At its core is Parallel ML,
    a high-level functional language for programming parallel applications on commodity multicore
    hardware.
    Parallel ML provides a diverse collection of parallel constructs for different granularities
    of work. In this paper, we focus on the implicitly threaded parallel constructs of the
    language, which support fine-grained parallelism. We concentrate on those elements that
    distinguish our design from related ones, namely, a novel parallel binding form, a
    nondeterministic parallel case form, and the treatment of exceptions in the presence of data
    parallelism.
    These features differentiate the present work from related work on functional data-parallel
    language designs, which have focused largely on parallel problems with regular structure and
    the compiler transformations --- most notably, flattening --- that make such designs feasible.
    We present detailed examples utilizing various mechanisms of the language and give a formal
    description of our implementation.
  },
  topic = {design,concurrency,manticore,implementation}
}
@inproceedings{particles,
  author = {Pavel Krajcevski and John Reppy},
  title = {A Declarative {API} for Particle Systems},
  booktitle = {Proceedings of the Thirteenth International Symposium on
    Practical Aspects of Declarative Languages ({PADL 2011})},
  year = 2011,
  month = jan,
  series = {Lecture Notes in Computer Science},
  volume = 6539,
  publisher = {Springer-Verlag},
  address = {New York, NY},
  pages = {130--144},
  pdf = {2011/padl-particles.pdf},
  abstract = {
    Recent trends in computer-graphics APIs and hardware have made it practical to
    use high-level functional languages for real-time graphics applications.
    Thus we have the opportunity to develop new approaches to computer
    graphics that take advantage of the high-level features of
    functional languages.
    This paper describes one such project that uses the techniques of functional programming
    to define and implement a combinator library for \textit{particle systems}.
    Particle systems are a popular
    technique for rendering fuzzy phenomena, such as fire, smoke, and explosions.
    Using our combinators, a programmer can provide a declarative specification of how a particle
    system behaves.
    This specification includes rules for how particles are created, how they evolve, and how they
    are rendered.
    Our library translates these declarative specifications into a low-level intermediate language
    that can be compiled to run on the GPU or interpreted by the CPU.
  },
  topic = {sml,gui}
}
@inproceedings{lazy-tree-splitting,
  author = {Lars Bergstrom and Mike Rainey and John Reppy and Adam Shaw and Matthew Fluet},
  title = {Lazy Tree Splitting},
  booktitle = {Proceedings of the 15th {ACM} {SIGPLAN} International
    Conference on Functional Programming (ICFP 2010)},
  publisher = {ACM},
  address = {New York, NY},
  year = 2010,
  month = sep,
  pages = {93--104},
  pdf = {2010/icfp-lts.pdf},
  abstract = {
    Nested data-parallelism (NDP) is a declarative style for programming
    irregular parallel applications. NDP languages provide language features
    favoring the NDP style, efficient compilation of NDP programs, and various
    common NDP operations like parallel maps, filters, and sum-like reductions.
    In this paper, we describe the implementation of NDP in Parallel ML
    (PML), part of the Manticore project. Managing the parallel decomposition
    of work is one of the main challenges of implementing NDP. If the
    decomposition creates too many small chunks of work, performance will
    be eroded by too much parallel overhead. If, on the other hand, there
    are too few large chunks of work, there will be too much sequential
    processing and processors will sit idle.

    Recently the technique of Lazy Binary Splitting was proposed for dynamic
    parallel decomposition of work on flat arrays, with promising results.
    We adapt Lazy Binary Splitting to parallel processing of binary trees,
    which we use to represent parallel arrays in PML. We call our technique
    Lazy Tree Splitting (LTS). One of its main advantages is its performance
    robustness: per-program tuning is not required to achieve good performance
    across varying platforms. We describe LTS-based implementations of
    standard NDP operations, and we present experimental data demonstrating
    the scalability of LTS across a range of benchmarks.
  },
  topic = {implementation,concurrency,manticore}
}
@inproceedings{programming-in-manticore,
  author = {Matthew Fluet and Lars Bergstrom and Nic Ford and Mike Rainey and John Reppy
          and Adam Shaw and Yingqi Xiao},
  title = {Programming in {Manticore}, a Heterogeneous Parallel Functional Language},
  booktitle = {Central European Functional Programming School},
  series = {Lecture Notes in Computer Science},
  volume = 6299,
  editor = {Horv{\'a}th, Zolt{\'a}n},
  year = 2010,
  pages = {94--145},
  publisher = {Springer-Verlag},
  address = {New York, NY},
  topic = {concurrency,manticore}
}
@inproceedings{arity-raising,
  author = {Lars Bergstrom and John Reppy},
  title = {Arity Raising in {Manticore}},
  booktitle = {21st International Symposia on Implementation and Application of
    Functional Languages (IFL 2009)},
  year = 2009,
  month = sep,
  series = {Lecture Notes in Computer Science},
  volume = 6041,
  publisher = {Springer-Verlag},
  address = {New York, NY},
  pages = {90--106},
  pdf = {2009/ifl-arity.pdf},
  abstract = {
    Compilers for polymorphic languages are required to treat values in programs in an
    abstract and generic way at the source level.
    The challenges of optimizing the boxing of raw values, flattening of argument
    tuples, and raising the arity of functions that handle complex structures to
    reduce memory usage are old ones, but take on newfound import with processors
    that have twice as many registers.
    We present a novel strategy that uses both control-flow and type information to
    provide an arity raising implementation addressing these problems.
    This strategy is \textit{conservative} --- no matter the execution
    path, the transformed program will not perform extra operations.
  },
  topic = {implementation,manticore}
}
@inproceedings{parallel-cml,
  author = {John Reppy and Claudio Russo and Yingqi Xiao},
  title = {Parallel Concurrent {ML}},
  booktitle = {Proceedings of the 14th {ACM} {SIGPLAN} International
    Conference on Functional Programming (ICFP 2009)},
  publisher = {ACM},
  address = {New York, NY},
  year = 2009,
  month = sep,
  pages = {257--268},
  doi = {10.1145/1596550.1596588},
  pdf = {2009/icfp-parallel-cml.pdf},
  abstract = {
    Concurrent ML (CML) is a high-level message-passing language that supports
    the construction of first-class synchronous abstractions called
    events.  This mechanism has proven quite effective over the years and
    has been incorporated in a number of other languages.  While CML
    provides a concurrent programming model, its implementation has always
    been limited to uniprocessors.  This limitation is exploited in the
    implementation of the synchronization protocol that underlies the event
    mechanism, but with the advent of cheap parallel processing on the
    desktop (and laptop), it is time for Parallel CML.

    Parallel implementations of CML-like primitives for
    Java and Haskell exist, but build on high-level synchronization
    constructs that are unlikely to perform well.  This paper presents a
    novel, parallel implementation of CML that exploits a purpose-built
    optimistic concurrency protocol designed for both correctness and
    performance on shared-memory multiprocessors.
    This work extends and
    completes an earlier protocol that supported just a strict subset of
    CML with synchronization on input, but not output events.  Our main
    contributions are a model-checked reference implementation of the
    protocol and two concrete implementations.  This paper focuses on
    Manticore's functional, continuation-based implementation but briefly
    discusses an independent, thread-based implementation written in C\#
    and running on Microsoft's stock, parallel runtime. Although very
    different in detail, both derive from the same design. Experimental
    evaluation of the Manticore implementation reveals good
    performance, despite the extra overhead of multiprocessor synchronization.
  },
  topic = {cml,concurrency,implementation,manticore}
}
@article{re-derivs,
  author = {Scott Owens and John Reppy and Aaron Turon},
  title = {Regular-expression derivatives reexamined},
  journal = {Journal of Functional Programming},
  publisher = {Cambridge University Press},
  address = {Cambridge, England},
  volume = 19,
  number = 2,
  year = 2009,
  pages = {173--190},
  abstract = {
    Regular-expression derivatives are an old, but elegant, technique for compiling regular
    expressions to deterministic finite-state machines.
    It easily supports extending the regular-expression operators with boolean operations,
    such as intersection and complement.
    Unfortunately, this technique has been lost in the sands of time and few computer
    scientists are aware of it.
    In this paper, we reexamine regular-expression derivatives and report on our experiences in
    the context of two different functional-language implementations.
    The basic implementation is simple and we show how to extend it to handle large character
    sets (e.g., Unicode).
    We also show that the derivatives approach leads to smaller state machines than the
    traditional algorithm given by McNaughton and Yamada.
  },
  topic = {implementation}
}
@inproceedings{varargs,
  author = {Matthias Blume and Mike Rainey and John Reppy},
  title = {Calling Variadic Functions from a Strongly-typed Language},
  booktitle = {Proceedings of the 2008 {ACM} {SIGPLAN} Workshop on {ML}},
  year = 2008,
  month = sep,
  pages = {47--58},
  pdf = {2008/ml-varargs.pdf},
  abstract = {
    The importance of providing a mechanism to call C functions from high-level languages
    has been understood for many years and, these days, almost all statically-typed high-level-language
    implementations provide such a mechanism.
    One glaring omission, however, has been support for calling variadic C functions, such as \texttt{printf}.
    Variadic functions have been ignored because it is not obvious how to give static types to them and
    because it is not clear how to generate calling sequence when the arguments to the function may not be
    known until runtime.
    In this paper, we address this longstanding omission with an extension to the NLFFI foreign-interface
    framework used by Standard ML of New Jersey (SML/NJ) and the MLton SML compiler.
    We describe two different ways of typing variadic functions in NLFFI and an implementation technique
    based on the idea of using state machines to describe calling conventions.
    Our implementation is easily retargeted to new architectures and ABIs, and can also be easily added
    to any HOT language implementation that supports calling C functions.
  },
  topic = {interop,implementation,sml}
}
@inproceedings{implicit-threaded-parallel,
  author = {Matthew Fluet and Mike Rainey and John Reppy and Adam Shaw},
  title = {Implicitly-threaded parallelism in {Manticore}},
  booktitle = {Proceedings of the 13th {ACM} {SIGPLAN} International
    Conference on Functional Programming (ICFP 2008)},
  year = 2008,
  month = sep,
  pages = {119--130},
  pdf = {2008/icfp-datapar.pdf},
  abstract = {
    The increasing availability of commodity multicore processors is making
    parallel computing available to the masses.  Traditional parallel
    languages are largely intended for large-scale scientific computing
    and tend not to be well-suited to programming the applications one
    typically finds on a desktop system.  Thus we need new parallel-language
    designs that address a broader spectrum of applications.  In this
    paper, we present Manticore, a language for building parallel applications
    on commodity multicore hardware including a diverse collection of
    parallel constructs for different granularities of work.  We focus on
    the implicitly-threaded parallel constructs in our high-level functional
    language.  We concentrate on those elements that distinguish our design
    from related ones, namely, a novel parallel binding form, a nondeterministic
    parallel case form, and exceptions in the presence of data parallelism.
    These features differentiate the present work from related work on
    functional data parallel language designs, which has focused largely
    on parallel problems with regular structure and the compiler transformations
    --- most notably, flattening --- that make such designs feasible.  We
    describe our implementation strategies and present some detailed
    examples utilizing various mechanisms of our language.
  },
  topic = {design,concurrency,manticore}
}
@inproceedings{manticore-sched-framework,
  author = {Matthew Fluet and Mike Rainey and John Reppy},
  title = {A scheduling framework for general-purpose parallel languages},
  booktitle = {Proceedings of the 13th {ACM} {SIGPLAN} International
    Conference on Functional Programming (ICFP 2008)},
  year = 2008,
  month = sep,
  pages = {241--252},
  pdf = {2008/icfp-schedule.pdf},
  abstract = {
    The trend in microprocessor design toward multicore and manycore processors
    means that future performance gains in software will largely come from harnessing
    parallelism.
    To realize such gains, we need languages and implementations that can enable
    parallelism at many different levels.
    For example, an application might use both explicit threads to implement coarse-grain
    parallelism for independent tasks and implicit threads for fine-grain data-parallel
    computation over a large array.
    An important aspect of this requirement is supporting a wide range of different
    scheduling mechanisms for parallel computation.

    In this paper, we describe the scheduling framework that we have designed
    and implemented for Manticore, a strict parallel functional language.
    We take a micro-kernel approach in our design: the compiler and runtime support
    a small collection of scheduling primitives upon which complex scheduling policies
    can be implemented.
    This framework is extremely flexible and can support a wide range of
    different scheduling policies.
    It also supports the nesting of schedulers, which is key to both supporting multiple
    scheduling policies in the same application and to hierarchies of
    speculative parallel computations.

    In addition to describing our framework, we also illustrate its expressiveness
    with several popular scheduling techniques.
    We present a (mostly) modular approach to extending our schedulers to support cancellation.
    This mechanism is essential for implementing eager and speculative parallelism.
    We finally evaluate our framework with a series of benchmarks and an analysis.
  },
  topic = {implementation,concurrency,manticore}
}
@inproceedings{toward-parallel-cml,
  author = {John Reppy and Yingqi Xiao},
  title = {Toward a parallel implementation of Concurrent {ML}},
  booktitle = {Proceedings of the Workshop on Declarative Aspects of
    Multicore Programming (DAMP 2008)},
  year = 2008,
  month = jan,
  pdf = {2008/damp-parallel-cml.pdf},
  abstract = {
    Concurrent ML (CML) is a high-level message-passing language that supports
    the construction of first-class synchronous abstractions called
    events.  This mechanism has proven quite effective over the years and
    has been incorporated in a number of other languages.  While CML
    provides a concurrent programming model, its implementation has always
    been limited to uniprocessors.  This limitation is exploited in the
    implementation of the synchronization protocol that underlies the event
    mechanism, but with the advent of cheap parallel processing on the
    desktop (and laptop), it is time for Parallel CML.

    We are pursuing such an implementation as part of the Manticore
    project.  In this paper, we describe a parallel implementation of
    Asymmetric CML (ACML), which is a subset of CML that does not
    support output guards.  We describe an optimistic concurrency
    protocol for implementing CML synchronization.  This protocol has
    been implemented as part of the Manticore system.
  },
  topic = {implementation,cml,concurrency,manticore}
}
@inproceedings{manticore-status-2007,
  author = {Matthew Fluet and Nic Ford and Mike Rainey and John Reppy and Adam Shaw and Yingqi Xiao},
  title = {Status Report: The {Manticore} Project},
  booktitle = {Proceedings of the 2007 {ACM} {SIGPLAN} Workshop on {ML}},
  year = 2007,
  month = oct,
  pages = {15--24},
  pdf = {2007/ml-manticore.pdf},
  abstract = {
    The Manticore project is an effort to design and implement a new
    functional language for parallel programming.
    Unlike many earlier parallel languages, Manticore is a \emph{heterogeneous}
    language that supports parallelism at multiple levels.
    Specifically, we combine CML-style explicit concurrency with fine-grain,
    implicitly threaded, parallel constructs.
    We have been working on an implementation of Manticore for the past six months;
    this paper gives an overview of our design and a report on the status of
    the implementation effort.
  },
  topic = {design,implementation,concurrency,manticore}
}
@inproceedings{trait-metaprogramming,
  author = {John Reppy and Aaron Turon},
  title = {Metaprogramming with Traits},
  booktitle = {Proceedings of the European Conference
    on Object-Oriented Programming (ECOOP 2007)},
  year = 2007,
  month = jul # "--" # aug,
  pages = {373--398},
  pdf = {2007/ecoop-traits.pdf},
  abstract = {
    In many domains, classes have highly regular internal structure.
    For example, so-called business objects often contain boilerplate code for mapping
    database fields to class members.
    The boilerplate code must be repeated per-field for every class, because
    existing mechanisms for constructing classes do not provide a way to
    capture and reuse such member-level structure.
    As a result, programmers often resort to \emph{ad hoc} code generation.
    This paper presents a lightweight mechanism for specifying and reusing member-level
    structure in Java programs.
    The proposal is based on a modest extension to traits
    that we have termed \emph{trait-based metaprogramming}.
    Although the semantics of the mechanism are straightforward, its type
    theory is difficult to reconcile with nominal subtyping.
    We achieve reconciliation by introducing a hybrid structural/nominal type
    system that extends Java's type system.
    The paper includes a formal calculus defined by translation to Featherweight Generic Java.
  },
  topic = {design,object}
}
@inproceedings{specialize-cml,
  author = {John Reppy and Yingqi Xiao},
  title = {Specialization of {CML} message-passing primitives},
  booktitle = {Proceedings of the 34th {ACM} {SIGPLAN}-{SIGACT} Symposium
    on Principles of Programming Languages ({POPL} 2007)},
  year = 2007,
  month = jan,
  pages = {315--326},
  doi = {10.1145/1190216.1190264},
  pdf = {2007/popl-cml-opt.pdf},
  abstract = {
    Concurrent ML (CML) is a statically-typed higher-order concurrent language
    that is embedded in Standard ML.
    Its most notable feature is its support for
    \emph{first-class synchronous operations}.
    This mechanism allows programmers to encapsulate complicated communication
    and synchronization protocols as first-class abstractions, which
    encourages a modular style of programming where the
    underlying channels used to communicate with a given thread are
    hidden behind data and type abstraction.

    While CML has been in active use for well over a decade, little attention
    has been paid to optimizing CML programs.
    In this paper, we present a new program analysis for statically-typed
    higher-order concurrent languages that enables the compile-time specialization
    of communication operations.
    This specialization is particularly important in a multiprocessor or
    multicore setting, where the synchronization overhead for general-purpose
    operations are high.
    Preliminary results from a prototype that we have built demonstrate that
    specialized channel operations are much faster than the general-purpose
    operations.

    Our analysis technique is modular (i.e., it analyzes and optimizes a single
    unit of abstraction at a time), which plays to the modular style of
    many CML programs.
    The analysis consists of three steps: the first is a
    type-sensitive control-flow analysis that uses the program's
    type-abstractions to compute more precise results.
    The second is the construction of an \emph{extended control-flow graph}
    using the results of the CFA.
    The last step is an iterative analysis over the graph that approximates
    the usage patterns of known channels.
    Our analysis is designed to detect special patterns of use, such
    as one-shot channels, fan-in channels, and fan-out channels.
    We have proven the safety of our analysis and state those results.
  },
  topic = {implementation,cml,concurrency,manticore}
}
@inproceedings{manticore,
  author = {Matthew Fluet and Mike Rainey and John Reppy and Adam Shaw and Yingqi Xiao},
  title = {Manticore: A heterogeneous parallel language},
  booktitle = {Proceedings of the Workshop on Declarative Aspects of
    Multicore Programming (DAMP 2007)},
  year = 2007,
  month = jan,
  pages = {37--44},
  pdf = {2007/damp-manticore.pdf},
  abstract = {
    The Manticore project is an effort to design and implement a new
    functional language for parallel programming.
    Unlike many earlier parallel languages, Manticore is a \emph{heterogeneous}
    language that supports parallelism at multiple levels.
    Specifically, we combine CML-style explicit concurrency with NESL/Nepal-style
    data-parallelism.
    In this paper, we describe and motivate the design of the Manticore language.
    We also describe a flexible runtime model that supports multiple scheduling
    disciplines (\emph{e.g.}, for both fine-grain and coarse-grain parallelism) in
    a uniform framework.
    Work on a prototype implementation is ongoing and we give a status report.
  },
  topic = {design,implementation,concurrency,manticore}
}
@inproceedings{app-spec-fig,
  author = {John Reppy and Chunyan Song},
  title = {Application-specific foreign-interface generation},
  booktitle = {Proceedings of the Fifth International Conference on
    Generative Programming and Component Engineering},
  pages = {49--58},
  year = 2006,
  month = oct,
  pdf = {2006/gpce-fig.pdf},
  abstract = {
    A foreign interface (FI) mechanism to support interoperability with
    libraries written in other languages (especially C) is an important
    feature in most high-level language implementations.  Such FI
    mechanisms provide a Foreign Function Interface (FFI) for the
    high-level language to call C functions and marshaling and
    unmarshaling mechanisms to support conversion between the high-level
    and C data representations.  Often, systems provide tools to automate
    the generation of FIs, but these tools typically lock the user into a
    specific model of interoperability.  It is our belief that the policy
    used to craft the mapping between the high-level language and C should
    be distinct from the underlying mechanism used to implement the
    mapping.

    In this paper, we describe a FI generation tool, called FIG (for
    Foreign Interface Generator) that embodies a new approach to the
    problem of generating foreign interfaces for high-level languages.
    FIG takes as input raw C header files plus a declarative script that
    specifies the generation of the foreign interface from the header
    file.  The script sets the policy for the translation, which allows
    the user to tailor the resulting FI to his or her application.  We
    call this approach application-specific foreign-interface generation.
    The scripting language uses rewriting strategies as its execution
    model.  The other major feature of the scripting language is a novel
    notion of composable typemaps that describe the mapping between
    high-level and low-level types.
  },
  topic = {interop,moby}
}
@inproceedings{typed-cfa,
  author = {John Reppy},
  title = {Type-sensitive control-flow analysis},
  booktitle = {Proceedings of the 2006 {ACM} {SIGPLAN} Workshop on {ML}},
  year = 2006,
  month = sep,
  pages = {74--83},
  doi = {10.1145/1159876.1159888},
  pdf = {2006/ml-typed-cfa.pdf},
  abstract = {
    Higher-order typed languages, such as ML, provide strong
    support for data and type abstraction.  While such abstraction
    is often viewed as costing performance, there are situations
    where it may provide opportunities for more aggressive program
    optimization. Specifically, we can exploit the fact that type
    abstraction guarantees representation independence, which allows
    the compiler to specialize data representations.  This paper
    describes a first step in supporting such optimizations; namely
    a control-flow analysis that uses the program's type information
    to compute more precise results.  We present our algorithm as an
    extension of Serrano's version of 0-CFA and we show that it
    respects types.  We also discuss applications of the analysis with
    examples of optimizations enabled by the analysis that would not
    be possible with normal CFA.
  },
  topic = {implementation,sml}
}
@inproceedings{trait-metaprogramming-foundation,
  author = {John Reppy and Aaron Turon},
  title = {A Foundation for Trait-based Metaprogramming},
  booktitle = {2006 International Workshop on Foundations and Developments
    of Object-Oriented Languages},
  year = 2006,
  month = jan,
  pdf = {2006/foolwood-traits.pdf},
  abstract = {
    Sch{\"a}rli et al. introduced traits as reusable units of behavior
    independent of the inheritance hierarchy.  Despite their relative
    simplicity, traits offer a surprisingly rich calculus.  Trait calculi
    typically include operations for resolving conflicts when composing
    two traits.  In the existing work on traits, these operations (method
    exclusion and aliasing) are \emph{shallow}, i.e., they have no effect
    on the body of the other methods in the trait.  In this paper, we present
    a new trait system, based on the Fisher-Reppy trait calculus, that adds
    deep operations (method hiding and renaming) to support conflict resolution.
    The proposed operations are \emph{deep} in the sense that they preserve any existing
    connections between the affected method and the other methods of the
    trait.  Our system uses Riecke-Stone dictionaries to support these features.
    In addition, we define a more fine-grained mechanism for tracking
    trait types than in previous systems. The resulting calculus is more flexible
    and expressive, and can serve as the foundation for trait-based
    metaprogramming, an idiom we introduce.  A companion technical report proves
    type soundness for our system; we state the key results in this paper.
  },
  topic = {object}
}
@book{sml-basis-library,
  editor    = {Emden R. Gansner and John H. Reppy},
  title     = {The {Standard} {ML} Basis Library},
  publisher = {Cambridge University Press},
  year      = 2004,
  url       = {http://www.standardml.org/Basis/},
  topic     = {design,sml}
}
@inproceedings{typed-traits:fool,
  author = {Kathleen Fisher and John Reppy},
  title = {A Typed Calculus of Traits},
  booktitle = {Proceedings of the 11th Workshop on Foundations of
    Object-oriented Programming},
  year = 2004,
  month = jan,
  pdf = {2004/fool-traits.pdf},
  topic = {object}
}
@techreport{moby-oop-tr,
  author = {Kathleen Fisher and John Reppy},
  title = {Object-oriented aspects of {Moby}},
  institution = {Department of Computer Science, University of Chicago},
  address = {Chicago, IL},
  number = {TR-2003-10},
  year = 2003,
  month = sep,
  topic = {design,moby,object}
}
@techreport{typed-traits-tr,
  author      = {Kathleen Fisher and John Reppy},
  title       = {Statically Typed Traits},
  institution = {Department of Computer Science, University of Chicago},
  address     = {Chicago, IL},
  number      = {TR-2003-13},
  year        = 2003,
  month       = dec,
  topic       = {object}
}
@article{reppy:nested-loops,
  author = {John Reppy},
  title = {Optimizing Nested Loops Using Local {CPS} Conversion},
  journal = {Higher-Order and Symbolic Computation},
  publisher = {Kluwer Academic Publishers},
  address = {Boston, MA},
  volume = 15,
  number = {2/3},
  year = 2002,
  month = sep,
  pages = {161--180},
  pdf = {2002/hosc-lcps.pdf},
  topic = {moby,implementation}
}
@article{class-types,
  author = {Kathleen Fisher and John Reppy},
  title = {Inheritance-based subtyping},
  journal = {Information and Computation},
  publisher = {Academic Press, Inc.},
  address = {London, UK},
  volume = 177,
  number = 1,
  year = 2002,
  month = aug,
  pages = {28--55},
  pdf = {2002/inc-inher-subty.pdf},
  abstract = {
    Classes play a dual role in mainstream statically-typed
    object-oriented languages, serving as both object generators and
    object types.  In such languages, inheritance implies subtyping.
    In contrast, the theoretical language community has viewed this
    linkage as a mistake and has focused on subtyping relationships
    determined by the structure of object types, without regard to their
    underlying implementations.  In this paper, we explore why
    inheritance-based subtyping relations are useful, and we describe
    two different approaches to extending the Moby programming language
    with inheritance-based subtyping relations.  In addition, we present
    a typed object calculus that supports both structural and
    inheritance-based subtyping, and which provides a formal accounting
    of our extensions to Moby.
  },
  topic = {design,moby,object}
}
@techreport{tm-moby-concur,
  author = {Kathleen Fisher and John Reppy},
  title = {Compiler support for lightweight concurrency},
  type = {Technical Memorandum},
  institution = {Bell Labs},
  year = 2002,
  month = mar,
  pdf = {2002/tm-lightweight-concur.pdf},
  topic = {moby,concurrency,implementation}
}
@inproceedings{moby-interop-framework,
  author = {Kathleen Fisher and Riccardo Pucella and John Reppy},
  title = {A framework for interoperability},
  booktitle = {Proceedings of the First International
    Workshop on Multi-Language Infrastructure
    and Interoperability (BABEL'01)},
  series = {Electronic Notes in Theoretical Computer Science},
  volume = {59},
  number = {1},
  publisher = {Elsevier Science Publishers},
  address = {New York, NY},
  editor = {Nick Benton and Andrew Kennedy},
  year = 2001,
  month = sep,
  pdf = {2001/babel-interop.pdf},
  abstract = {
    Practical implementations of high-level languages must provide access to libraries and system
    services that have APIs specified in a low-level language (usually C).  An important characteristic
    of such mechanisms is the foreign-interface policy that defines how to bridge the semantic gap
    between the high-level language and C. For example, IDL-based tools generate code to marshal
    data into and out of the high-level representation according to user annotations. The design
    space of foreign-interface policies is large and there are pros and cons to each approach. Rather
    than commit to a particular policy, we choose to focus on the problem of supporting a gamut of
    interoperability policies. In this paper, we describe a framework for language interoperability
    that is expressive enough to support very efficient implementations of a wide range of different
    foreign-interface policies. We describe two tools that implement substantially different policies
    on top of our framework and present benchmarks that demonstrate their efficiency.
  },
  topic = {moby,interop}
}
@inproceedings{async-exn,
  author    = {Simon Marlow and Simon Peyton Jones and Andrew Moran and John Reppy},
  title     = {Asynchronous exceptions in {Haskell}},
  booktitle = {Proceedings of the SIGPLAN 2001 Conference on Programming
               Language Design and Implementation},
  year      = 2001,
  month     = jun,
  topic     = {design,concurrency}
}
@inproceedings{reppy:lcps,
  author = {John Reppy},
  title = {Local {CPS} conversion in a direct-style compiler},
  booktitle = {Proceedings of the Third {ACM} {SIGPLAN}
    Workshop on Continuations (CW'01)},
  year = 2001,
  month = jan,
  pages = {13--22},
  pdf = {2001/cw-lcps.pdf},
  topic = {moby,implementation}
}
@inproceedings{hotos-protium,
  author = {Cliff Young and Y. N. Lakshman and Tom Szymanski and John Reppy and David Presotto and
    Rob Pike and Girija Narlikar and Sape Mullender and Eric Grosse},
  title = {Protium: An infrastructure for partitioned applications},
  booktitle = {Proceedings of the Eighth Workshop on Hot Topics in Operating Systems (HotOS-VIII)},
  year = 2001,
  month = jan,
  pdf = {2001/hotos-final.pdf},
  topic = {concurrency}
}
@inproceedings{ecoop00:class-types,
  author = {Kathleen Fisher and John Reppy},
  title = {Extending {Moby} with inheritance-based subtyping},
  booktitle = {Proceedings of the European Conference
    on Object-Oriented Programming},
  year = 2000,
  month = jun,
  series = {Lecture Notes in Computer Science},
  volume = 1850,
  publisher = {Springer-Verlag},
  address = {New York, NY},
  pages = {83--107},
  pdf = {2000/ecoop-inher-subty.pdf},
  topic = {design,moby,object}
}
@inproceedings{esop00:links,
  author = {Kathleen Fisher and John Reppy and Jon Riecke},
  title = {A Calculus for Compiling and Linking Classes},
  booktitle = {Proceedings of the European Symposium on Programming},
  year = 2000,
  month = mar # "/" # apr,
  series = {Lecture Notes in Computer Science},
  volume = 1782,
  publisher = {Springer-Verlag},
  address = {New York, NY},
  pages = {134--149},
  pdf = {2000/esop-links.pdf},
  topic = {object,implementation}
}
@inproceedings{inheritance-subtyping,
  author    = {Kathleen Fisher and John Reppy},
  title     = {Inheritance-based subtyping},
  booktitle = {Proceedings of the 7th Workshop on Foundations of
               Object-oriented Programming},
  year      = 2000,
  month     = jan,
  pdf       = {2000/fool-inher-subty.pdf},
  topic     = {design,moby,object}
}
@techreport{data-level-interop,
  author = {Kathleen Fisher and Riccardo Pucella and John Reppy},
  title = {Data-level interoperability},
  type = {Technical Memorandum},
  institution = {Bell Labs, Lucent Technologies},
  year = 2000,
  month = apr,
  topic = {moby,implementation,interop}
}
@book{reppy:cml-book,
  author    = {John H. Reppy},
  title     = {Concurrent Programming in {ML}},
  publisher = {Cambridge University Press},
  address   = {Cambridge, England},
  year      = 1999,
  topic     = {concurrency,cml,sml}
}
@inproceedings{moby-classes,
  author = {Kathleen Fisher and John Reppy},
  title = {The design of a class mechanism for {Moby}},
  booktitle = {Proceedings of the SIGPLAN 1999 Conference on Programming
    Language Design and Implementation},
  publisher = {ACM},
  address = {New York, NY},
  year = 1999,
  month = may,
  pages = {37--49},
  topic = {design,moby,object}
}
@techreport{moby-foundations,
  author = {Kathleen Fisher and John Reppy},
  title = {Foundations for {Moby} classes},
  type = {Technical Memorandum},
  institution = {Bell Labs, Lucent Technologies},
  address = {Murray Hill, NJ},
  year = 1999,
  month = feb,
  topic = {moby,object}
}
@incollection{cml-essence,
  title = {The Essence of Concurrent {ML}},
  author = {Prakash Panangaden and John Reppy},
  chapter = {1},
  booktitle = {{ML} with Concurrency},
  publisher = {Springer-Verlag},
  editor = {Flemming Nielson},
  year = 1997,
  topic = {concurrency,cml}
}
@article{njc-aml,
  title = {{AML}: Attribute Grammars in {ML}},
  author = {S. G. Efremidis and K. A. Mughal and L. S{\o}raas and John Reppy},
  journal = {Nordic Journal of Computing},
  volume = 4,
  number = 1,
  year = 1997,
  topic = {sml,gui}
}
@inproceedings{oml-classes,
  title = {Classes in {Object} {ML} via modules},
  author = {John H. Reppy and Jon G. Riecke},
  booktitle = {Proceedings of the Third Workshop on Foundations of
    Object-oriented Programming},
  year = 1996,
  month = jul,
  topic = {design,object}
}
@inproceedings{simple-objects-for-sml,
  title = {Simple objects for {SML}},
  author = {John H. Reppy and Jon G. Riecke},
  booktitle = {Proceedings of the SIGPLAN 1996 Conference on Programming
    Language Design and Implementation},
  year = 1996,
  month = may,
  publisher = {ACM},
  address = {New York, NY},
  pages = {171--180},
  topic = {design,object}
}
@article{tolplas-olden,
  author = {Martin Carlisle and Laurie J. Hendren and Anne Rogers and John Reppy},
  title = {Supporting {SPMD} Execution for Dynamic Data Structures},
  journal = {ACM Transactions on Programming Languages and Systems},
  volume = 17,
  number = 2,
  year = 1995,
  month = mar,
  pages = {233--263},
  topic = {concurrency,implementation}
}
@inproceedings{list-unroll,
  author = {Zhong Shao and John Reppy and Andrew Appel},
  title = {Unrolling lists},
  booktitle = {ACM Conference on Lisp and Functional Programming},
  year = 1994,
  month = jun,
  pages = {185--195},
  topic = {sml,implementation}
}
@inproceedings{mlrisc,
  author = {Lal George and Florent Guillaume and John Reppy},
  title = {A portable and optimizing back end for the {SML/NJ} compiler},
  booktitle = {Fifth International Conference on Compiler Construction},
  year = 1994,
  month = apr,
  pages = {83--97},
  topic = {sml,implementation}
}
@inproceedings{olden-experience,
  author    = {Martin Carlisle and Anne Rogers and John Reppy and Laurie Hendren},
  title     = {Early experiences with {Olden}},
  booktitle = {6th International Workshop on Languages and Compilers for Parallel Computing},
  series    = {Lecture Notes in Computer Science},
  number    = 768,
  year      = 1993,
  month     = aug,
  topic     = {concurrency,implementation}
}
@incollection{multithreaded-windows,
  author = {Emden R. Gansner and John H. Reppy},
  title = {A Multi-threaded Higher-order User Interface Toolkit},
  booktitle = {User Interface Software, Bass and Dewan (Eds.)},
  series = {Software Trends},
  volume = 1,
  publisher = {John Wiley \& Sons},
  year = 1993,
  pages = {61--80},
  pdf = {1993/trends-eXene.pdf},
  topic = {gui,cml,concurrency}
}
@incollection{cml-design-etc,
  author    = {John H. Reppy},
  title     = {Concurrent {ML}: Design, application and semantics},
  booktitle = {Functional Programming, Concurrency, Simulation and Automated Reasoning},
  editor    = {Peter Lauer},
  series    = {Lecture Notes in Computer Science},
  number    = 693,
  publisher = {Springer-Verlag},
  address   = {New York, NY},
  year      = 1993,
  topic     = {cml,concurrency}
}
@techreport{tm-gc,
  author      = {John H. Reppy},
  title       = {A High-performance Garbage Collector for Standard {ML}},
  type        = {Technical Memo},
  institution = {AT\&T Bell Laboratories},
  year        = 1993,
  month       = dec,
  pdf         = {1993/tm-gc.pdf},
  topic       = {implementation,sml}
}
@inproceedings{spmd-dynamic-data,
  author    = {Anne Rogers and John Reppy and Laurie Hendren},
  title     = {Supporting {SPMD} Execution for Dynamic Data Structures},
  booktitle = {5th International Workshop on Languages and Compilers for Parallel Computing},
  series    = {Lecture Notes in Computer Science},
  number    = 757,
  year      = 1992,
  month     = aug,
  topic     = {concurrency,implementation}
}
@techreport{absconst-for-sml,
  title = {Abstract Value Constructors: Symbolic Constants for Standard {ML}},
  author = {William E. Aitken and John H. Reppy},
  institution = {Department of Computer Science, Cornell University},
  number = {TR 92-1290},
  year = 1992,
  month = jun,
  note = {A shorter version appears in the proceedings of the ``ACM SIGPLAN
    Workshop on ML and its Applications,'' 1992},
  abstract = {
    Standard ML (SML) has been used to implement a wide variety of large
    systems, such as compilers, theorem provers, graphics libraries, and
    even operating systems.  While SML provides a
    convenient, high-level notation for programming large applications,
    it does have certain deficiencies. One such deficiency is the
    lack of a general
    mechanism for assigning symbolic names to constant values.
    In this paper, we present a simple extension of SML that corrects this
    deficiency in a way that fits naturally with the semantics of SML.
    Our proposal is a generalization of SML's datatype constructors:
    we introduce \emph{constants} that generalize nullary
    datatype constructors (like \texttt{nil}), and \emph{templates} that generalize
    non-nullary datatype constructors (like \texttt{::}).
    Constants are identifiers bound to fixed values, and
    templates are identifiers bound to structured values with labeled
    holes.  Templates are useful because they allow users to
    treat the representation of structured data abstractly without
    having to give up pattern matching.
  },
  pdf = {1992/tr-sml-const.pdf},
  topic = {design,sml}
}
@inproceedings{aml,
  title = {Attribute grammars in {ML}},
  author = {S.G. Efremidis and K.A. Mughal and John Reppy},
  booktitle = {{ACM} {SIGPLAN} Workshop on {ML} and its Applications},
  year = 1992,
  month = jun,
  topic = {sml,gui}
}
@phdthesis{reppy:phd,
  author = {John H. Reppy},
  title  = {Higher-order Concurrency},
  school = {Cornell University},
  year   = 1992,
  note   = {Available as Computer Science Technical Report 92-1285},
  topic  = {concurrency, implementation, sml, cml}
}
@incollection{gui-foundation,
  author    = {Emden R. Gansner and John H. Reppy},
  title     = {A Foundation for User Interface Construction},
  booktitle = {Languages for Developing User Interfaces},
  chapter   = {14},
  editor    = {Brad Myers},
  publisher = {Jones and Bartlett},
  year      = 1992,
  topic     = {concurrency, gui}
}
@inproceedings{eXene,
  author = {Emden R. Gansner and John H. Reppy},
  title = {{eXene}},
  booktitle = {Third International Workshop on {ML}},
  year = 1991,
  month = sep,
  pdf = {1991/ml-exene.pdf},
  topic = {cml,concurrency,gui}
}
@inproceedings{reppy:cml,
  author = {John H. Reppy},
  title = {{CML}: A higher-order concurrent language},
  booktitle = {Proceedings of the SIGPLAN 1991 Conference on Programming
    Language Design and Implementation},
  publisher = {ACM},
  address = {New York, NY},
  year = 1991,
  month = jun,
  pages = {293--305},
  pdf = {1991/pldi-cml.pdf},
  topic = {concurrency, implementation, sml, cml}
}
@techreport{reppy:signals,
  author      = {John H. Reppy},
  title       = {Asynchronous signals in {Standard ML}},
  institution = {Department of Computer Science, Cornell University},
  address     = {Ithaca, NY},
  number      = {TR 90-1144},
  year        = 1990,
  month       = aug,
  pdf         = {1990/cornell-tr-1144.pdf},
  topic       = {concurrency, implementation, sml}
}
@inproceedings{pldi-pml,
  author    = {John H. Reppy},
  title     = {Synchronous Operations as First-class Values},
  booktitle = {Proceedings of the SIGPLAN 1988 Conference on Programming
               Language Design and Implementation},
  year      = 1988,
  month     = jun,
  topic     = {concurrency, design}
}
@inproceedings{pegasus-gc,
  author = {Steven C. North and John H. Reppy},
  title = {Concurrent Garbage Collection on Stock Hardware},
  booktitle = {Third International Conference on Functional
    Programming Languages and Computer Architecture},
  year = 1987,
  month = sep,
  series = {Lecture Notes in Computer Science},
  volume = 274,
  publisher = {Springer-Verlag},
  address = {New York, NY},
  pages = {113--133},
  topic = {implementation,concurrency}
}
@inproceedings{pegasus,
  author = {John H. Reppy and Emden R. Gansner},
  title = {A foundation for programming environments},
  booktitle = {Proceedings of the {ACM} {SIGSOFT/SIGPLAN} Software
    Engineering Symposium on Practical Software Development Environments},
  year = 1986,
  month = dec,
  pages = {218--227},
  topic = {gui}
}

This file was generated by bibtex2html 1.98.