This repository has been archived by the owner on Nov 25, 2018. It is now read-only.

Release v1.0.0-beta.2
Tombana committed Apr 30, 2016
2 parents b2fb0d0 + 101e675 commit 9be368e
Showing 51 changed files with 684 additions and 136 deletions.
17 changes: 17 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,22 @@
# Changelog

## 1.0.0-beta.2 - 2016-04-21

### Added
- Add support for Epiphany SDK version 2016.3
- Add debug target to build system

### Fixed
- Fix non-aligned transfers using `ebsp_memcpy` (@kmate)
- Fix unnecessary arguments in interrupt handlers
- Make target barrier volatile
- Fix minor errors and inconsistencies in the documentation
- Fix a race condition involving the DMA interrupt
- Fix LU decomposition example by removing fast-math optimization flag

### Removed
- Remove srec support from examples and tests

## 1.0b - 2015-10-21

### Added
19 changes: 12 additions & 7 deletions Makefile
@@ -27,12 +27,12 @@ E_ASM_SRCS = \
e_bsp_raw_time.s

E_HEADERS = \
include/common.h \
include/ebsp_common.h \
include/e_bsp.h \
include/e_bsp_private.h

HOST_HEADERS = \
include/common.h \
include/ebsp_common.h \
include/host_bsp.h \
include/host_bsp_private.h

@@ -51,7 +51,8 @@ INCLUDES = -I/usr/include/esdk \
HOST_LIBS= -L${ESDK}/tools/host/lib \
-le-hal

E_FLAGS = -std=c99 -O3 -fno-strict-aliasing -ffast-math -fno-tree-loop-distribute-patterns -Wall -Wfatal-errors
CCFLAGS = -std=c99 -O3 -Wall -Wfatal-errors
EFLAGS = -std=c99 -O3 -fno-strict-aliasing -ffast-math -fno-tree-loop-distribute-patterns -Wall -Wfatal-errors

E_OBJS = $(E_SRCS:%.c=bin/e/%.o) $(E_ASM_SRCS:%.s=bin/e/%.o)
HOST_OBJS = $(HOST_SRCS:%.c=bin/host/%.o)
@@ -64,25 +65,29 @@ vpath %.s src

bin/host/%.o: %.c $(HOST_HEADERS)
@echo "CC $<"
@$(ARM_PLATFORM_PREFIX)gcc -O3 -Wall -Wfatal-errors -std=c99 $(INCLUDES) -c $< -o $@ ${HOST_LIBS}
@$(ARM_PLATFORM_PREFIX)gcc $(CCFLAGS) $(INCLUDES) -c $< -o $@ ${HOST_LIBS}

# C code to object file
bin/e/%.o: %.c $(E_HEADERS)
@echo "CC $<"
@$(E_PLATFORM_PREFIX)gcc $(E_FLAGS) $(INCLUDES) -c $< -o $@ -le-lib
@$(E_PLATFORM_PREFIX)gcc $(EFLAGS) $(INCLUDES) -c $< -o $@ -le-lib

# Assembly to object file
bin/e/%.o: %.s $(E_HEADERS)
@echo "CC $<"
@$(E_PLATFORM_PREFIX)gcc $(E_FLAGS) -c $< -o $@ -le-lib
@$(E_PLATFORM_PREFIX)gcc $(EFLAGS) -c $< -o $@ -le-lib

# C code to assembly
bin/e/%.s: %.c $(E_HEADERS)
@echo "CC $<"
@$(E_PLATFORM_PREFIX)gcc $(E_FLAGS) $(INCLUDES) -fverbose-asm -S $< -o $@
@$(E_PLATFORM_PREFIX)gcc $(EFLAGS) $(INCLUDES) -fverbose-asm -S $< -o $@

all: host e

debug: CCFLAGS += -DDEBUG -g
debug: EFLAGS += -DDEBUG
debug: host e

host: host_dirs lib/$(HOST_LIBNAME)$(LIBEXT)

e: e_dirs lib/$(E_LIBNAME)$(LIBEXT)
6 changes: 5 additions & 1 deletion README.md
@@ -130,6 +130,10 @@ If you are using EBSP, or have any questions, remarks or ideas then please get i
- Abe Wits
- Jan-Willem Buurlage.

Also thanks to:

- Máté Karácsony

## Issues

The [issue tracker](https://github.com/coduin/epiphany-bsp/issues) is hosted on GitHub. We welcome pull requests, please pull request against the develop branch. Read [the GitHub flow guide](https://guides.github.com/introduction/flow/) for details.
The [issue tracker](https://github.com/coduin/epiphany-bsp/issues) is hosted on GitHub. We welcome pull requests; please open them against the develop branch and add your name to the authors section of this README. Read [the GitHub flow guide](https://guides.github.com/introduction/flow/) for details.
1 change: 1 addition & 0 deletions docs/README.md
@@ -6,6 +6,7 @@ Generating the documentation pages requires the following programs and packages:

- Sphinx (`python-sphinx` package)
- Breathe (`breathe` python package. First install `python-pip` and run `sudo pip install breathe`)
- sphinxcontrib-googleanalytics (`sudo pip install sphinxcontrib-googleanalytics`)
- Doxygen (`doxygen` package)
- `sphinx_rtd_theme` (comes with `python-sphinx`)

4 changes: 4 additions & 0 deletions docs/conf.py
@@ -39,6 +39,7 @@
'sphinx.ext.ifconfig',
'sphinx.ext.viewcode',
'breathe',
'sphinxcontrib.googleanalytics',
]

# Breathe settings
@@ -81,6 +82,9 @@
# The full version, including alpha/beta/rc tags.
release = '1.0-beta'

# google analytics ID
googleanalytics_id = 'UA-59249373-1'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
2 changes: 1 addition & 1 deletion docs/index.rst
@@ -8,7 +8,7 @@
Welcome to Epiphany BSP's documentation!
========================================

Epiphany BSP is a library for developing applications for the `Parallella board <www.parallella.org>`_. It is easy to setup and use, and provides powerful mechanisms for writing optimized parallel programs.
Epiphany BSP is a library for developing applications for the `Parallella board <https://www.parallella.org>`_. It is easy to set up and use, and provides powerful mechanisms for writing optimized parallel programs.

This documentation provides an introduction to each component of the
Epiphany BSP (EBSP) library. Each section introduces a number of new EBSP functions
2 changes: 1 addition & 1 deletion docs/memory_details.rst
@@ -17,7 +17,7 @@ Memory Types
------------

The Epiphany cores have access to two types of memory.
Both types can be accessed directly (e.g. by dereferencing a pointer). Here we will give a short overview of these two types. For more details see the `Epiphany architecture reference <www.adapteva.com/docs/epiphany_arch_ref.pdf>`_.
Both types can be accessed directly (e.g. by dereferencing a pointer). Here we will give a short overview of these two types. For more details see the `Epiphany architecture reference <http://www.adapteva.com/docs/epiphany_arch_ref.pdf>`_.

All addresses shown below are the ones used by the Epiphany cores. They can **not** be used directly by the ARM processor.
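For example, a kernel reaches both kinds of memory with an ordinary pointer dereference; a minimal sketch (illustrative only, the actual address ranges are the ones listed below)::

    int local_value = 0;            // lives in the core's 32K local memory
    int* p = &local_value;
    *p = 42;                        // plain dereference, no special API required

    // A pointer into external memory (for example one passed to the kernel by
    // the host) is dereferenced in exactly the same way -- it is just much
    // slower to access.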

8 changes: 4 additions & 4 deletions docs/streaming.rst
@@ -16,7 +16,7 @@ Making and using down streams

There are two types of streams, *up* and *down* streams. A *down* stream contains data to be processed by an Epiphany core, while an *up* stream contains results from computations performed by the Epiphany core. Every stream (both up and down) has a *target processor*, *total size* and a *chunk size*. The target processor is simply the processor id of the core that should receive the content of the stream. The total size is the total number of bytes of the entire set of data. This set of data then gets partitioned into chunks consisting of the number of bytes set by the chunk size. This size need not be constant (i.e. it may vary over a single stream), but for our discussion here we will assume that it is constant.

A stream is created before the call to ``ebsp_spmd`` on the host processor. The host prepares the data to be processed by the Epiphany cores, and the EBSP library then performs the necessary work needed for each core to receives its chunk. A stream is created as follows::
A stream is created before the call to ``ebsp_spmd`` on the host processor. The host prepares the data to be processed by the Epiphany cores, and the EBSP library then performs the necessary work needed for each core to receive its chunk. Note that this data is copied efficiently to external memory upon creation of the stream, so the user data should be stored in ordinary RAM, e.g. allocated by a call to ``malloc``. A stream is created as follows::

// on the host
int count = 256;
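To make this concrete, here is a minimal host-side sketch of filling a buffer and creating one down stream per core. It assumes the four-argument form ``ebsp_create_down_stream(source, pid, total_size, chunk_size)`` and the illustrative sizes ``count`` and ``count_in_chunk``, so treat it as a sketch rather than the exact example from the manual::

    // on the host, after bsp_init() and bsp_begin()
    int count = 256;                    // total number of floats per core
    int count_in_chunk = 32;            // floats per chunk
    float* data = malloc(count * sizeof(float));
    for (int i = 0; i < count; i++)
        data[i] = (float)i;             // example data, stored in ordinary RAM
    for (int s = 0; s < bsp_nprocs(); s++)
        ebsp_create_down_stream(data, s,
                                count * sizeof(float),
                                count_in_chunk * sizeof(float));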
@@ -61,11 +61,11 @@ Moving results back up
Up streams work very similarly to down streams; however, no data has to be supplied by the host since it is generated by the Epiphany. We construct an up stream in the following way::

// on the host
// .. create down stream (see above)
// .. create up stream (see above)
void* upstream_data = malloc(sizeof(void*) * bsp_nprocs());
for (int s = 0; s < bsp_nprocs(); s++) {
upstream_data[s] = ebsp_create_down_stream(
s, count * sizeof(float), count_in_chunk * sizeof(float));
upstream_data[s] = ebsp_create_up_stream(
s, chunks * chunksize, chunks);
}

The array ``upstream_data`` holds pointers to the data generated by each processor. In the kernel you can *open* these streams similarly to down streams::
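A rough kernel-side sketch only: the names ``ebsp_open_up_stream``, ``ebsp_move_chunk_up`` and ``ebsp_close_up_stream`` are taken from the kernel streaming API described later in this documentation, and their exact signatures should be treated as an assumption here::

    // in the kernel
    void* chunk = NULL;
    ebsp_open_up_stream(&chunk, 0);     // stream id 0; chunk points into local memory
    float* out = (float*)chunk;
    // ... write at most one chunk's worth of results into out[] ...
    ebsp_move_chunk_up(&chunk, 0, 0);   // push the finished chunk to external memory
    ebsp_close_up_stream(0);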
18 changes: 9 additions & 9 deletions docs/variables.rst
@@ -11,7 +11,7 @@ BSP Variables
Registering, putting and getting
--------------------------------

If we want to write more interesting EBSP programs, we need to have a way to communicate between the different Epiphany cores. In EBSP communication happens in one of two ways: using message passing, which we will introduce later, or via *registered variables*. An EBSP variable exists on every processor, but it does not have to have the same size on every Epiphany core.
If we want to write more interesting EBSP programs, we need to have a way to communicate between the different Epiphany cores. In EBSP communication happens in one of two ways: using message passing, which we will introduce later, or via *registered variables*. An EBSP variable exists on every processor, but does not necessarily have the same size on every Epiphany core.

Variable registration
^^^^^^^^^^^^^^^^^^^^^
@@ -22,14 +22,14 @@ We register a variable by calling ``bsp_push_reg``::
bsp_push_reg(&a, sizeof(int));
bsp_sync();

Here we declare an integer ``a``, and initialize it with zero. Next we *register* the variable with BSP system, by passing its local location, and its size.
Here we declare an integer ``a``, and initialize it with zero. Next we *register* the variable with the BSP system, by passing its local location, and its size.

To ensure that all cores have registered a variable, we perform a barrier synchronisation after the registration. The Epiphany cores will halt execution until *every other core* reaches this point in the program, so it *synchronizes* the program execution between the Epiphany cores. Only *one variable may be declared between calls to ``bsp_sync``*!
To ensure that all cores have registered a variable, we perform a barrier synchronisation after the registration. The Epiphany cores will halt execution until *every other core* reaches this point in the program, so it *synchronizes* the program execution between the Epiphany cores. Only *one variable may be declared between calls to* ``bsp_sync``!

Putting and getting values
^^^^^^^^^^^^^^^^^^^^^^^^^^

Registered variables can be written to or read from by other cores. In BSP this is refered to as *putting* something in a variable, or *getting* the value of a variable. To write for example our processor ID to the *next core* we can write::
Registered variables can be written to or be read from by other cores. In BSP this is referred to as *putting* something in a variable, or *getting* the value of a variable. To write for example our processor ID to the *next core* we can write::

int b = s;
bsp_put((s + 1) % p, &b, &a, 0, sizeof(int));
@@ -85,7 +85,7 @@ The arguments for ``bsp_get`` are:
4. A pointer to the local destination.
5. The number of bytes to copy.
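Putting the five arguments together, a call that mirrors the earlier ``bsp_put`` example could look as follows (a sketch; ``a`` is the registered variable on the remote core and ``b`` the local destination)::

    int b = 0;
    bsp_get((s + 1) % p, &a, 0, &b, sizeof(int));
    bsp_sync();     // after the synchronisation, b holds the next core's value of a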

And again, we perform a barrier synchronisation to ensure the data has been transferred. If you are familiar with concurrent programming, then you might think we are at risk of a ``race condition <https://en.wikipedia.org/wiki/Race_condition>``_! What if processor ``s`` reaches the ``bsp_get`` statement before processor ``(s + 1) % p`` has set the value for ``a`` equal to its process number? Do we then obtain zero? In this case, we do not have to worry -- no data transfer is initialized until each core has reached ``bsp_sync``. Indeed we receive the correct output::
And again, we perform a barrier synchronisation to ensure the data has been transferred. If you are familiar with concurrent programming, then you might think we are at risk of a `race condition <https://en.wikipedia.org/wiki/Race_condition>`_! What if processor ``s`` reaches the ``bsp_get`` statement before processor ``(s + 1) % p`` has set the value for ``a`` equal to its process number? Do we then obtain zero? In this case, we do not have to worry -- no data transfer is initialized until each core has reached ``bsp_sync``. Indeed we receive the correct output::

$01: received: 2
$03: received: 4
@@ -98,11 +98,11 @@ Unbuffered communication

So far we have discussed writing to and reading from variables using ``bsp_put`` and ``bsp_get``. These two functions are *buffered*. When calling ``bsp_put`` for example, the *current source value* at the time of the function call is guaranteed to be sent to the target processor, but it does not get sent until the next barrier synchronisation -- so behind the scenes the EBSP library stores a copy of the data. The BSP standard was originally designed for distributed memory systems with very high latency, in which this design makes a lot of sense. On the Epiphany platform this gives a lot of unnecessary overhead since data is copied to *external memory*.
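Because the source value is captured at the moment of the call, the following sketch delivers the original value of ``b`` even though ``b`` is overwritten before the synchronisation::

    int b = s;
    bsp_put((s + 1) % p, &b, &a, 0, sizeof(int));
    b = -1;         // does not affect the transfer: the value s was buffered
    bsp_sync();     // the next core's a now equals s, not -1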

This problem is not unique to the Epiphany platform however. Together with the `MulticoreBSP <http://www.multicorebsp.com/>`_ which target modern multicore processors, two additional BSP primitives were introduced that provide *unbuffered* variable communication, ``bsp_hpput`` and ``bsp_hpget``. Here the ``hp...`` prefix stands for *high performance*.
This problem is not unique to the Epiphany platform however. Together with the `MulticoreBSP <http://www.multicorebsp.com/>`_ which targets modern multicore processors, two additional BSP primitives were introduced that provide *unbuffered* variable communication, ``bsp_hpput`` and ``bsp_hpget``. Here the ``hp...`` prefix stands for *high performance*.

However, although their function signatures are completely identical, these are not meant as drop-in replacements for ``bsp_put`` and ``bsp_get``. They are unsafe in the sense that the data transfer happens *at once*. This means that when using these functions you should be aware of possible race conditions -- which notoriously lead to mistakes that are very hard to debug.

To facilitate writing code using only unbuffered communication we will expose an ``ebsp_barrier`` function in the next EBSP release that performs a barrier synchronisation without transferring any outstanding communication that has arisen from calls to ``bsp_put`` and ``bsp_get``. Let us look at an example program using these unbuffered variants.::
To facilitate writing code using only unbuffered communication we introduce an ``ebsp_barrier`` function that performs a barrier synchronisation without transferring any outstanding communication that has arisen from calls to ``bsp_put`` and ``bsp_get``. Let us look at an example program using these unbuffered variants::

int s = bsp_pid();
int p = bsp_nprocs();
@@ -113,15 +113,15 @@ To facilitate writing code using only unbuffered communication we will expose an

int b = s;
// barrier ensures b has been written to on each core
bsp_sync();
ebsp_barrier();

bsp_hpput((s + 1) % p, &b, &a, 0, sizeof(int));

// barrier ensures data has been received
bsp_sync();
ebsp_message("received: %i", a);

When writing or reading large amounts of data in between different ``bsp_sync`` calls, the ``hp...`` functions are much more efficient in terms of local memory usage (which is very valuable because of the small size) as well as running speed. However, extra care is needed to effectively synchronize between threads. For example, if we remove any of the two ``bsp_sync`` calls in the previous example program, there will be a race condition.
When writing or reading large amounts of data in between different ``bsp_sync`` calls, the ``hp...`` functions are much more efficient in terms of local memory usage (which is very valuable because of its small size) as well as running speed. However, extra care is needed to effectively synchronize between threads. For example, if we remove either the ``ebsp_barrier`` or the ``bsp_sync`` call in the previous example program, there will be a race condition.

We test the program, and see that the output is indeed identical to before::

40 changes: 33 additions & 7 deletions ebsp_fast.ldf
@@ -52,7 +52,7 @@ MEMORY
IVT_RAM (WXAI) : ORIGIN = 0, LENGTH = 0x28

/* user program, continuous placement */
WORKGROUP_RAM (WXAI) : ORIGIN = LENGTH(IVT_RAM), LENGTH = 0x30
WORKGROUP_RAM (WXAI) : ORIGIN = LENGTH(IVT_RAM), LENGTH = 0x100 - LENGTH(IVT_RAM)

/* user program, continuous placement */
INTERNAL_RAM (WXAI) : ORIGIN = LENGTH(IVT_RAM) + LENGTH(WORKGROUP_RAM), LENGTH = 32K - LENGTH(IVT_RAM) - LENGTH(WORKGROUP_RAM)
@@ -97,9 +97,31 @@ SECTIONS
ivt_dma1 0x1c : {*.o(ivt_entry_dma1)} > IVT_RAM
ivt_wand 0x20 : {*.o(ivt_entry_wand)} > IVT_RAM
ivt_user 0x24 : {*.o(ivt_entry_user)} > IVT_RAM

workgroup_config 0x28 : {*.o(workgroup_cfg)} > WORKGROUP_RAM /* 28 2c 30 34 38 3c 40 44 48 (4c) */
external_mem_config 0x50 : {*.o(ext_mem_cfg)} > WORKGROUP_RAM /* 50 54 */

/* section filled in by loader */
workgroup_cfg 0x28 :
{
*(workgroup_cfg)
ASSERT(. <= 0x28, "workgroup_cfg section overflow");
. = 0x28; /* force allocation */
} > WORKGROUP_RAM /* 28 2c 30 34 38 3c 40 44 48 (4c) */

/* section filled in by loader */
ext_mem_cfg 0x50 :
{
*(ext_mem_cfg);
ASSERT(. <= 0x8, "ext_mem_cfg section overflow");
. = 0x8; /* force allocation */
} > WORKGROUP_RAM /* 50 54 */

/* section filled in by loader */
loader_cfg 0x58 :
{
*(loader_cfg);
ASSERT(. <= 0x10, "loader_cfg section overflow");
. = 0x10; /* force allocation */
} > WORKGROUP_RAM /* 58 5C 60 64 */


/* place the ISR handlers after workgroup-configuration */
.reserved_crt0 ORIGIN(IVT_RAM) + LENGTH(IVT_RAM) + LENGTH(WORKGROUP_RAM) : {*.o(RESERVED_CRT0) *.o(reserved_crt0)} > INTERNAL_RAM
@@ -134,6 +156,7 @@ SECTIONS
*divsi3.o(.text .rodata) *udivsi3.o(.text .rodata)
*umodsi3.o(.text .rodata) _*.o(.text .rodata)
} > EXTERNAL_DRAM_0

EBSP_TEXT . : { } > EXTERNAL_DRAM_0
EBSP_RO . : { } > EXTERNAL_DRAM_0

@@ -353,10 +376,13 @@

/**/
PROVIDE (__stack_start_ = ORIGIN(INTERNAL_RAM) + LENGTH(INTERNAL_RAM) - 0x10);
.stack __stack_start_ : { ___stack = .; *(.stack) }
PROVIDE (___heap_start = ORIGIN(EXTERNAL_DRAM_1) + __HEAP_SIZE_FOR_CORE_ * __CORE_NUM_);
.stack __stack_start_ : { __stack = .; *(.stack) }
PROVIDE (___stack = __stack);
PROVIDE (__heap_start = ORIGIN(EXTERNAL_DRAM_1) + __HEAP_SIZE_FOR_CORE_ * __CORE_NUM_);
PROVIDE (___heap_start = __heap_start);
/*.heap_start __heap_start_ : { _heap_start_ = .; *(.heap_start) } */
PROVIDE (___heap_end = ORIGIN(EXTERNAL_DRAM_1) + __HEAP_SIZE_FOR_CORE_ * __CORE_NUM_ + __HEAP_SIZE_FOR_CORE_ - 4);
PROVIDE (__heap_end = ORIGIN(EXTERNAL_DRAM_1) + __HEAP_SIZE_FOR_CORE_ * __CORE_NUM_ + __HEAP_SIZE_FOR_CORE_ - 4);
PROVIDE (___heap_end = __heap_end);
/* .heap_end __heap_end_ : { _heap_end_ = .; *(.heap_end) } */

/DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }